cumo 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +18 -37
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +28 -21
- data/CHANGELOG.md +28 -0
- data/Dockerfile +34 -0
- data/cumo.gemspec +1 -1
- data/docker-build.sh +4 -0
- data/docker-launch.sh +4 -0
- data/docs/src-tree.md +1 -1
- data/ext/cumo/cuda/cudnn_impl.cpp +25 -3
- data/ext/cumo/cuda/driver.c +8 -0
- data/ext/cumo/depend.erb +1 -1
- data/ext/cumo/extconf.rb +1 -1
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +14 -7
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +3 -3
- data/ext/cumo/include/cumo/narray.h +2 -0
- data/ext/cumo/include/cumo/types/complex.h +2 -2
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +15 -4
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +15 -4
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +11 -3
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/narray/array.c +5 -3
- data/ext/cumo/narray/data.c +25 -26
- data/ext/cumo/narray/gen/tmpl/accum.c +2 -2
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +1 -1
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +4 -1
- data/ext/cumo/narray/gen/tmpl/allocate.c +1 -0
- data/ext/cumo/narray/gen/tmpl/aref.c +18 -18
- data/ext/cumo/narray/gen/tmpl/aset.c +16 -16
- data/ext/cumo/narray/gen/tmpl/batch_norm.c +4 -1
- data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +4 -1
- data/ext/cumo/narray/gen/tmpl/bincount.c +7 -7
- data/ext/cumo/narray/gen/tmpl/clip.c +11 -15
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/cum.c +1 -1
- data/ext/cumo/narray/gen/tmpl/each.c +4 -2
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +5 -2
- data/ext/cumo/narray/gen/tmpl/fixed_batch_norm.c +4 -1
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/logseq.c +6 -5
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +5 -6
- data/ext/cumo/narray/gen/tmpl/median.c +2 -2
- data/ext/cumo/narray/gen/tmpl/minmax.c +1 -1
- data/ext/cumo/narray/gen/tmpl/poly.c +4 -4
- data/ext/cumo/narray/gen/tmpl/rand.c +8 -6
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +18 -16
- data/ext/cumo/narray/gen/tmpl/seq.c +5 -4
- data/ext/cumo/narray/gen/tmpl/sort.c +2 -2
- data/ext/cumo/narray/gen/tmpl/sort_index.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +1 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +26 -32
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +18 -30
- data/ext/cumo/narray/index.c +1 -1
- data/ext/cumo/narray/narray.c +116 -21
- data/lib/cumo/narray/extra.rb +160 -156
- data/test/cuda/device_test.rb +2 -1
- data/test/cudnn_test.rb +2 -2
- data/test/narray_test.rb +80 -0
- data/test/ractor_test.rb +5 -3
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 17cb9dfdf9be41292bcd0204a67c5f919da60588d005d4441ad632767dce504c
|
|
4
|
+
data.tar.gz: 4c6e388cdb5b3b9f99d45a989a610d63e493fe53b3baad570c7bb2656f72d86c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 917adaa087836d673a143364f88fb9ddf91ad84cbcc064b115258e32f4b22e70a63c45f3e655a2e56ad58e70d6a4330f3e63ce177b5ad9e0a0c9c11685b39503
|
|
7
|
+
data.tar.gz: c5ec5a4179266a1cf4c5f15b5bc0d7e8b0b01a5a69bb7a75ad3c1387e861666da8b571156bc90295f2cf702558f18e332ce7fd68823330f834ea8d9c4f8b6419
|
data/.rubocop_todo.yml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on
|
|
3
|
+
# on 2026-01-09 18:33:26 UTC using RuboCop version 1.82.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
@@ -25,16 +25,8 @@ Bundler/OrderedGems:
|
|
|
25
25
|
Exclude:
|
|
26
26
|
- 'Gemfile'
|
|
27
27
|
|
|
28
|
-
# Offense count: 1
|
|
29
|
-
# Configuration parameters: EnforcedStyle, AllowedGems.
|
|
30
|
-
# SupportedStyles: required, forbidden
|
|
31
|
-
Gemspec/DependencyVersion:
|
|
32
|
-
Exclude:
|
|
33
|
-
- 'cumo.gemspec'
|
|
34
|
-
|
|
35
28
|
# Offense count: 2
|
|
36
29
|
# This cop supports safe autocorrection (--autocorrect).
|
|
37
|
-
# Configuration parameters: Severity.
|
|
38
30
|
Gemspec/DeprecatedAttributeAssignment:
|
|
39
31
|
Exclude:
|
|
40
32
|
- '3rd_party/mkmf-cu/mkmf-cu.gemspec'
|
|
@@ -49,14 +41,12 @@ Gemspec/DevelopmentDependencies:
|
|
|
49
41
|
|
|
50
42
|
# Offense count: 2
|
|
51
43
|
# This cop supports safe autocorrection (--autocorrect).
|
|
52
|
-
# Configuration parameters: Severity.
|
|
53
44
|
Gemspec/RequireMFA:
|
|
54
45
|
Exclude:
|
|
55
46
|
- '3rd_party/mkmf-cu/mkmf-cu.gemspec'
|
|
56
47
|
- 'cumo.gemspec'
|
|
57
48
|
|
|
58
49
|
# Offense count: 1
|
|
59
|
-
# Configuration parameters: Severity.
|
|
60
50
|
Gemspec/RequiredRubyVersion:
|
|
61
51
|
Exclude:
|
|
62
52
|
- '3rd_party/mkmf-cu/mkmf-cu.gemspec'
|
|
@@ -275,13 +265,15 @@ Layout/MultilineOperationIndentation:
|
|
|
275
265
|
- 'lib/cumo/narray/extra.rb'
|
|
276
266
|
- 'test/narray_test.rb'
|
|
277
267
|
|
|
278
|
-
# Offense count:
|
|
268
|
+
# Offense count: 27
|
|
279
269
|
# This cop supports safe autocorrection (--autocorrect).
|
|
280
270
|
# Configuration parameters: InspectBlocks.
|
|
281
271
|
Layout/RedundantLineBreak:
|
|
282
272
|
Exclude:
|
|
283
273
|
- '3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb'
|
|
274
|
+
- '3rd_party/mkmf-cu/mkmf-cu.gemspec'
|
|
284
275
|
- '3rd_party/mkmf-cu/test/test_mkmf-cu.rb'
|
|
276
|
+
- 'ext/cumo/extconf.rb'
|
|
285
277
|
- 'ext/cumo/narray/gen/narray_def.rb'
|
|
286
278
|
- 'test/bit_test.rb'
|
|
287
279
|
- 'test/cudnn_test.rb'
|
|
@@ -379,7 +371,7 @@ Lint/ConstantDefinitionInBlock:
|
|
|
379
371
|
Exclude:
|
|
380
372
|
- 'test/cuda/compiler_test.rb'
|
|
381
373
|
|
|
382
|
-
# Offense count:
|
|
374
|
+
# Offense count: 665
|
|
383
375
|
# Configuration parameters: Only, Ignore.
|
|
384
376
|
Lint/ConstantResolution:
|
|
385
377
|
Enabled: false
|
|
@@ -396,7 +388,7 @@ Lint/ErbNewArguments:
|
|
|
396
388
|
Exclude:
|
|
397
389
|
- 'ext/cumo/narray/gen/erbpp2.rb'
|
|
398
390
|
|
|
399
|
-
# Offense count:
|
|
391
|
+
# Offense count: 15
|
|
400
392
|
Lint/FloatComparison:
|
|
401
393
|
Exclude:
|
|
402
394
|
- 'test/narray_test.rb'
|
|
@@ -419,18 +411,20 @@ Lint/NonAtomicFileOperation:
|
|
|
419
411
|
Exclude:
|
|
420
412
|
- 'lib/cumo/cuda/compiler.rb'
|
|
421
413
|
|
|
422
|
-
# Offense count:
|
|
414
|
+
# Offense count: 34
|
|
423
415
|
# This cop supports unsafe autocorrection (--autocorrect-all).
|
|
424
416
|
# Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredClasses.
|
|
425
417
|
# IgnoredClasses: Time, DateTime
|
|
426
418
|
Lint/NumberConversion:
|
|
427
419
|
Exclude:
|
|
420
|
+
- '3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb'
|
|
428
421
|
- 'bench/cumo_bench.rb'
|
|
429
422
|
- 'bench/numo_bench.rb'
|
|
430
423
|
- 'ext/cumo/narray/gen/cogen_kernel.rb'
|
|
431
424
|
- 'ext/cumo/narray/gen/erbln.rb'
|
|
432
425
|
- 'lib/cumo/narray/extra.rb'
|
|
433
426
|
- 'test/cudnn_test.rb'
|
|
427
|
+
- 'test/narray_test.rb'
|
|
434
428
|
|
|
435
429
|
# Offense count: 2
|
|
436
430
|
# This cop supports safe autocorrection (--autocorrect).
|
|
@@ -522,17 +516,6 @@ Naming/MethodParameterName:
|
|
|
522
516
|
- 'lib/cumo/narray/extra.rb'
|
|
523
517
|
- 'test/ractor_test.rb'
|
|
524
518
|
|
|
525
|
-
# Offense count: 1
|
|
526
|
-
# Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros, UseSorbetSigs.
|
|
527
|
-
# NamePrefix: is_, has_, have_, does_
|
|
528
|
-
# ForbiddenPrefixes: is_, has_, have_, does_
|
|
529
|
-
# AllowedMethods: is_a?
|
|
530
|
-
# MethodDefinitionMacros: define_method, define_singleton_method
|
|
531
|
-
Naming/PredicatePrefix:
|
|
532
|
-
Exclude:
|
|
533
|
-
- 'spec/**/*'
|
|
534
|
-
- 'ext/cumo/extconf.rb'
|
|
535
|
-
|
|
536
519
|
# Offense count: 1
|
|
537
520
|
# Configuration parameters: EnforcedStyle, CheckMethodNames, CheckSymbols, AllowedIdentifiers, AllowedPatterns.
|
|
538
521
|
# SupportedStyles: snake_case, normalcase, non_integer
|
|
@@ -719,7 +702,7 @@ Style/Documentation:
|
|
|
719
702
|
- 'lib/cumo/linalg.rb'
|
|
720
703
|
- 'lib/cumo/narray/extra.rb'
|
|
721
704
|
|
|
722
|
-
# Offense count:
|
|
705
|
+
# Offense count: 202
|
|
723
706
|
# Configuration parameters: AllowedMethods, RequireForNonPublicMethods.
|
|
724
707
|
Style/DocumentationMethod:
|
|
725
708
|
Enabled: false
|
|
@@ -767,7 +750,7 @@ Style/FileWrite:
|
|
|
767
750
|
Exclude:
|
|
768
751
|
- 'lib/cumo/cuda/compiler.rb'
|
|
769
752
|
|
|
770
|
-
# Offense count:
|
|
753
|
+
# Offense count: 27
|
|
771
754
|
# Configuration parameters: AllowedVariables.
|
|
772
755
|
Style/GlobalVars:
|
|
773
756
|
Exclude:
|
|
@@ -775,12 +758,11 @@ Style/GlobalVars:
|
|
|
775
758
|
- 'ext/cumo/narray/gen/cogen.rb'
|
|
776
759
|
- 'ext/cumo/narray/gen/cogen_kernel.rb'
|
|
777
760
|
|
|
778
|
-
# Offense count:
|
|
761
|
+
# Offense count: 10
|
|
779
762
|
# This cop supports safe autocorrection (--autocorrect).
|
|
780
763
|
# Configuration parameters: MinBodyLength, AllowConsecutiveConditionals.
|
|
781
764
|
Style/GuardClause:
|
|
782
765
|
Exclude:
|
|
783
|
-
- 'ext/cumo/extconf.rb'
|
|
784
766
|
- 'ext/cumo/narray/gen/erbpp2.rb'
|
|
785
767
|
- 'lib/cumo/cuda/link_state.rb'
|
|
786
768
|
- 'lib/cumo/cuda/module.rb'
|
|
@@ -865,9 +847,9 @@ Style/InvertibleUnlessCondition:
|
|
|
865
847
|
- 'lib/cumo/cuda/compiler.rb'
|
|
866
848
|
- 'lib/cumo/cuda/device.rb'
|
|
867
849
|
|
|
868
|
-
# Offense count:
|
|
850
|
+
# Offense count: 119
|
|
869
851
|
# This cop supports safe autocorrection (--autocorrect).
|
|
870
|
-
# Configuration parameters: IgnoreMacros, AllowedMethods, AllowedPatterns, IncludedMacros, AllowParenthesesInMultilineCall, AllowParenthesesInChaining, AllowParenthesesInCamelCaseMethod, AllowParenthesesInStringInterpolation, EnforcedStyle.
|
|
852
|
+
# Configuration parameters: IgnoreMacros, AllowedMethods, AllowedPatterns, IncludedMacros, IncludedMacroPatterns, AllowParenthesesInMultilineCall, AllowParenthesesInChaining, AllowParenthesesInCamelCaseMethod, AllowParenthesesInStringInterpolation, EnforcedStyle.
|
|
871
853
|
# SupportedStyles: require_parentheses, omit_parentheses
|
|
872
854
|
Style/MethodCallWithArgsParentheses:
|
|
873
855
|
Enabled: false
|
|
@@ -888,7 +870,7 @@ Style/MethodCalledOnDoEndBlock:
|
|
|
888
870
|
- 'ext/cumo/narray/gen/cogen_kernel.rb'
|
|
889
871
|
- 'lib/cumo/narray/extra.rb'
|
|
890
872
|
|
|
891
|
-
# Offense count:
|
|
873
|
+
# Offense count: 105
|
|
892
874
|
# This cop supports safe autocorrection (--autocorrect).
|
|
893
875
|
# Configuration parameters: EnforcedStyle.
|
|
894
876
|
# SupportedStyles: if, case, both
|
|
@@ -923,13 +905,12 @@ Style/MutableConstant:
|
|
|
923
905
|
- '3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb'
|
|
924
906
|
- 'test/test_helper.rb'
|
|
925
907
|
|
|
926
|
-
# Offense count:
|
|
908
|
+
# Offense count: 13
|
|
927
909
|
# This cop supports safe autocorrection (--autocorrect).
|
|
928
910
|
# Configuration parameters: EnforcedStyle.
|
|
929
911
|
# SupportedStyles: both, prefix, postfix
|
|
930
912
|
Style/NegatedIf:
|
|
931
913
|
Exclude:
|
|
932
|
-
- 'ext/cumo/extconf.rb'
|
|
933
914
|
- 'ext/cumo/narray/gen/erbpp2.rb'
|
|
934
915
|
- 'ext/cumo/narray/gen/spec.rb'
|
|
935
916
|
- 'lib/cumo/narray/extra.rb'
|
|
@@ -1190,7 +1171,7 @@ Style/StringHashKeys:
|
|
|
1190
1171
|
Exclude:
|
|
1191
1172
|
- '3rd_party/mkmf-cu/test/test_mkmf-cu.rb'
|
|
1192
1173
|
|
|
1193
|
-
# Offense count:
|
|
1174
|
+
# Offense count: 1369
|
|
1194
1175
|
# This cop supports safe autocorrection (--autocorrect).
|
|
1195
1176
|
# Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline.
|
|
1196
1177
|
# SupportedStyles: single_quotes, double_quotes
|
|
@@ -1229,7 +1210,7 @@ Style/TernaryParentheses:
|
|
|
1229
1210
|
- 'ext/cumo/narray/gen/narray_def.rb'
|
|
1230
1211
|
- 'lib/cumo/narray/extra.rb'
|
|
1231
1212
|
|
|
1232
|
-
# Offense count:
|
|
1213
|
+
# Offense count: 8
|
|
1233
1214
|
Style/TopLevelMethodDefinition:
|
|
1234
1215
|
Exclude:
|
|
1235
1216
|
- 'bench/cumo_bench.rb'
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "mkmf"
|
|
3
4
|
require "open3"
|
|
4
5
|
require_relative "nvcc"
|
|
5
6
|
|
|
@@ -37,29 +38,35 @@ module MakeMakefileCuda
|
|
|
37
38
|
cmd = "nvcc #{s}"
|
|
38
39
|
if ENV['CUMO_NVCC_GENERATE_CODE']
|
|
39
40
|
cmd << " --generate-code=#{ENV['CUMO_NVCC_GENERATE_CODE']}"
|
|
40
|
-
elsif ENV['DEBUG']
|
|
41
|
-
cmd << " -arch=sm_35"
|
|
42
41
|
else
|
|
43
|
-
|
|
44
|
-
if
|
|
45
|
-
|
|
46
|
-
capability = [
|
|
47
|
-
elsif cuda_version >= Gem::Version.new("12.9")
|
|
48
|
-
# CUDA 12.9
|
|
49
|
-
capability = [50, 60, 70, 75, 87, 89, 90, 121]
|
|
50
|
-
elsif cuda_version >= Gem::Version.new("12.8")
|
|
51
|
-
# CUDA 12.8
|
|
52
|
-
capability = [50, 60, 70, 75, 87, 89, 90, 120]
|
|
53
|
-
elsif cuda_version >= Gem::Version.new("12.0")
|
|
54
|
-
# CUDA 12.0 – 12.6
|
|
55
|
-
capability = [50, 60, 70, 75, 87, 89, 90]
|
|
56
|
-
elsif cuda_version >= Gem::Version.new("11.8")
|
|
57
|
-
# CUDA 11.8
|
|
58
|
-
capability = [35, 50, 60, 70, 75, 87, 89, 90]
|
|
59
|
-
else
|
|
60
|
-
# CUDA 11.0
|
|
61
|
-
capability = [35, 50, 60, 70, 75, 80]
|
|
42
|
+
capability = nil
|
|
43
|
+
if find_executable('nvidia-smi')
|
|
44
|
+
arch_version = `nvidia-smi --query-gpu=compute_cap --format=csv,noheader`.strip
|
|
45
|
+
# nvidia-smi prints one "major.minor" row per GPU. Rows that are not a plain version (driver errors, "not a valid field" from old nvidia-smi) are ignored, leaving capability nil so the CUDA-version table below is used instead of emitting compute_0.
detected = arch_version.scan(/^(\d+)\.(\d+)\r?$/).map { |major, minor| "#{major}#{minor}".to_i }.uniq
capability = detected unless detected.empty?
|
|
62
46
|
end
|
|
47
|
+
unless capability
|
|
48
|
+
# Ref. https://en.wikipedia.org/wiki/CUDA
|
|
49
|
+
if cuda_version >= Gem::Version.new("13.0")
|
|
50
|
+
# CUDA 13.0
|
|
51
|
+
capability = [75, 80, 86, 87, 89, 90, 100, 103, 110, 120, 121]
|
|
52
|
+
elsif cuda_version >= Gem::Version.new("12.9")
|
|
53
|
+
# CUDA 12.9
|
|
54
|
+
# NOTE(review): sm_110 appears to exist only from CUDA 13.0 (the same part is sm_101 in 12.x) — confirm with `nvcc --list-gpu-arch` on a 12.9 toolkit.
capability = [50, 60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90, 100, 101, 103, 120, 121]
|
|
55
|
+
elsif cuda_version >= Gem::Version.new("12.8")
|
|
56
|
+
# CUDA 12.8
|
|
57
|
+
# NOTE(review): CUDA 12.8 appears to know only sm_100/101/120 of the Blackwell family; sm_103 arrived in 12.9 and sm_110 in 13.0 — confirm with `nvcc --list-gpu-arch` on a 12.8 toolkit.
capability = [50, 60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90, 100, 101, 120]
|
|
58
|
+
elsif cuda_version >= Gem::Version.new("12.0")
|
|
59
|
+
# CUDA 12.0 – 12.6
|
|
60
|
+
capability = [50, 60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90]
|
|
61
|
+
elsif cuda_version >= Gem::Version.new("11.8")
|
|
62
|
+
# CUDA 11.8
|
|
63
|
+
capability = [35, 50, 60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90]
|
|
64
|
+
else
|
|
65
|
+
# CUDA 11.0
|
|
66
|
+
capability = [35, 50, 60, 61, 62, 70, 72, 75, 80]
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
63
70
|
capability.each do |arch|
|
|
64
71
|
cmd << " --generate-code=arch=compute_#{arch},code=sm_#{arch}"
|
|
65
72
|
end
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,31 @@
|
|
|
1
|
+
# 0.5.2 (2026/01/25)
|
|
2
|
+
|
|
3
|
+
Fixes:
|
|
4
|
+
|
|
5
|
+
* Backport: Add support for copy on write with store_binary and frozen string
|
|
6
|
+
* Remove unnecessary debug code
|
|
7
|
+
* Fix capability list
|
|
8
|
+
* Build only with supported capabilities to reduce compilation time
|
|
9
|
+
* Fix SEGV when calling {mean, var, stddev, rms} on a single-element array (#154)
|
|
10
|
+
* Suppress warning message for deprecated declarations
|
|
11
|
+
* Fix variable typo in complex log2 and log10 functions (#152)
|
|
12
|
+
|
|
13
|
+
# 0.5.1 (2025/12/30)
|
|
14
|
+
|
|
15
|
+
Enhancements:
|
|
16
|
+
|
|
17
|
+
* Add CUDA 13 support (#153)
|
|
18
|
+
* Add cuDNN 9 support
|
|
19
|
+
|
|
20
|
+
Fixes:
|
|
21
|
+
|
|
22
|
+
* Backport: fix example code
|
|
23
|
+
* Backport: fix example code
|
|
24
|
+
* Backport: fix doc
|
|
25
|
+
* Backport: fix documents
|
|
26
|
+
* Backport: fix document of logseq
|
|
27
|
+
* Backport: trim comment out
|
|
28
|
+
|
|
1
29
|
# 0.5.0 (2025/11/01)
|
|
2
30
|
|
|
3
31
|
Fixes:
|
data/Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
|
|
2
|
+
|
|
3
|
+
ARG RUBY_VERSION=3.4.7
|
|
4
|
+
|
|
5
|
+
ENV DEBIAN_FRONTEND=noninteractive
|
|
6
|
+
ENV RBENV_ROOT="/root/.rbenv"
|
|
7
|
+
ENV PATH="${RBENV_ROOT}/bin:${RBENV_ROOT}/shims:${PATH}"
|
|
8
|
+
|
|
9
|
+
ENV CUDA_PATH=/usr/local/cuda
|
|
10
|
+
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
|
|
11
|
+
ENV CPATH=/usr/local/cuda/include:${CPATH}
|
|
12
|
+
ENV LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH}
|
|
13
|
+
|
|
14
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
15
|
+
git \
|
|
16
|
+
build-essential \
|
|
17
|
+
wget \
|
|
18
|
+
curl \
|
|
19
|
+
vim \
|
|
20
|
+
ca-certificates \
|
|
21
|
+
libssl-dev \
|
|
22
|
+
libreadline-dev \
|
|
23
|
+
zlib1g-dev \
|
|
24
|
+
libyaml-dev \
|
|
25
|
+
libffi-dev \
|
|
26
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
27
|
+
|
|
28
|
+
RUN git clone --depth 1 https://github.com/rbenv/ruby-build.git && \
|
|
29
|
+
cd ruby-build/bin && ./ruby-build ${RUBY_VERSION} /usr && \
|
|
30
|
+
git config --global --add safe.directory /workspace
|
|
31
|
+
|
|
32
|
+
WORKDIR /workspace
|
|
33
|
+
|
|
34
|
+
CMD ["/bin/bash"]
|
data/cumo.gemspec
CHANGED
|
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
|
19
19
|
spec.required_ruby_version = ">= 3.0.0"
|
|
20
20
|
|
|
21
21
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
|
22
|
-
f.match(%r{^(test|spec|features)/})
|
|
22
|
+
f.match(%r{^(test|spec|features|docker)/})
|
|
23
23
|
end
|
|
24
24
|
spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
|
25
25
|
spec.bindir = "exe"
|
data/docker-build.sh
ADDED
data/docker-launch.sh
ADDED
data/docs/src-tree.md
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
|
|
7
7
|
* CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
|
|
8
8
|
* nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
|
|
9
|
-
* (RULE) It is allowed to use C++
|
|
9
|
+
* (RULE) It is allowed to use C++17 codes in .cu files.
|
|
10
10
|
* Rest of `*.{h,c}` files are for host (CPU).
|
|
11
11
|
* Call C wrapper functions defined in .cu files.
|
|
12
12
|
* It can use CRuby API.
|
|
@@ -74,6 +74,25 @@ cumo_cuda_cudnn_CreateTensorDescriptor(
|
|
|
74
74
|
status = cudnnSetTensor4dDescriptor(
|
|
75
75
|
*desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
|
|
76
76
|
}
|
|
77
|
+
else if (ndim < 4) {
|
|
78
|
+
// cuDNN 9 fix: Force 4D (N, C, H, W)
|
|
79
|
+
int pad_shape[4] = {1, 1, 1, 1};
|
|
80
|
+
|
|
81
|
+
if (ndim == 1) {
|
|
82
|
+
// 1D: arrays are treated as "Channel" (1, C, 1, 1)
|
|
83
|
+
pad_shape[1] = (int)(shape[0]);
|
|
84
|
+
} else {
|
|
85
|
+
// 2D: [N, C] -> [N, C, 1, 1]
|
|
86
|
+
// 3D: [N, C, H] -> [N, C, H, 1]
|
|
87
|
+
for (int idim = 0; idim < ndim; ++idim) {
|
|
88
|
+
pad_shape[idim] = (int)(shape[idim]);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
status = cudnnSetTensor4dDescriptor(
|
|
93
|
+
*desc, CUDNN_TENSOR_NCHW, cudnn_dtype,
|
|
94
|
+
pad_shape[0], pad_shape[1], pad_shape[2], pad_shape[3]);
|
|
95
|
+
}
|
|
77
96
|
else {
|
|
78
97
|
int int_shape[CUMO_NA_MAX_DIMENSION];
|
|
79
98
|
for (int idim = 0; idim < ndim; ++idim) {
|
|
@@ -514,8 +533,11 @@ cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
|
|
|
514
533
|
// TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
|
|
515
534
|
cudnnBatchNormMode_t
|
|
516
535
|
cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
|
|
517
|
-
if (ndim == 1
|
|
518
|
-
return
|
|
536
|
+
if (ndim == 1) {
|
|
537
|
+
return CUDNN_BATCHNORM_SPATIAL;
|
|
538
|
+
}
|
|
539
|
+
if (ndim == 2) {
|
|
540
|
+
return CUDNN_BATCHNORM_SPATIAL;
|
|
519
541
|
}
|
|
520
542
|
if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
|
|
521
543
|
(ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
|
|
@@ -533,7 +555,7 @@ cumo_cuda_cudnn_CreateBNTensorDescriptor(
|
|
|
533
555
|
{
|
|
534
556
|
cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
|
|
535
557
|
status = cudnnCreateTensorDescriptor(desc);
|
|
536
|
-
if (status
|
|
558
|
+
// Bail out only when descriptor creation failed; on success fall through so the BN descriptor is actually derived from x_desc.
if (status != CUDNN_STATUS_SUCCESS) return status;
|
|
537
559
|
|
|
538
560
|
status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
|
|
539
561
|
return status;
|
data/ext/cumo/cuda/driver.c
CHANGED
|
@@ -33,7 +33,11 @@ rb_cuCtxCreate(VALUE self, VALUE flags, VALUE dev)
|
|
|
33
33
|
CUcontext _pctx;
|
|
34
34
|
CUresult status;
|
|
35
35
|
|
|
36
|
+
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
|
|
37
|
+
status = cuCtxCreate(&_pctx, NULL, _flags, _dev);
|
|
38
|
+
#else
|
|
36
39
|
status = cuCtxCreate(&_pctx, _flags, _dev);
|
|
40
|
+
#endif
|
|
37
41
|
|
|
38
42
|
check_status(status);
|
|
39
43
|
return SIZET2NUM((size_t)_pctx);
|
|
@@ -418,5 +422,9 @@ Init_cumo_cuda_driver()
|
|
|
418
422
|
|
|
419
423
|
cuInit(0);
|
|
420
424
|
cuDeviceGet(&cuDevice, 0);
|
|
425
|
+
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
|
|
426
|
+
cuCtxCreate(&context, NULL, 0, cuDevice);
|
|
427
|
+
#else
|
|
421
428
|
cuCtxCreate(&context, 0, cuDevice);
|
|
429
|
+
#endif
|
|
422
430
|
}
|
data/ext/cumo/depend.erb
CHANGED
|
@@ -55,6 +55,6 @@ run-ctest : <%= __dir__ %>/cuda/memory_pool_impl_test.exe
|
|
|
55
55
|
./$<
|
|
56
56
|
|
|
57
57
|
<%= __dir__ %>/cuda/memory_pool_impl_test.exe: <%= __dir__ %>/cuda/memory_pool_impl_test.cpp <%= __dir__ %>/cuda/memory_pool_impl.cpp <%= __dir__ %>/cuda/memory_pool_impl.hpp
|
|
58
|
-
nvcc -std=c++
|
|
58
|
+
nvcc -std=c++17 <%= ENV['DEBUG'] ? '-g -O0 --compiler-options -Wall' : '' %> -L. -L$(libdir) -I. $(INCFLAGS) -o $@ $< <%= __dir__ %>/cuda/memory_pool_impl.cpp
|
|
59
59
|
|
|
60
60
|
CLEANOBJS = <%= __dir__ %>/*.o <%= __dir__ %>/*/*.o <%= __dir__ %>/*/*/*.o <%= __dir__ %>/*.bak <%= __dir__ %>/narray/types/*.c <%= __dir__ %>/narray/types/*_kernel.cu <%= __dir__ %>/*.exe <%= __dir__ %>/*/*.exe
|
data/ext/cumo/extconf.rb
CHANGED
|
@@ -29,7 +29,7 @@ MakeMakefileCuda.install!(cxx: true)
|
|
|
29
29
|
if ENV['DEBUG']
|
|
30
30
|
$CFLAGS << " -g -O0 -Wall"
|
|
31
31
|
end
|
|
32
|
-
$CXXFLAGS << " -std=c++
|
|
32
|
+
$CXXFLAGS << " -std=c++17"
|
|
33
33
|
#$CFLAGS=" $(cflags) -O3 -m64 -msse2 -funroll-loops"
|
|
34
34
|
#$CFLAGS=" $(cflags) -O3"
|
|
35
35
|
$INCFLAGS = "-I$(srcdir)/include -I$(srcdir)/narray -I$(srcdir)/cuda #{$INCFLAGS}"
|
|
@@ -26,10 +26,12 @@ class cumo_thrust_strided_range
|
|
|
26
26
|
{
|
|
27
27
|
public:
|
|
28
28
|
|
|
29
|
-
typedef typename thrust::
|
|
29
|
+
typedef typename thrust::iterator_traits<Iterator>::difference_type difference_type;
|
|
30
30
|
|
|
31
|
-
struct stride_functor
|
|
31
|
+
struct stride_functor
|
|
32
32
|
{
|
|
33
|
+
using argument_type = difference_type;
|
|
34
|
+
using result_type = difference_type;
|
|
33
35
|
difference_type stride;
|
|
34
36
|
|
|
35
37
|
stride_functor(difference_type stride)
|
|
@@ -86,8 +88,10 @@ struct cumo_thrust_minmax_pair
|
|
|
86
88
|
// returns a cumo_thrust_minmax_pair whose minimum and maximum values
|
|
87
89
|
// are initialized to x.
|
|
88
90
|
template <typename T>
|
|
89
|
-
struct cumo_thrust_minmax_unary_op
|
|
91
|
+
struct cumo_thrust_minmax_unary_op
|
|
90
92
|
{
|
|
93
|
+
using argument_type = T;
|
|
94
|
+
using result_type = cumo_thrust_minmax_pair<T>;
|
|
91
95
|
__host__ __device__ cumo_thrust_minmax_pair<T> operator()(const T& x) const
|
|
92
96
|
{
|
|
93
97
|
cumo_thrust_minmax_pair<T> result;
|
|
@@ -102,8 +106,11 @@ struct cumo_thrust_minmax_unary_op : public thrust::unary_function< T, cumo_thru
|
|
|
102
106
|
// maximum values are the min() and max() respectively of
|
|
103
107
|
// the minimums and maximums of the input pairs
|
|
104
108
|
template <typename T>
|
|
105
|
-
struct cumo_thrust_minmax_binary_op
|
|
109
|
+
struct cumo_thrust_minmax_binary_op
|
|
106
110
|
{
|
|
111
|
+
using first_argument_type = cumo_thrust_minmax_pair<T>;
|
|
112
|
+
using second_argument_type = cumo_thrust_minmax_pair<T>;
|
|
113
|
+
using result_type = cumo_thrust_minmax_pair<T>;
|
|
107
114
|
__host__ __device__ cumo_thrust_minmax_pair<T> operator()(const cumo_thrust_minmax_pair<T>& x, const cumo_thrust_minmax_pair<T>& y) const
|
|
108
115
|
{
|
|
109
116
|
cumo_thrust_minmax_pair<T> result;
|
|
@@ -157,10 +164,10 @@ struct cumo_thrust_variance_unary_op
|
|
|
157
164
|
// all values that have been agregated so far
|
|
158
165
|
template <typename T>
|
|
159
166
|
struct cumo_thrust_variance_binary_op
|
|
160
|
-
: public thrust::binary_function<const cumo_thrust_variance_data<T>&,
|
|
161
|
-
const cumo_thrust_variance_data<T>&,
|
|
162
|
-
cumo_thrust_variance_data<T> >
|
|
163
167
|
{
|
|
168
|
+
using first_argument_type = const cumo_thrust_variance_data<T>&;
|
|
169
|
+
using second_argument_type = const cumo_thrust_variance_data<T>&;
|
|
170
|
+
using result_type = cumo_thrust_variance_data<T>;
|
|
164
171
|
__host__ __device__
|
|
165
172
|
cumo_thrust_variance_data<T> operator()(const cumo_thrust_variance_data<T>& x, const cumo_thrust_variance_data <T>& y) const
|
|
166
173
|
{
|
|
@@ -49,10 +49,10 @@ struct cumo_thrust_complex_variance_unary_op
|
|
|
49
49
|
// all values that have been agregated so far
|
|
50
50
|
template <typename T, typename R>
|
|
51
51
|
struct cumo_thrust_complex_variance_binary_op
|
|
52
|
-
: public thrust::binary_function<const cumo_thrust_complex_variance_data<T,R>&,
|
|
53
|
-
const cumo_thrust_complex_variance_data<T,R>&,
|
|
54
|
-
cumo_thrust_complex_variance_data<T,R> >
|
|
55
52
|
{
|
|
53
|
+
using first_argument_type = const cumo_thrust_complex_variance_data<T,R>&;
|
|
54
|
+
using second_argument_type = const cumo_thrust_complex_variance_data<T,R>&;
|
|
55
|
+
using result_type = cumo_thrust_complex_variance_data<T,R>;
|
|
56
56
|
__host__ __device__
|
|
57
57
|
cumo_thrust_complex_variance_data<T,R> operator()(const cumo_thrust_complex_variance_data<T,R>& x, const cumo_thrust_complex_variance_data<T,R>& y) const
|
|
58
58
|
{
|
|
@@ -226,6 +226,7 @@ typedef struct {
|
|
|
226
226
|
typedef struct {
|
|
227
227
|
cumo_narray_t base;
|
|
228
228
|
char *ptr;
|
|
229
|
+
bool owned;
|
|
229
230
|
} cumo_narray_data_t;
|
|
230
231
|
|
|
231
232
|
|
|
@@ -360,6 +361,7 @@ _cumo_na_get_narray_t(VALUE obj, unsigned char cumo_na_type)
|
|
|
360
361
|
#define CUMO_NA_DATA(na) ((cumo_narray_data_t*)(na))
|
|
361
362
|
#define CUMO_NA_VIEW(na) ((cumo_narray_view_t*)(na))
|
|
362
363
|
#define CUMO_NA_DATA_PTR(na) (CUMO_NA_DATA(na)->ptr)
|
|
364
|
+
#define CUMO_NA_DATA_OWNED(na) (CUMO_NA_DATA(na)->owned)
|
|
363
365
|
#define CUMO_NA_VIEW_DATA(na) (CUMO_NA_VIEW(na)->data)
|
|
364
366
|
#define CUMO_NA_VIEW_OFFSET(na) (CUMO_NA_VIEW(na)->offset)
|
|
365
367
|
#define CUMO_NA_VIEW_STRIDX(na) (CUMO_NA_VIEW(na)->stridx)
|
|
@@ -166,14 +166,14 @@ static inline dtype c_log(dtype x) {
|
|
|
166
166
|
static inline dtype c_log2(dtype x) {
|
|
167
167
|
dtype z;
|
|
168
168
|
z = c_log(x);
|
|
169
|
-
z = c_mul_r(
|
|
169
|
+
z = c_mul_r(z,M_LOG2E);
|
|
170
170
|
return z;
|
|
171
171
|
}
|
|
172
172
|
|
|
173
173
|
static inline dtype c_log10(dtype x) {
|
|
174
174
|
dtype z;
|
|
175
175
|
z = c_log(x);
|
|
176
|
-
z = c_mul_r(
|
|
176
|
+
z = c_mul_r(z,M_LOG10E);
|
|
177
177
|
return z;
|
|
178
178
|
}
|
|
179
179
|
|
|
@@ -157,18 +157,27 @@ __host__ __device__ static inline dtype f_seq(dtype x, dtype y, double c)
|
|
|
157
157
|
/* --------- thrust ----------------- */
|
|
158
158
|
#include "cumo/cuda/cumo_thrust_complex.hpp"
|
|
159
159
|
|
|
160
|
-
struct cumo_thrust_plus
|
|
160
|
+
struct cumo_thrust_plus
|
|
161
161
|
{
|
|
162
|
+
using first_argument_type = dtype;
|
|
163
|
+
using second_argument_type = dtype;
|
|
164
|
+
using result_type = dtype;
|
|
162
165
|
__host__ __device__ dtype operator()(dtype x, dtype y) { return m_add(x,y); }
|
|
163
166
|
};
|
|
164
167
|
|
|
165
|
-
struct cumo_thrust_multiplies
|
|
168
|
+
struct cumo_thrust_multiplies
|
|
166
169
|
{
|
|
170
|
+
using first_argument_type = dtype;
|
|
171
|
+
using second_argument_type = dtype;
|
|
172
|
+
using result_type = dtype;
|
|
167
173
|
__host__ __device__ dtype operator()(dtype x, dtype y) { return m_mul(x,y); }
|
|
168
174
|
};
|
|
169
175
|
|
|
170
|
-
struct cumo_thrust_multiplies_mulsum_nan
|
|
176
|
+
struct cumo_thrust_multiplies_mulsum_nan
|
|
171
177
|
{
|
|
178
|
+
using first_argument_type = dtype;
|
|
179
|
+
using second_argument_type = dtype;
|
|
180
|
+
using result_type = dtype;
|
|
172
181
|
__host__ __device__ dtype operator()(dtype x, dtype y) {
|
|
173
182
|
if (not_nan(x) && not_nan(y)) {
|
|
174
183
|
return m_mul(x, y);
|
|
@@ -178,8 +187,10 @@ struct cumo_thrust_multiplies_mulsum_nan : public thrust::binary_function<dtype,
|
|
|
178
187
|
}
|
|
179
188
|
};
|
|
180
189
|
|
|
181
|
-
struct cumo_thrust_square
|
|
190
|
+
struct cumo_thrust_square
|
|
182
191
|
{
|
|
192
|
+
using argument_type = dtype;
|
|
193
|
+
using result_type = dtype;
|
|
183
194
|
__host__ __device__ rtype operator()(const dtype& x) const { return c_abs_square(x); }
|
|
184
195
|
};
|
|
185
196
|
|
|
@@ -72,18 +72,27 @@ __host__ __device__ static inline dtype f_minimum_nan(dtype x, dtype y)
|
|
|
72
72
|
/* --------- thrust ----------------- */
|
|
73
73
|
#include "cumo/cuda/cumo_thrust.hpp"
|
|
74
74
|
|
|
75
|
-
struct cumo_thrust_plus
|
|
75
|
+
struct cumo_thrust_plus
|
|
76
76
|
{
|
|
77
|
+
using first_argument_type = dtype;
|
|
78
|
+
using second_argument_type = dtype;
|
|
79
|
+
using result_type = dtype;
|
|
77
80
|
__host__ __device__ dtype operator()(dtype x, dtype y) { return m_add(x,y); }
|
|
78
81
|
};
|
|
79
82
|
|
|
80
|
-
struct cumo_thrust_multiplies
|
|
83
|
+
struct cumo_thrust_multiplies
|
|
81
84
|
{
|
|
85
|
+
using first_argument_type = dtype;
|
|
86
|
+
using second_argument_type = dtype;
|
|
87
|
+
using result_type = dtype;
|
|
82
88
|
__host__ __device__ dtype operator()(dtype x, dtype y) { return m_mul(x,y); }
|
|
83
89
|
};
|
|
84
90
|
|
|
85
|
-
struct cumo_thrust_multiplies_mulsum_nan
|
|
91
|
+
struct cumo_thrust_multiplies_mulsum_nan
|
|
86
92
|
{
|
|
93
|
+
using first_argument_type = dtype;
|
|
94
|
+
using second_argument_type = dtype;
|
|
95
|
+
using result_type = dtype;
|
|
87
96
|
__host__ __device__ dtype operator()(dtype x, dtype y) {
|
|
88
97
|
if (not_nan(x) && not_nan(y)) {
|
|
89
98
|
return m_mul(x, y);
|
|
@@ -93,8 +102,10 @@ struct cumo_thrust_multiplies_mulsum_nan : public thrust::binary_function<dtype,
|
|
|
93
102
|
}
|
|
94
103
|
};
|
|
95
104
|
|
|
96
|
-
struct cumo_thrust_square
|
|
105
|
+
struct cumo_thrust_square
|
|
97
106
|
{
|
|
107
|
+
using argument_type = dtype;
|
|
108
|
+
using result_type = dtype;
|
|
98
109
|
__host__ __device__ rtype operator()(const dtype& x) const { return m_square(x); }
|
|
99
110
|
};
|
|
100
111
|
|