modelstudio 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. {modelstudio-0.5.0 → modelstudio-0.6.0}/CMakeLists.txt +10 -0
  2. {modelstudio-0.5.0/python/modelstudio.egg-info → modelstudio-0.6.0}/PKG-INFO +46 -9
  3. {modelstudio-0.5.0 → modelstudio-0.6.0}/README.md +45 -8
  4. modelstudio-0.6.0/benchmarks/bench_cuda_elementwise.py +54 -0
  5. modelstudio-0.6.0/benchmarks/bench_cuda_matmul.py +52 -0
  6. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_elementwise.py +0 -1
  7. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_trace.py +0 -1
  8. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/CMakeLists.txt +10 -2
  9. modelstudio-0.6.0/csrc/backends/cuda/README.md +19 -0
  10. modelstudio-0.6.0/csrc/backends/cuda/cuda_backend.cu +28 -0
  11. modelstudio-0.6.0/csrc/backends/cuda/cuda_context.cu +37 -0
  12. modelstudio-0.6.0/csrc/backends/cuda/cuda_context.hpp +10 -0
  13. modelstudio-0.6.0/csrc/backends/cuda/cuda_kernels.hpp +16 -0
  14. modelstudio-0.6.0/csrc/backends/cuda/cuda_memory.cu +34 -0
  15. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cuda/cuda_memory.hpp +2 -0
  16. modelstudio-0.6.0/csrc/backends/cuda/cuda_stream.cu +13 -0
  17. modelstudio-0.6.0/csrc/backends/cuda/cuda_stream.hpp +7 -0
  18. modelstudio-0.6.0/csrc/backends/cuda/kernels/elementwise.cu +27 -0
  19. modelstudio-0.6.0/csrc/backends/cuda/kernels/matmul.cu +13 -0
  20. modelstudio-0.6.0/csrc/backends/cuda/kernels/reductions.cu +15 -0
  21. modelstudio-0.6.0/csrc/bindings/cuda_bindings.cpp +12 -0
  22. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/backend-architecture.md +3 -3
  23. modelstudio-0.6.0/docs/backend-status.md +37 -0
  24. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/comparison-ops.md +1 -2
  25. modelstudio-0.6.0/docs/cuda.md +53 -0
  26. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/functional-api.md +0 -1
  27. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/linalg.md +0 -1
  28. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/native-backend-roadmap.md +12 -4
  29. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/random.md +0 -1
  30. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tensor-creation.md +1 -1
  31. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tracing.md +0 -1
  32. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/training.md +1 -1
  33. modelstudio-0.6.0/examples/backend_status.py +26 -0
  34. modelstudio-0.6.0/examples/cuda_mlp_demo.py +45 -0
  35. modelstudio-0.6.0/examples/cuda_tensor_demo.py +33 -0
  36. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/functional_training.py +0 -1
  37. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/random_linalg_demo.py +0 -1
  38. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/tracing_demo.py +0 -1
  39. {modelstudio-0.5.0 → modelstudio-0.6.0}/pyproject.toml +1 -1
  40. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/__init__.py +2 -1
  41. modelstudio-0.6.0/python/modelstudio/_version.py +1 -0
  42. modelstudio-0.6.0/python/modelstudio/backends/__init__.py +3 -0
  43. modelstudio-0.6.0/python/modelstudio/backends/cuda.py +84 -0
  44. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/backends/status.py +19 -3
  45. modelstudio-0.6.0/python/modelstudio/cuda/__init__.py +13 -0
  46. modelstudio-0.6.0/python/modelstudio/cuda/device.py +45 -0
  47. modelstudio-0.6.0/python/modelstudio/cuda/memory.py +21 -0
  48. modelstudio-0.6.0/python/modelstudio/cuda/streams.py +12 -0
  49. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/linalg.py +0 -1
  50. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/comparison.py +0 -1
  51. {modelstudio-0.5.0 → modelstudio-0.6.0/python/modelstudio.egg-info}/PKG-INFO +46 -9
  52. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/SOURCES.txt +26 -0
  53. {modelstudio-0.5.0 → modelstudio-0.6.0}/scripts/smoke_test.py +16 -7
  54. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_backend_status.py +7 -3
  55. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_comparison_ops.py +0 -1
  56. modelstudio-0.6.0/tests/test_cuda_autograd.py +72 -0
  57. modelstudio-0.6.0/tests/test_cuda_availability.py +77 -0
  58. modelstudio-0.6.0/tests/test_cuda_memory.py +43 -0
  59. modelstudio-0.6.0/tests/test_cuda_nn.py +36 -0
  60. modelstudio-0.6.0/tests/test_cuda_ops.py +93 -0
  61. modelstudio-0.6.0/tests/test_cuda_tensor.py +52 -0
  62. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_linalg.py +0 -1
  63. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_native_cpu_mode.py +0 -1
  64. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_public_exports.py +2 -2
  65. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_random_namespace.py +0 -1
  66. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_scalar_behavior.py +0 -1
  67. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_serialization_hardening.py +0 -1
  68. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_trace.py +0 -1
  69. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_version.py +1 -1
  70. modelstudio-0.5.0/csrc/backends/cuda/README.md +0 -14
  71. modelstudio-0.5.0/csrc/backends/cuda/cuda_backend.cu +0 -32
  72. modelstudio-0.5.0/docs/backend-status.md +0 -21
  73. modelstudio-0.5.0/examples/backend_status.py +0 -16
  74. modelstudio-0.5.0/python/modelstudio/_version.py +0 -1
  75. modelstudio-0.5.0/python/modelstudio/backends/__init__.py +0 -4
  76. {modelstudio-0.5.0 → modelstudio-0.6.0}/LICENSE +0 -0
  77. {modelstudio-0.5.0 → modelstudio-0.6.0}/MANIFEST.in +0 -0
  78. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_attention.py +0 -0
  79. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_conv.py +0 -0
  80. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_creation.py +0 -0
  81. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_dataloader.py +0 -0
  82. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_dropout.py +0 -0
  83. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_manipulation.py +0 -0
  84. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_matmul.py +0 -0
  85. {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_mlp.py +0 -0
  86. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/cpu_backend.cpp +0 -0
  87. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/cpu_backend.hpp +0 -0
  88. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/add.cpp +0 -0
  89. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/matmul.cpp +0 -0
  90. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/mul.cpp +0 -0
  91. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/relu.cpp +0 -0
  92. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cuda/cuda_backend.hpp +0 -0
  93. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/README.md +0 -0
  94. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/oneapi_backend.cpp +0 -0
  95. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/oneapi_backend.hpp +0 -0
  96. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/sycl_memory.hpp +0 -0
  97. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/README.md +0 -0
  98. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/hip_memory.hpp +0 -0
  99. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/rocm_backend.cpp +0 -0
  100. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/rocm_backend.hpp +0 -0
  101. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/bindings/python_bindings.cpp +0 -0
  102. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/device.hpp +0 -0
  103. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/dtype.hpp +0 -0
  104. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/error.hpp +0 -0
  105. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/shape.hpp +0 -0
  106. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/storage.hpp +0 -0
  107. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/tensor.hpp +0 -0
  108. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/dispatcher/backend.hpp +0 -0
  109. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/dispatcher/dispatcher.hpp +0 -0
  110. {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/dispatcher/operator_registry.hpp +0 -0
  111. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/autograd.md +0 -0
  112. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/checkpointing.md +0 -0
  113. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/data.md +0 -0
  114. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/metrics.md +0 -0
  115. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/modules.md +0 -0
  116. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/nn.md +0 -0
  117. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/numpy-interop.md +0 -0
  118. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/optimizers.md +0 -0
  119. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/randomness.md +0 -0
  120. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/releasing.md +0 -0
  121. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/serialization.md +0 -0
  122. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tensor-api.md +0 -0
  123. {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tensor-manipulation.md +0 -0
  124. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/checkpoint_resume.py +0 -0
  125. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/checkpoint_training.py +0 -0
  126. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/dropout_batchnorm.py +0 -0
  127. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/metrics_demo.py +0 -0
  128. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/numpy_interop.py +0 -0
  129. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/save_load.py +0 -0
  130. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/scheduler_training.py +0 -0
  131. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/tiny_transformer.py +0 -0
  132. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/train_classifier.py +0 -0
  133. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/train_cnn_toy.py +0 -0
  134. {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/train_mlp.py +0 -0
  135. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/__init__.py +0 -0
  136. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/engine.py +0 -0
  137. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/function.py +0 -0
  138. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/grad_mode.py +0 -0
  139. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/__init__.py +0 -0
  140. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/graph_capture.py +0 -0
  141. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/ir.py +0 -0
  142. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/passes.py +0 -0
  143. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/data/__init__.py +0 -0
  144. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/data/dataloader.py +0 -0
  145. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/data/dataset.py +0 -0
  146. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/device.py +0 -0
  147. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/dtypes.py +0 -0
  148. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/errors.py +0 -0
  149. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/interop.py +0 -0
  150. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/metrics/__init__.py +0 -0
  151. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/metrics/classification.py +0 -0
  152. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/__init__.py +0 -0
  153. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/activations.py +0 -0
  154. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/convolution.py +0 -0
  155. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/embedding.py +0 -0
  156. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/functional.py +0 -0
  157. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/init.py +0 -0
  158. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/linear.py +0 -0
  159. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/losses.py +0 -0
  160. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/module.py +0 -0
  161. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/normalization.py +0 -0
  162. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/parameter.py +0 -0
  163. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/pooling.py +0 -0
  164. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/transformer.py +0 -0
  165. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/utils.py +0 -0
  166. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/__init__.py +0 -0
  167. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/creation.py +0 -0
  168. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/linalg.py +0 -0
  169. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/math.py +0 -0
  170. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/movement.py +0 -0
  171. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/reductions.py +0 -0
  172. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/__init__.py +0 -0
  173. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/adamw.py +0 -0
  174. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/lr_scheduler.py +0 -0
  175. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/optimizer.py +0 -0
  176. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/sgd.py +0 -0
  177. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/py.typed +0 -0
  178. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/random.py +0 -0
  179. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/runtime/__init__.py +0 -0
  180. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/runtime/backend.py +0 -0
  181. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/runtime/dispatcher.py +0 -0
  182. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/serialization.py +0 -0
  183. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/storage.py +0 -0
  184. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/tensor.py +0 -0
  185. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/testing/__init__.py +0 -0
  186. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/testing/gradcheck.py +0 -0
  187. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/dependency_links.txt +0 -0
  188. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/requires.txt +0 -0
  189. {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/top_level.txt +0 -0
  190. {modelstudio-0.5.0 → modelstudio-0.6.0}/setup.cfg +0 -0
  191. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_activations_more.py +0 -0
  192. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_attention.py +0 -0
  193. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_autograd.py +0 -0
  194. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_batchnorm.py +0 -0
  195. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_buffers.py +0 -0
  196. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_checkpoint_helpers.py +0 -0
  197. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_clone_copy.py +0 -0
  198. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_concat_stack.py +0 -0
  199. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_conv.py +0 -0
  200. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_creation_more.py +0 -0
  201. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_data.py +0 -0
  202. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_data_split.py +0 -0
  203. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dataloader_seed.py +0 -0
  204. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dispatcher.py +0 -0
  205. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dropout.py +0 -0
  206. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dtype_conversion.py +0 -0
  207. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_embedding.py +0 -0
  208. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_functional.py +0 -0
  209. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_grad_clip.py +0 -0
  210. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_gradcheck.py +0 -0
  211. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_indexing.py +0 -0
  212. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_indexing_assignment.py +0 -0
  213. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_init.py +0 -0
  214. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_loss_reductions.py +0 -0
  215. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_losses.py +0 -0
  216. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_lr_scheduler.py +0 -0
  217. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_manipulation_ops.py +0 -0
  218. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_metrics.py +0 -0
  219. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_module_ergonomics.py +0 -0
  220. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_nn.py +0 -0
  221. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_norms.py +0 -0
  222. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_numpy_interop.py +0 -0
  223. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_ops.py +0 -0
  224. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_optim.py +0 -0
  225. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_optimizer_param_groups.py +0 -0
  226. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_optimizer_state.py +0 -0
  227. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_pooling.py +0 -0
  228. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_random.py +0 -0
  229. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_reductions_axis.py +0 -0
  230. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_serialization.py +0 -0
  231. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_shape_ops.py +0 -0
  232. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_state_dict.py +0 -0
  233. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_tensor.py +0 -0
  234. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_transformer.py +0 -0
  235. {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_unary_ops.py +0 -0
@@ -5,6 +5,16 @@ option(MODELSTUDIO_ENABLE_CUDA "Build CUDA backend" OFF)
5
5
  option(MODELSTUDIO_ENABLE_ROCM "Build ROCm backend" OFF)
6
6
  option(MODELSTUDIO_ENABLE_ONEAPI "Build oneAPI backend" OFF)
7
7
 
8
+ if(MODELSTUDIO_ENABLE_CUDA)
9
+ include(CheckLanguage)
10
+ check_language(CUDA)
11
+ if(NOT CMAKE_CUDA_COMPILER)
12
+ message(FATAL_ERROR "MODELSTUDIO_ENABLE_CUDA=ON requires an NVIDIA CUDA compiler/toolkit, but none was found.")
13
+ endif()
14
+ enable_language(CUDA)
15
+ find_package(CUDAToolkit REQUIRED)
16
+ endif()
17
+
8
18
  set(CMAKE_CXX_STANDARD 20)
9
19
  set(CMAKE_CXX_STANDARD_REQUIRED ON)
10
20
  set(CMAKE_CXX_EXTENSIONS OFF)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modelstudio
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: An early-stage AI tensor framework with CPU tensors, autograd, and backend extension scaffolding.
5
5
  Author: ModelStudio Contributors
6
6
  License-Expression: MIT
@@ -31,14 +31,14 @@ Dynamic: license-file
31
31
 
32
32
  # ModelStudio
33
33
 
34
- ModelStudio is an early-stage AI tensor framework. Version `0.5.0` provides a
34
+ ModelStudio is an early-stage AI tensor framework. Version `0.6.0` provides a
35
35
  CPU tensor/autograd MVP with neural-network modules, optimizers, serialization,
36
- data loading, graph tracing metadata, backend status inspection, and small
37
- LLM-oriented building blocks.
36
+ data loading, graph tracing metadata, backend status inspection, a public CUDA
37
+ availability namespace, and small LLM-oriented building blocks.
38
38
 
39
- It is not a PyTorch or TensorFlow replacement. CPU is the only working backend.
40
- CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels are built
41
- and tested.
39
+ It is not a PyTorch or TensorFlow replacement. The default PyPI package is
40
+ CPU-only. CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels
41
+ are built and tested in hardware-backed environments.
42
42
 
43
43
  ## Installation
44
44
 
@@ -74,6 +74,24 @@ python -m pip install -e ".[dev]"
74
74
  | Interop | `asarray`, `from_numpy`, `to_numpy`, and `ms.numpy` |
75
75
  | Metrics | accuracy and top-k accuracy |
76
76
  | Compiler | Metadata-only tracing plus placeholder IR and passes |
77
+ | CUDA API | Availability, device-count, sync, and memory-status facade; tensor execution is not implemented in the CPU wheel |
78
+
79
+ ## Architecture
80
+
81
+ ```text
82
+ Python frontend
83
+ -> Tensor, nn, optim, autograd, ops
84
+ -> runtime dispatcher
85
+ -> backend interface
86
+ -> NumPy CPU backend today
87
+ -> optional native CPU / CUDA / ROCm / oneAPI extensions later
88
+
89
+ Native scaffold
90
+ -> core metadata
91
+ -> dispatcher interfaces
92
+ -> CPU kernel prototypes
93
+ -> CUDA, ROCm, oneAPI backend directories
94
+ ```
77
95
 
78
96
  ## Backend Status
79
97
 
@@ -89,7 +107,7 @@ Expected shape:
89
107
  ```python
90
108
  {
91
109
  "cpu": {"available": True, "native": False},
92
- "cuda": {"available": False, "reason": "..."},
110
+ "cuda": {"available": False, "built": False, "device_count": 0, "reason": "..."},
93
111
  "rocm": {"available": False, "reason": "..."},
94
112
  "oneapi": {"available": False, "reason": "..."},
95
113
  }
@@ -100,6 +118,17 @@ raises `ModelStudioBackendUnavailable` unless a future optional native extension
100
118
  is actually installed. Unsupported accelerator devices fail with
101
119
  `ModelStudioBackendUnavailable`.
102
120
 
121
+ CUDA availability can also be checked through the public namespace:
122
+
123
+ ```python
124
+ print(ms.cuda.is_available())
125
+ print(ms.cuda.device_count())
126
+ print(ms.cuda.memory_summary())
127
+ ```
128
+
129
+ In the CPU-only wheel, explicit CUDA tensor requests raise a clear runtime error
130
+ instead of falling back to CPU.
131
+
103
132
  ## Tensor Example
104
133
 
105
134
  ```python
@@ -195,6 +224,8 @@ python examples/backend_status.py
195
224
  python examples/tracing_demo.py
196
225
  python examples/functional_training.py
197
226
  python examples/random_linalg_demo.py
227
+ python examples/cuda_tensor_demo.py
228
+ python examples/cuda_mlp_demo.py
198
229
  python benchmarks/bench_matmul.py
199
230
  python benchmarks/bench_mlp.py
200
231
  python benchmarks/bench_attention.py
@@ -205,11 +236,14 @@ python benchmarks/bench_creation.py
205
236
  python benchmarks/bench_manipulation.py
206
237
  python benchmarks/bench_elementwise.py
207
238
  python benchmarks/bench_trace.py
239
+ python benchmarks/bench_cuda_elementwise.py
240
+ python benchmarks/bench_cuda_matmul.py
208
241
  ```
209
242
 
210
243
  ## Documentation
211
244
 
212
245
  - [Backend status](docs/backend-status.md)
246
+ - [CUDA status](docs/cuda.md)
213
247
  - [Tracing](docs/tracing.md)
214
248
  - [Functional API](docs/functional-api.md)
215
249
  - [Random namespace](docs/random.md)
@@ -237,5 +271,8 @@ python benchmarks/bench_trace.py
237
271
 
238
272
  - Expand tensor and autograd coverage.
239
273
  - Wire optional native CPU kernels only after a safe Python extension exists.
240
- - Add tested CUDA, ROCm, and oneAPI packages when hardware-backed CI exists.
274
+ - Build a real optional CUDA package after tensor storage, kernels, bindings,
275
+ and hardware-backed CI are in place.
276
+ - Add tested ROCm and oneAPI packages after CUDA establishes the accelerator
277
+ backend contract.
241
278
  - Improve compiler graph capture, analysis passes, and lowering.
@@ -1,13 +1,13 @@
1
1
  # ModelStudio
2
2
 
3
- ModelStudio is an early-stage AI tensor framework. Version `0.5.0` provides a
3
+ ModelStudio is an early-stage AI tensor framework. Version `0.6.0` provides a
4
4
  CPU tensor/autograd MVP with neural-network modules, optimizers, serialization,
5
- data loading, graph tracing metadata, backend status inspection, and small
6
- LLM-oriented building blocks.
5
+ data loading, graph tracing metadata, backend status inspection, a public CUDA
6
+ availability namespace, and small LLM-oriented building blocks.
7
7
 
8
- It is not a PyTorch or TensorFlow replacement. CPU is the only working backend.
9
- CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels are built
10
- and tested.
8
+ It is not a PyTorch or TensorFlow replacement. The default PyPI package is
9
+ CPU-only. CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels
10
+ are built and tested in hardware-backed environments.
11
11
 
12
12
  ## Installation
13
13
 
@@ -43,6 +43,24 @@ python -m pip install -e ".[dev]"
43
43
  | Interop | `asarray`, `from_numpy`, `to_numpy`, and `ms.numpy` |
44
44
  | Metrics | accuracy and top-k accuracy |
45
45
  | Compiler | Metadata-only tracing plus placeholder IR and passes |
46
+ | CUDA API | Availability, device-count, sync, and memory-status facade; tensor execution is not implemented in the CPU wheel |
47
+
48
+ ## Architecture
49
+
50
+ ```text
51
+ Python frontend
52
+ -> Tensor, nn, optim, autograd, ops
53
+ -> runtime dispatcher
54
+ -> backend interface
55
+ -> NumPy CPU backend today
56
+ -> optional native CPU / CUDA / ROCm / oneAPI extensions later
57
+
58
+ Native scaffold
59
+ -> core metadata
60
+ -> dispatcher interfaces
61
+ -> CPU kernel prototypes
62
+ -> CUDA, ROCm, oneAPI backend directories
63
+ ```
46
64
 
47
65
  ## Backend Status
48
66
 
@@ -58,7 +76,7 @@ Expected shape:
58
76
  ```python
59
77
  {
60
78
  "cpu": {"available": True, "native": False},
61
- "cuda": {"available": False, "reason": "..."},
79
+ "cuda": {"available": False, "built": False, "device_count": 0, "reason": "..."},
62
80
  "rocm": {"available": False, "reason": "..."},
63
81
  "oneapi": {"available": False, "reason": "..."},
64
82
  }
@@ -69,6 +87,17 @@ raises `ModelStudioBackendUnavailable` unless a future optional native extension
69
87
  is actually installed. Unsupported accelerator devices fail with
70
88
  `ModelStudioBackendUnavailable`.
71
89
 
90
+ CUDA availability can also be checked through the public namespace:
91
+
92
+ ```python
93
+ print(ms.cuda.is_available())
94
+ print(ms.cuda.device_count())
95
+ print(ms.cuda.memory_summary())
96
+ ```
97
+
98
+ In the CPU-only wheel, explicit CUDA tensor requests raise a clear runtime error
99
+ instead of falling back to CPU.
100
+
72
101
  ## Tensor Example
73
102
 
74
103
  ```python
@@ -164,6 +193,8 @@ python examples/backend_status.py
164
193
  python examples/tracing_demo.py
165
194
  python examples/functional_training.py
166
195
  python examples/random_linalg_demo.py
196
+ python examples/cuda_tensor_demo.py
197
+ python examples/cuda_mlp_demo.py
167
198
  python benchmarks/bench_matmul.py
168
199
  python benchmarks/bench_mlp.py
169
200
  python benchmarks/bench_attention.py
@@ -174,11 +205,14 @@ python benchmarks/bench_creation.py
174
205
  python benchmarks/bench_manipulation.py
175
206
  python benchmarks/bench_elementwise.py
176
207
  python benchmarks/bench_trace.py
208
+ python benchmarks/bench_cuda_elementwise.py
209
+ python benchmarks/bench_cuda_matmul.py
177
210
  ```
178
211
 
179
212
  ## Documentation
180
213
 
181
214
  - [Backend status](docs/backend-status.md)
215
+ - [CUDA status](docs/cuda.md)
182
216
  - [Tracing](docs/tracing.md)
183
217
  - [Functional API](docs/functional-api.md)
184
218
  - [Random namespace](docs/random.md)
@@ -206,5 +240,8 @@ python benchmarks/bench_trace.py
206
240
 
207
241
  - Expand tensor and autograd coverage.
208
242
  - Wire optional native CPU kernels only after a safe Python extension exists.
209
- - Add tested CUDA, ROCm, and oneAPI packages when hardware-backed CI exists.
243
+ - Build a real optional CUDA package after tensor storage, kernels, bindings,
244
+ and hardware-backed CI are in place.
245
+ - Add tested ROCm and oneAPI packages after CUDA establishes the accelerator
246
+ backend contract.
210
247
  - Improve compiler graph capture, analysis passes, and lowering.
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ import platform
4
+ import time
5
+ from collections.abc import Callable
6
+
7
+ import modelstudio as ms
8
+
9
+
10
+ def _time_ms(fn: Callable[[], object], warmup: int, iterations: int, *, synchronize: bool) -> float:
11
+ for _ in range(warmup):
12
+ fn()
13
+ if synchronize:
14
+ ms.cuda.synchronize()
15
+ start = time.perf_counter()
16
+ for _ in range(iterations):
17
+ fn()
18
+ if synchronize:
19
+ ms.cuda.synchronize()
20
+ return (time.perf_counter() - start) * 1000.0 / iterations
21
+
22
+
23
+ def main() -> None:
24
+ shape = (1024, 1024)
25
+ warmup = 5
26
+ iterations = 50
27
+
28
+ print(f"Python: {platform.python_version()}")
29
+ print(f"NumPy: {ms.numpy.__version__}")
30
+ print(f"ModelStudio: {ms.__version__}")
31
+ print(f"CUDA: available={ms.cuda.is_available()} device_count={ms.cuda.device_count()}")
32
+ print(f"Shape: {shape}")
33
+ print(f"Warmup: {warmup}")
34
+ print(f"Iterations: {iterations}")
35
+
36
+ if not ms.cuda.is_available():
37
+ print(ms.cuda.memory_summary())
38
+ print("Skipping CUDA elementwise benchmark because CUDA tensor execution is not available.")
39
+ return
40
+
41
+ ms.manual_seed(123)
42
+ x = ms.randn(shape, device="cuda")
43
+ y = ms.randn(shape, device="cuda")
44
+
45
+ add_ms = _time_ms(lambda: x + y, warmup, iterations, synchronize=True)
46
+ relu_ms = _time_ms(lambda: ms.relu(x), warmup, iterations, synchronize=True)
47
+
48
+ print(f"CUDA add avg: {add_ms:.3f} ms")
49
+ print(f"CUDA relu avg: {relu_ms:.3f} ms")
50
+ print(ms.cuda.memory_summary())
51
+
52
+
53
+ if __name__ == "__main__":
54
+ main()
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import platform
4
+ import time
5
+ from collections.abc import Callable
6
+
7
+ import modelstudio as ms
8
+
9
+
10
+ def _time_ms(fn: Callable[[], object], warmup: int, iterations: int, *, synchronize: bool) -> float:
11
+ for _ in range(warmup):
12
+ fn()
13
+ if synchronize:
14
+ ms.cuda.synchronize()
15
+ start = time.perf_counter()
16
+ for _ in range(iterations):
17
+ fn()
18
+ if synchronize:
19
+ ms.cuda.synchronize()
20
+ return (time.perf_counter() - start) * 1000.0 / iterations
21
+
22
+
23
+ def main() -> None:
24
+ shape = (512, 512)
25
+ warmup = 3
26
+ iterations = 20
27
+
28
+ print(f"Python: {platform.python_version()}")
29
+ print(f"NumPy: {ms.numpy.__version__}")
30
+ print(f"ModelStudio: {ms.__version__}")
31
+ print(f"CUDA: available={ms.cuda.is_available()} device_count={ms.cuda.device_count()}")
32
+ print(f"Shape: {shape} x {shape}")
33
+ print(f"Warmup: {warmup}")
34
+ print(f"Iterations: {iterations}")
35
+
36
+ if not ms.cuda.is_available():
37
+ print(ms.cuda.memory_summary())
38
+ print("Skipping CUDA matmul benchmark because CUDA tensor execution is not available.")
39
+ return
40
+
41
+ ms.manual_seed(123)
42
+ a = ms.randn(shape, device="cuda")
43
+ b = ms.randn(shape, device="cuda")
44
+
45
+ matmul_ms = _time_ms(lambda: a @ b, warmup, iterations, synchronize=True)
46
+
47
+ print(f"CUDA matmul avg: {matmul_ms:.3f} ms")
48
+ print(ms.cuda.memory_summary())
49
+
50
+
51
+ if __name__ == "__main__":
52
+ main()
@@ -41,4 +41,3 @@ def main() -> None:
41
41
 
42
42
  if __name__ == "__main__":
43
43
  main()
44
-
@@ -44,4 +44,3 @@ def main() -> None:
44
44
 
45
45
  if __name__ == "__main__":
46
46
  main()
47
-
@@ -9,9 +9,17 @@ add_library(modelstudio_native STATIC
9
9
  target_include_directories(modelstudio_native PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
10
10
 
11
11
  if(MODELSTUDIO_ENABLE_CUDA)
12
- enable_language(CUDA)
13
- target_sources(modelstudio_native PRIVATE backends/cuda/cuda_backend.cu)
12
+ target_sources(modelstudio_native PRIVATE
13
+ backends/cuda/cuda_backend.cu
14
+ backends/cuda/cuda_context.cu
15
+ backends/cuda/cuda_memory.cu
16
+ backends/cuda/cuda_stream.cu
17
+ backends/cuda/kernels/elementwise.cu
18
+ backends/cuda/kernels/reductions.cu
19
+ backends/cuda/kernels/matmul.cu
20
+ )
14
21
  target_compile_definitions(modelstudio_native PUBLIC MODELSTUDIO_ENABLE_CUDA=1)
22
+ target_link_libraries(modelstudio_native PUBLIC CUDA::cudart CUDA::cublas)
15
23
  endif()
16
24
 
17
25
  if(MODELSTUDIO_ENABLE_ROCM)
@@ -0,0 +1,19 @@
1
+ # CUDA Backend
2
+
3
+ This directory is scaffolding for a future NVIDIA CUDA backend.
4
+
5
+ Current status:
6
+ - Not built by default.
7
+ - Enabled only with `MODELSTUDIO_ENABLE_CUDA=ON`.
8
+ - Python CPU users do not import or depend on CUDA artifacts.
9
+ - Context, allocator, stream, and kernel entry-point files are present as
10
+ scaffolding only.
11
+
12
+ Implementation path:
13
+ 1. Track allocation sizes and ownership in the CUDA allocator.
14
+ 2. Add device tensor storage and shape/stride views.
15
+ 3. Replace placeholder kernel entry points with tested CUDA kernels.
16
+ 4. Bind CUDA runtime functions and tensors into Python.
17
+ 5. Register the native backend with the dispatcher only when all required ops
18
+ are implemented.
19
+ 6. Ship as an optional package such as `modelstudio-cuda`.
@@ -0,0 +1,28 @@
1
+ #include "backends/cuda/cuda_backend.hpp"
2
+
3
+ #include "backends/cuda/cuda_kernels.hpp"
4
+ #include "core/error.hpp"
5
+
6
+ namespace modelstudio::cuda {
7
+
8
+ Tensor CUDABackend::empty(const Shape&, DType) {
9
+ throw Error("CUDA tensor allocation is scaffolded but not wired into Python yet");
10
+ }
11
+
12
+ Tensor CUDABackend::add(const Tensor& lhs, const Tensor& rhs) {
13
+ return add_kernel(lhs, rhs);
14
+ }
15
+
16
+ Tensor CUDABackend::mul(const Tensor& lhs, const Tensor& rhs) {
17
+ return mul_kernel(lhs, rhs);
18
+ }
19
+
20
+ Tensor CUDABackend::matmul(const Tensor& lhs, const Tensor& rhs) {
21
+ return matmul_cublas(lhs, rhs);
22
+ }
23
+
24
+ Tensor CUDABackend::relu(const Tensor& input) {
25
+ return relu_kernel(input);
26
+ }
27
+
28
+ } // namespace modelstudio::cuda
@@ -0,0 +1,37 @@
1
+ #include "backends/cuda/cuda_context.hpp"
2
+
3
+ #include <cuda_runtime_api.h>
4
+
5
+ #include <string>
6
+
7
+ #include "core/error.hpp"
8
+
9
+ namespace modelstudio::cuda {
10
+
11
+ void check_cuda(int status, const char* operation) {
12
+ if (status != cudaSuccess) {
13
+ throw Error(std::string(operation) + " failed: " + cudaGetErrorString(static_cast<cudaError_t>(status)));
14
+ }
15
+ }
16
+
17
+ int device_count() {
18
+ int count = 0;
19
+ auto status = cudaGetDeviceCount(&count);
20
+ if (status == cudaErrorNoDevice || status == cudaErrorInsufficientDriver) {
21
+ return 0;
22
+ }
23
+ check_cuda(status, "cudaGetDeviceCount");
24
+ return count;
25
+ }
26
+
27
+ int current_device() {
28
+ int device = 0;
29
+ check_cuda(cudaGetDevice(&device), "cudaGetDevice");
30
+ return device;
31
+ }
32
+
33
+ void set_device(int index) {
34
+ check_cuda(cudaSetDevice(index), "cudaSetDevice");
35
+ }
36
+
37
+ } // namespace modelstudio::cuda
@@ -0,0 +1,10 @@
1
+ #pragma once
2
+
3
namespace modelstudio::cuda {

// Error helper: takes a CUDA status code (as int) and the name of the
// operation that produced it.
void check_cuda(int status, const char* operation);

// Device queries.
int device_count();
int current_device();

// Device selection by index.
void set_device(int index);

}  // namespace modelstudio::cuda
@@ -0,0 +1,16 @@
1
+ #pragma once
2
+
3
+ #include "core/tensor.hpp"
4
+
5
+ namespace modelstudio::cuda {
6
+
7
+ Tensor add_kernel(const Tensor& lhs, const Tensor& rhs);
8
+ Tensor sub_kernel(const Tensor& lhs, const Tensor& rhs);
9
+ Tensor mul_kernel(const Tensor& lhs, const Tensor& rhs);
10
+ Tensor div_kernel(const Tensor& lhs, const Tensor& rhs);
11
+ Tensor relu_kernel(const Tensor& input);
12
+ Tensor sum_kernel(const Tensor& input);
13
+ Tensor mean_kernel(const Tensor& input);
14
+ Tensor matmul_cublas(const Tensor& lhs, const Tensor& rhs);
15
+
16
+ } // namespace modelstudio::cuda
@@ -0,0 +1,34 @@
1
+ #include "backends/cuda/cuda_memory.hpp"
2
+
3
+ #include <cuda_runtime_api.h>
4
+
5
+ #include <atomic>
6
+ #include <cstddef>
7
+
8
+ #include "backends/cuda/cuda_context.hpp"
9
+
10
+ namespace modelstudio::cuda {
11
+
12
+ namespace {
13
+ std::atomic<unsigned long long> g_allocated_bytes{0};
14
+ }
15
+
16
+ void* CUDAMemoryAllocator::allocate(unsigned long long bytes) {
17
+ void* ptr = nullptr;
18
+ check_cuda(cudaMalloc(&ptr, static_cast<std::size_t>(bytes)), "cudaMalloc");
19
+ g_allocated_bytes.fetch_add(bytes, std::memory_order_relaxed);
20
+ return ptr;
21
+ }
22
+
23
+ void CUDAMemoryAllocator::deallocate(void* ptr) {
24
+ if (ptr == nullptr) {
25
+ return;
26
+ }
27
+ check_cuda(cudaFree(ptr), "cudaFree");
28
+ }
29
+
30
+ unsigned long long allocated_bytes() {
31
+ return g_allocated_bytes.load(std::memory_order_relaxed);
32
+ }
33
+
34
+ } // namespace modelstudio::cuda
@@ -9,4 +9,6 @@ class CUDAMemoryAllocator {
9
9
  void deallocate(void* ptr);
10
10
  };
11
11
 
12
+ unsigned long long allocated_bytes();
13
+
12
14
  } // namespace modelstudio::cuda
@@ -0,0 +1,13 @@
1
+ #include "backends/cuda/cuda_stream.hpp"
2
+
3
+ #include <cuda_runtime_api.h>
4
+
5
+ #include "backends/cuda/cuda_context.hpp"
6
+
7
+ namespace modelstudio::cuda {
8
+
9
+ void synchronize_device() {
10
+ check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
11
+ }
12
+
13
+ } // namespace modelstudio::cuda
@@ -0,0 +1,7 @@
1
+ #pragma once
2
+
3
namespace modelstudio::cuda {

// Device-wide synchronization entry point; takes no arguments and returns
// nothing.
void synchronize_device();

}  // namespace modelstudio::cuda
@@ -0,0 +1,27 @@
1
+ #include "backends/cuda/cuda_kernels.hpp"
2
+
3
+ #include "core/error.hpp"
4
+
5
+ namespace modelstudio::cuda {
6
+
7
+ Tensor add_kernel(const Tensor&, const Tensor&) {
8
+ throw Error("CUDA add kernel is scaffolded but not wired into Python yet");
9
+ }
10
+
11
+ Tensor sub_kernel(const Tensor&, const Tensor&) {
12
+ throw Error("CUDA sub kernel is scaffolded but not wired into Python yet");
13
+ }
14
+
15
+ Tensor mul_kernel(const Tensor&, const Tensor&) {
16
+ throw Error("CUDA mul kernel is scaffolded but not wired into Python yet");
17
+ }
18
+
19
+ Tensor div_kernel(const Tensor&, const Tensor&) {
20
+ throw Error("CUDA div kernel is scaffolded but not wired into Python yet");
21
+ }
22
+
23
+ Tensor relu_kernel(const Tensor&) {
24
+ throw Error("CUDA relu kernel is scaffolded but not wired into Python yet");
25
+ }
26
+
27
+ } // namespace modelstudio::cuda
@@ -0,0 +1,13 @@
1
+ #include "backends/cuda/cuda_kernels.hpp"
2
+
3
+ #include <cublas_v2.h>
4
+
5
+ #include "core/error.hpp"
6
+
7
+ namespace modelstudio::cuda {
8
+
9
+ Tensor matmul_cublas(const Tensor&, const Tensor&) {
10
+ throw Error("CUDA cuBLAS matmul is scaffolded but not wired into Python yet");
11
+ }
12
+
13
+ } // namespace modelstudio::cuda
@@ -0,0 +1,15 @@
1
+ #include "backends/cuda/cuda_kernels.hpp"
2
+
3
+ #include "core/error.hpp"
4
+
5
+ namespace modelstudio::cuda {
6
+
7
+ Tensor sum_kernel(const Tensor&) {
8
+ throw Error("CUDA sum reduction is scaffolded but not wired into Python yet");
9
+ }
10
+
11
+ Tensor mean_kernel(const Tensor&) {
12
+ throw Error("CUDA mean reduction is scaffolded but not wired into Python yet");
13
+ }
14
+
15
+ } // namespace modelstudio::cuda
@@ -0,0 +1,12 @@
1
+ #include "backends/cuda/cuda_context.hpp"
2
+ #include "backends/cuda/cuda_memory.hpp"
3
+ #include "backends/cuda/cuda_stream.hpp"
4
+
5
namespace modelstudio::bindings {

// Reserved hook for registering the CUDA pieces with a Python extension.
// It is intentionally empty. This file is not part of the CPU-only wheel
// build; CUDA bindings become active only once MODELSTUDIO_ENABLE_CUDA=ON is
// set and a binding layer exists.
void register_cuda_bindings_placeholder() {
}

}  // namespace modelstudio::bindings
@@ -16,7 +16,7 @@ Tensor API
16
16
  | Backend | Status |
17
17
  | --- | --- |
18
18
  | CPU | Working MVP backed by NumPy |
19
- | CUDA | Scaffold only |
19
+ | CUDA | Public status namespace plus native scaffold; no tensor execution in the CPU wheel |
20
20
  | ROCm | Scaffold only |
21
21
  | oneAPI | Scaffold only |
22
22
 
@@ -46,7 +46,7 @@ The native scaffolding under `csrc/` mirrors the Python runtime:
46
46
  - `csrc/core`: dtype, device, shape, tensor metadata, storage
47
47
  - `csrc/dispatcher`: backend interface and operator registry
48
48
  - `csrc/backends/cpu`: native CPU backend and kernels
49
- - `csrc/backends/cuda`: CUDA placeholders
49
+ - `csrc/backends/cuda`: CUDA context, memory, stream, and kernel scaffolds
50
50
  - `csrc/backends/rocm`: ROCm/HIP placeholders
51
51
  - `csrc/backends/oneapi`: oneAPI/SYCL placeholders
52
52
 
@@ -58,7 +58,7 @@ Python API stays stable.
58
58
  The CPU MVP stores arrays in NumPy. Future native backends should introduce:
59
59
 
60
60
  - CPU allocator abstractions for native storage
61
- - CUDA device allocator and stream support
61
+ - Complete CUDA device allocator and stream support
62
62
  - HIP allocator and stream support
63
63
  - SYCL allocator and queue support
64
64