modelstudio 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {modelstudio-0.5.0 → modelstudio-0.6.0}/CMakeLists.txt +10 -0
- {modelstudio-0.5.0/python/modelstudio.egg-info → modelstudio-0.6.0}/PKG-INFO +46 -9
- {modelstudio-0.5.0 → modelstudio-0.6.0}/README.md +45 -8
- modelstudio-0.6.0/benchmarks/bench_cuda_elementwise.py +54 -0
- modelstudio-0.6.0/benchmarks/bench_cuda_matmul.py +52 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_elementwise.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_trace.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/CMakeLists.txt +10 -2
- modelstudio-0.6.0/csrc/backends/cuda/README.md +19 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_backend.cu +28 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_context.cu +37 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_context.hpp +10 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_kernels.hpp +16 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_memory.cu +34 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cuda/cuda_memory.hpp +2 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_stream.cu +13 -0
- modelstudio-0.6.0/csrc/backends/cuda/cuda_stream.hpp +7 -0
- modelstudio-0.6.0/csrc/backends/cuda/kernels/elementwise.cu +27 -0
- modelstudio-0.6.0/csrc/backends/cuda/kernels/matmul.cu +13 -0
- modelstudio-0.6.0/csrc/backends/cuda/kernels/reductions.cu +15 -0
- modelstudio-0.6.0/csrc/bindings/cuda_bindings.cpp +12 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/backend-architecture.md +3 -3
- modelstudio-0.6.0/docs/backend-status.md +37 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/comparison-ops.md +1 -2
- modelstudio-0.6.0/docs/cuda.md +53 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/functional-api.md +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/linalg.md +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/native-backend-roadmap.md +12 -4
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/random.md +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tensor-creation.md +1 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tracing.md +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/training.md +1 -1
- modelstudio-0.6.0/examples/backend_status.py +26 -0
- modelstudio-0.6.0/examples/cuda_mlp_demo.py +45 -0
- modelstudio-0.6.0/examples/cuda_tensor_demo.py +33 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/functional_training.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/random_linalg_demo.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/tracing_demo.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/pyproject.toml +1 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/__init__.py +2 -1
- modelstudio-0.6.0/python/modelstudio/_version.py +1 -0
- modelstudio-0.6.0/python/modelstudio/backends/__init__.py +3 -0
- modelstudio-0.6.0/python/modelstudio/backends/cuda.py +84 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/backends/status.py +19 -3
- modelstudio-0.6.0/python/modelstudio/cuda/__init__.py +13 -0
- modelstudio-0.6.0/python/modelstudio/cuda/device.py +45 -0
- modelstudio-0.6.0/python/modelstudio/cuda/memory.py +21 -0
- modelstudio-0.6.0/python/modelstudio/cuda/streams.py +12 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/linalg.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/comparison.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0/python/modelstudio.egg-info}/PKG-INFO +46 -9
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/SOURCES.txt +26 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/scripts/smoke_test.py +16 -7
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_backend_status.py +7 -3
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_comparison_ops.py +0 -1
- modelstudio-0.6.0/tests/test_cuda_autograd.py +72 -0
- modelstudio-0.6.0/tests/test_cuda_availability.py +77 -0
- modelstudio-0.6.0/tests/test_cuda_memory.py +43 -0
- modelstudio-0.6.0/tests/test_cuda_nn.py +36 -0
- modelstudio-0.6.0/tests/test_cuda_ops.py +93 -0
- modelstudio-0.6.0/tests/test_cuda_tensor.py +52 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_linalg.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_native_cpu_mode.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_public_exports.py +2 -2
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_random_namespace.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_scalar_behavior.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_serialization_hardening.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_trace.py +0 -1
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_version.py +1 -1
- modelstudio-0.5.0/csrc/backends/cuda/README.md +0 -14
- modelstudio-0.5.0/csrc/backends/cuda/cuda_backend.cu +0 -32
- modelstudio-0.5.0/docs/backend-status.md +0 -21
- modelstudio-0.5.0/examples/backend_status.py +0 -16
- modelstudio-0.5.0/python/modelstudio/_version.py +0 -1
- modelstudio-0.5.0/python/modelstudio/backends/__init__.py +0 -4
- {modelstudio-0.5.0 → modelstudio-0.6.0}/LICENSE +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/MANIFEST.in +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_attention.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_conv.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_creation.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_dataloader.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_dropout.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_manipulation.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_matmul.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/benchmarks/bench_mlp.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/cpu_backend.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/cpu_backend.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/add.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/matmul.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/mul.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cpu/kernels/relu.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/cuda/cuda_backend.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/README.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/oneapi_backend.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/oneapi_backend.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/oneapi/sycl_memory.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/README.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/hip_memory.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/rocm_backend.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/backends/rocm/rocm_backend.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/bindings/python_bindings.cpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/device.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/dtype.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/error.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/shape.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/storage.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/core/tensor.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/dispatcher/backend.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/dispatcher/dispatcher.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/csrc/dispatcher/operator_registry.hpp +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/autograd.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/checkpointing.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/data.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/metrics.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/modules.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/nn.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/numpy-interop.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/optimizers.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/randomness.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/releasing.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/serialization.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tensor-api.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/docs/tensor-manipulation.md +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/checkpoint_resume.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/checkpoint_training.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/dropout_batchnorm.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/metrics_demo.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/numpy_interop.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/save_load.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/scheduler_training.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/tiny_transformer.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/train_classifier.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/train_cnn_toy.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/examples/train_mlp.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/engine.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/function.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/autograd/grad_mode.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/graph_capture.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/ir.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/compile/passes.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/data/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/data/dataloader.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/data/dataset.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/device.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/dtypes.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/errors.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/interop.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/metrics/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/metrics/classification.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/activations.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/convolution.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/embedding.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/functional.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/init.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/linear.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/losses.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/module.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/normalization.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/parameter.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/pooling.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/transformer.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/nn/utils.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/creation.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/linalg.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/math.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/movement.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/ops/reductions.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/adamw.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/lr_scheduler.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/optimizer.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/optim/sgd.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/py.typed +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/random.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/runtime/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/runtime/backend.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/runtime/dispatcher.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/serialization.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/storage.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/tensor.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/testing/__init__.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio/testing/gradcheck.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/dependency_links.txt +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/requires.txt +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/python/modelstudio.egg-info/top_level.txt +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/setup.cfg +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_activations_more.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_attention.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_autograd.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_batchnorm.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_buffers.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_checkpoint_helpers.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_clone_copy.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_concat_stack.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_conv.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_creation_more.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_data.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_data_split.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dataloader_seed.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dispatcher.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dropout.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_dtype_conversion.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_embedding.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_functional.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_grad_clip.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_gradcheck.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_indexing.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_indexing_assignment.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_init.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_loss_reductions.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_losses.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_lr_scheduler.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_manipulation_ops.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_metrics.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_module_ergonomics.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_nn.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_norms.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_numpy_interop.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_ops.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_optim.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_optimizer_param_groups.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_optimizer_state.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_pooling.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_random.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_reductions_axis.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_serialization.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_shape_ops.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_state_dict.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_tensor.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_transformer.py +0 -0
- {modelstudio-0.5.0 → modelstudio-0.6.0}/tests/test_unary_ops.py +0 -0
|
@@ -5,6 +5,16 @@ option(MODELSTUDIO_ENABLE_CUDA "Build CUDA backend" OFF)
|
|
|
5
5
|
option(MODELSTUDIO_ENABLE_ROCM "Build ROCm backend" OFF)
|
|
6
6
|
option(MODELSTUDIO_ENABLE_ONEAPI "Build oneAPI backend" OFF)
|
|
7
7
|
|
|
8
|
+
if(MODELSTUDIO_ENABLE_CUDA)
|
|
9
|
+
include(CheckLanguage)
|
|
10
|
+
check_language(CUDA)
|
|
11
|
+
if(NOT CMAKE_CUDA_COMPILER)
|
|
12
|
+
message(FATAL_ERROR "MODELSTUDIO_ENABLE_CUDA=ON requires an NVIDIA CUDA compiler/toolkit, but none was found.")
|
|
13
|
+
endif()
|
|
14
|
+
enable_language(CUDA)
|
|
15
|
+
find_package(CUDAToolkit REQUIRED)
|
|
16
|
+
endif()
|
|
17
|
+
|
|
8
18
|
set(CMAKE_CXX_STANDARD 20)
|
|
9
19
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
10
20
|
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: modelstudio
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: An early-stage AI tensor framework with CPU tensors, autograd, and backend extension scaffolding.
|
|
5
5
|
Author: ModelStudio Contributors
|
|
6
6
|
License-Expression: MIT
|
|
@@ -31,14 +31,14 @@ Dynamic: license-file
|
|
|
31
31
|
|
|
32
32
|
# ModelStudio
|
|
33
33
|
|
|
34
|
-
ModelStudio is an early-stage AI tensor framework. Version `0.5.0` provides a
|
|
34
|
+
ModelStudio is an early-stage AI tensor framework. Version `0.6.0` provides a
|
|
35
35
|
CPU tensor/autograd MVP with neural-network modules, optimizers, serialization,
|
|
36
|
-
data loading, graph tracing metadata, backend status inspection, and small
|
|
37
|
-
LLM-oriented building blocks.
|
|
36
|
+
data loading, graph tracing metadata, backend status inspection, a public CUDA
|
|
37
|
+
availability namespace, and small LLM-oriented building blocks.
|
|
38
38
|
|
|
39
|
-
It is not a PyTorch or TensorFlow replacement.
|
|
40
|
-
CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels are built
|
|
41
|
-
and tested.
|
|
39
|
+
It is not a PyTorch or TensorFlow replacement. The default PyPI package is
|
|
40
|
+
CPU-only. CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels
|
|
41
|
+
are built and tested in hardware-backed environments.
|
|
42
42
|
|
|
43
43
|
## Installation
|
|
44
44
|
|
|
@@ -74,6 +74,24 @@ python -m pip install -e ".[dev]"
|
|
|
74
74
|
| Interop | `asarray`, `from_numpy`, `to_numpy`, and `ms.numpy` |
|
|
75
75
|
| Metrics | accuracy and top-k accuracy |
|
|
76
76
|
| Compiler | Metadata-only tracing plus placeholder IR and passes |
|
|
77
|
+
| CUDA API | Availability, device-count, sync, and memory-status facade; tensor execution is not implemented in the CPU wheel |
|
|
78
|
+
|
|
79
|
+
## Architecture
|
|
80
|
+
|
|
81
|
+
```text
|
|
82
|
+
Python frontend
|
|
83
|
+
-> Tensor, nn, optim, autograd, ops
|
|
84
|
+
-> runtime dispatcher
|
|
85
|
+
-> backend interface
|
|
86
|
+
-> NumPy CPU backend today
|
|
87
|
+
-> optional native CPU / CUDA / ROCm / oneAPI extensions later
|
|
88
|
+
|
|
89
|
+
Native scaffold
|
|
90
|
+
-> core metadata
|
|
91
|
+
-> dispatcher interfaces
|
|
92
|
+
-> CPU kernel prototypes
|
|
93
|
+
-> CUDA, ROCm, oneAPI backend directories
|
|
94
|
+
```
|
|
77
95
|
|
|
78
96
|
## Backend Status
|
|
79
97
|
|
|
@@ -89,7 +107,7 @@ Expected shape:
|
|
|
89
107
|
```python
|
|
90
108
|
{
|
|
91
109
|
"cpu": {"available": True, "native": False},
|
|
92
|
-
"cuda": {"available": False, "reason": "..."},
|
|
110
|
+
"cuda": {"available": False, "built": False, "device_count": 0, "reason": "..."},
|
|
93
111
|
"rocm": {"available": False, "reason": "..."},
|
|
94
112
|
"oneapi": {"available": False, "reason": "..."},
|
|
95
113
|
}
|
|
@@ -100,6 +118,17 @@ raises `ModelStudioBackendUnavailable` unless a future optional native extension
|
|
|
100
118
|
is actually installed. Unsupported accelerator devices fail with
|
|
101
119
|
`ModelStudioBackendUnavailable`.
|
|
102
120
|
|
|
121
|
+
CUDA availability can also be checked through the public namespace:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
print(ms.cuda.is_available())
|
|
125
|
+
print(ms.cuda.device_count())
|
|
126
|
+
print(ms.cuda.memory_summary())
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
In the CPU-only wheel, explicit CUDA tensor requests raise a clear runtime error
|
|
130
|
+
instead of falling back to CPU.
|
|
131
|
+
|
|
103
132
|
## Tensor Example
|
|
104
133
|
|
|
105
134
|
```python
|
|
@@ -195,6 +224,8 @@ python examples/backend_status.py
|
|
|
195
224
|
python examples/tracing_demo.py
|
|
196
225
|
python examples/functional_training.py
|
|
197
226
|
python examples/random_linalg_demo.py
|
|
227
|
+
python examples/cuda_tensor_demo.py
|
|
228
|
+
python examples/cuda_mlp_demo.py
|
|
198
229
|
python benchmarks/bench_matmul.py
|
|
199
230
|
python benchmarks/bench_mlp.py
|
|
200
231
|
python benchmarks/bench_attention.py
|
|
@@ -205,11 +236,14 @@ python benchmarks/bench_creation.py
|
|
|
205
236
|
python benchmarks/bench_manipulation.py
|
|
206
237
|
python benchmarks/bench_elementwise.py
|
|
207
238
|
python benchmarks/bench_trace.py
|
|
239
|
+
python benchmarks/bench_cuda_elementwise.py
|
|
240
|
+
python benchmarks/bench_cuda_matmul.py
|
|
208
241
|
```
|
|
209
242
|
|
|
210
243
|
## Documentation
|
|
211
244
|
|
|
212
245
|
- [Backend status](docs/backend-status.md)
|
|
246
|
+
- [CUDA status](docs/cuda.md)
|
|
213
247
|
- [Tracing](docs/tracing.md)
|
|
214
248
|
- [Functional API](docs/functional-api.md)
|
|
215
249
|
- [Random namespace](docs/random.md)
|
|
@@ -237,5 +271,8 @@ python benchmarks/bench_trace.py
|
|
|
237
271
|
|
|
238
272
|
- Expand tensor and autograd coverage.
|
|
239
273
|
- Wire optional native CPU kernels only after a safe Python extension exists.
|
|
240
|
-
-
|
|
274
|
+
- Build a real optional CUDA package after tensor storage, kernels, bindings,
|
|
275
|
+
and hardware-backed CI are in place.
|
|
276
|
+
- Add tested ROCm and oneAPI packages after CUDA establishes the accelerator
|
|
277
|
+
backend contract.
|
|
241
278
|
- Improve compiler graph capture, analysis passes, and lowering.
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# ModelStudio
|
|
2
2
|
|
|
3
|
-
ModelStudio is an early-stage AI tensor framework. Version `0.5.0` provides a
|
|
3
|
+
ModelStudio is an early-stage AI tensor framework. Version `0.6.0` provides a
|
|
4
4
|
CPU tensor/autograd MVP with neural-network modules, optimizers, serialization,
|
|
5
|
-
data loading, graph tracing metadata, backend status inspection, and small
|
|
6
|
-
LLM-oriented building blocks.
|
|
5
|
+
data loading, graph tracing metadata, backend status inspection, a public CUDA
|
|
6
|
+
availability namespace, and small LLM-oriented building blocks.
|
|
7
7
|
|
|
8
|
-
It is not a PyTorch or TensorFlow replacement.
|
|
9
|
-
CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels are built
|
|
10
|
-
and tested.
|
|
8
|
+
It is not a PyTorch or TensorFlow replacement. The default PyPI package is
|
|
9
|
+
CPU-only. CUDA, ROCm, and oneAPI remain explicit scaffolds until real kernels
|
|
10
|
+
are built and tested in hardware-backed environments.
|
|
11
11
|
|
|
12
12
|
## Installation
|
|
13
13
|
|
|
@@ -43,6 +43,24 @@ python -m pip install -e ".[dev]"
|
|
|
43
43
|
| Interop | `asarray`, `from_numpy`, `to_numpy`, and `ms.numpy` |
|
|
44
44
|
| Metrics | accuracy and top-k accuracy |
|
|
45
45
|
| Compiler | Metadata-only tracing plus placeholder IR and passes |
|
|
46
|
+
| CUDA API | Availability, device-count, sync, and memory-status facade; tensor execution is not implemented in the CPU wheel |
|
|
47
|
+
|
|
48
|
+
## Architecture
|
|
49
|
+
|
|
50
|
+
```text
|
|
51
|
+
Python frontend
|
|
52
|
+
-> Tensor, nn, optim, autograd, ops
|
|
53
|
+
-> runtime dispatcher
|
|
54
|
+
-> backend interface
|
|
55
|
+
-> NumPy CPU backend today
|
|
56
|
+
-> optional native CPU / CUDA / ROCm / oneAPI extensions later
|
|
57
|
+
|
|
58
|
+
Native scaffold
|
|
59
|
+
-> core metadata
|
|
60
|
+
-> dispatcher interfaces
|
|
61
|
+
-> CPU kernel prototypes
|
|
62
|
+
-> CUDA, ROCm, oneAPI backend directories
|
|
63
|
+
```
|
|
46
64
|
|
|
47
65
|
## Backend Status
|
|
48
66
|
|
|
@@ -58,7 +76,7 @@ Expected shape:
|
|
|
58
76
|
```python
|
|
59
77
|
{
|
|
60
78
|
"cpu": {"available": True, "native": False},
|
|
61
|
-
"cuda": {"available": False, "reason": "..."},
|
|
79
|
+
"cuda": {"available": False, "built": False, "device_count": 0, "reason": "..."},
|
|
62
80
|
"rocm": {"available": False, "reason": "..."},
|
|
63
81
|
"oneapi": {"available": False, "reason": "..."},
|
|
64
82
|
}
|
|
@@ -69,6 +87,17 @@ raises `ModelStudioBackendUnavailable` unless a future optional native extension
|
|
|
69
87
|
is actually installed. Unsupported accelerator devices fail with
|
|
70
88
|
`ModelStudioBackendUnavailable`.
|
|
71
89
|
|
|
90
|
+
CUDA availability can also be checked through the public namespace:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
print(ms.cuda.is_available())
|
|
94
|
+
print(ms.cuda.device_count())
|
|
95
|
+
print(ms.cuda.memory_summary())
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
In the CPU-only wheel, explicit CUDA tensor requests raise a clear runtime error
|
|
99
|
+
instead of falling back to CPU.
|
|
100
|
+
|
|
72
101
|
## Tensor Example
|
|
73
102
|
|
|
74
103
|
```python
|
|
@@ -164,6 +193,8 @@ python examples/backend_status.py
|
|
|
164
193
|
python examples/tracing_demo.py
|
|
165
194
|
python examples/functional_training.py
|
|
166
195
|
python examples/random_linalg_demo.py
|
|
196
|
+
python examples/cuda_tensor_demo.py
|
|
197
|
+
python examples/cuda_mlp_demo.py
|
|
167
198
|
python benchmarks/bench_matmul.py
|
|
168
199
|
python benchmarks/bench_mlp.py
|
|
169
200
|
python benchmarks/bench_attention.py
|
|
@@ -174,11 +205,14 @@ python benchmarks/bench_creation.py
|
|
|
174
205
|
python benchmarks/bench_manipulation.py
|
|
175
206
|
python benchmarks/bench_elementwise.py
|
|
176
207
|
python benchmarks/bench_trace.py
|
|
208
|
+
python benchmarks/bench_cuda_elementwise.py
|
|
209
|
+
python benchmarks/bench_cuda_matmul.py
|
|
177
210
|
```
|
|
178
211
|
|
|
179
212
|
## Documentation
|
|
180
213
|
|
|
181
214
|
- [Backend status](docs/backend-status.md)
|
|
215
|
+
- [CUDA status](docs/cuda.md)
|
|
182
216
|
- [Tracing](docs/tracing.md)
|
|
183
217
|
- [Functional API](docs/functional-api.md)
|
|
184
218
|
- [Random namespace](docs/random.md)
|
|
@@ -206,5 +240,8 @@ python benchmarks/bench_trace.py
|
|
|
206
240
|
|
|
207
241
|
- Expand tensor and autograd coverage.
|
|
208
242
|
- Wire optional native CPU kernels only after a safe Python extension exists.
|
|
209
|
-
-
|
|
243
|
+
- Build a real optional CUDA package after tensor storage, kernels, bindings,
|
|
244
|
+
and hardware-backed CI are in place.
|
|
245
|
+
- Add tested ROCm and oneAPI packages after CUDA establishes the accelerator
|
|
246
|
+
backend contract.
|
|
210
247
|
- Improve compiler graph capture, analysis passes, and lowering.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import platform
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
|
|
7
|
+
import modelstudio as ms
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _time_ms(fn: Callable[[], object], warmup: int, iterations: int, *, synchronize: bool) -> float:
|
|
11
|
+
for _ in range(warmup):
|
|
12
|
+
fn()
|
|
13
|
+
if synchronize:
|
|
14
|
+
ms.cuda.synchronize()
|
|
15
|
+
start = time.perf_counter()
|
|
16
|
+
for _ in range(iterations):
|
|
17
|
+
fn()
|
|
18
|
+
if synchronize:
|
|
19
|
+
ms.cuda.synchronize()
|
|
20
|
+
return (time.perf_counter() - start) * 1000.0 / iterations
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main() -> None:
|
|
24
|
+
shape = (1024, 1024)
|
|
25
|
+
warmup = 5
|
|
26
|
+
iterations = 50
|
|
27
|
+
|
|
28
|
+
print(f"Python: {platform.python_version()}")
|
|
29
|
+
print(f"NumPy: {ms.numpy.__version__}")
|
|
30
|
+
print(f"ModelStudio: {ms.__version__}")
|
|
31
|
+
print(f"CUDA: available={ms.cuda.is_available()} device_count={ms.cuda.device_count()}")
|
|
32
|
+
print(f"Shape: {shape}")
|
|
33
|
+
print(f"Warmup: {warmup}")
|
|
34
|
+
print(f"Iterations: {iterations}")
|
|
35
|
+
|
|
36
|
+
if not ms.cuda.is_available():
|
|
37
|
+
print(ms.cuda.memory_summary())
|
|
38
|
+
print("Skipping CUDA elementwise benchmark because CUDA tensor execution is not available.")
|
|
39
|
+
return
|
|
40
|
+
|
|
41
|
+
ms.manual_seed(123)
|
|
42
|
+
x = ms.randn(shape, device="cuda")
|
|
43
|
+
y = ms.randn(shape, device="cuda")
|
|
44
|
+
|
|
45
|
+
add_ms = _time_ms(lambda: x + y, warmup, iterations, synchronize=True)
|
|
46
|
+
relu_ms = _time_ms(lambda: ms.relu(x), warmup, iterations, synchronize=True)
|
|
47
|
+
|
|
48
|
+
print(f"CUDA add avg: {add_ms:.3f} ms")
|
|
49
|
+
print(f"CUDA relu avg: {relu_ms:.3f} ms")
|
|
50
|
+
print(ms.cuda.memory_summary())
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
if __name__ == "__main__":
|
|
54
|
+
main()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import platform
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
|
|
7
|
+
import modelstudio as ms
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _time_ms(fn: Callable[[], object], warmup: int, iterations: int, *, synchronize: bool) -> float:
|
|
11
|
+
for _ in range(warmup):
|
|
12
|
+
fn()
|
|
13
|
+
if synchronize:
|
|
14
|
+
ms.cuda.synchronize()
|
|
15
|
+
start = time.perf_counter()
|
|
16
|
+
for _ in range(iterations):
|
|
17
|
+
fn()
|
|
18
|
+
if synchronize:
|
|
19
|
+
ms.cuda.synchronize()
|
|
20
|
+
return (time.perf_counter() - start) * 1000.0 / iterations
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main() -> None:
|
|
24
|
+
shape = (512, 512)
|
|
25
|
+
warmup = 3
|
|
26
|
+
iterations = 20
|
|
27
|
+
|
|
28
|
+
print(f"Python: {platform.python_version()}")
|
|
29
|
+
print(f"NumPy: {ms.numpy.__version__}")
|
|
30
|
+
print(f"ModelStudio: {ms.__version__}")
|
|
31
|
+
print(f"CUDA: available={ms.cuda.is_available()} device_count={ms.cuda.device_count()}")
|
|
32
|
+
print(f"Shape: {shape} x {shape}")
|
|
33
|
+
print(f"Warmup: {warmup}")
|
|
34
|
+
print(f"Iterations: {iterations}")
|
|
35
|
+
|
|
36
|
+
if not ms.cuda.is_available():
|
|
37
|
+
print(ms.cuda.memory_summary())
|
|
38
|
+
print("Skipping CUDA matmul benchmark because CUDA tensor execution is not available.")
|
|
39
|
+
return
|
|
40
|
+
|
|
41
|
+
ms.manual_seed(123)
|
|
42
|
+
a = ms.randn(shape, device="cuda")
|
|
43
|
+
b = ms.randn(shape, device="cuda")
|
|
44
|
+
|
|
45
|
+
matmul_ms = _time_ms(lambda: a @ b, warmup, iterations, synchronize=True)
|
|
46
|
+
|
|
47
|
+
print(f"CUDA matmul avg: {matmul_ms:.3f} ms")
|
|
48
|
+
print(ms.cuda.memory_summary())
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
if __name__ == "__main__":
|
|
52
|
+
main()
|
|
@@ -9,9 +9,17 @@ add_library(modelstudio_native STATIC
|
|
|
9
9
|
target_include_directories(modelstudio_native PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
|
10
10
|
|
|
11
11
|
if(MODELSTUDIO_ENABLE_CUDA)
  # PUBLIC: the definition is applied to modelstudio_native and to every
  # target that links against it.
  target_compile_definitions(modelstudio_native PUBLIC MODELSTUDIO_ENABLE_CUDA=1)

  # CUDA backend translation units, added only when the option is enabled.
  target_sources(modelstudio_native
    PRIVATE
      backends/cuda/cuda_backend.cu
      backends/cuda/cuda_context.cu
      backends/cuda/cuda_memory.cu
      backends/cuda/cuda_stream.cu
      backends/cuda/kernels/elementwise.cu
      backends/cuda/kernels/reductions.cu
      backends/cuda/kernels/matmul.cu
  )

  # NOTE(review): the CUDA:: imported targets are presumably provided by a
  # find_package(CUDAToolkit) call in a parent CMakeLists.txt -- confirm.
  target_link_libraries(modelstudio_native
    PUBLIC
      CUDA::cudart
      CUDA::cublas
  )
endif()
|
|
16
24
|
|
|
17
25
|
if(MODELSTUDIO_ENABLE_ROCM)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# CUDA Backend
|
|
2
|
+
|
|
3
|
+
This directory is scaffolding for a future NVIDIA CUDA backend.
|
|
4
|
+
|
|
5
|
+
Current status:
|
|
6
|
+
- Not built by default.
|
|
7
|
+
- Enabled only with `MODELSTUDIO_ENABLE_CUDA=ON`.
|
|
8
|
+
- Python CPU users do not import or depend on CUDA artifacts.
|
|
9
|
+
- Context, allocator, stream, and kernel entry-point files are present as
|
|
10
|
+
scaffolding only.
|
|
11
|
+
|
|
12
|
+
Implementation path:
|
|
13
|
+
1. Track allocation sizes and ownership in the CUDA allocator.
|
|
14
|
+
2. Add device tensor storage and shape/stride views.
|
|
15
|
+
3. Replace placeholder kernel entry points with tested CUDA kernels.
|
|
16
|
+
4. Bind CUDA runtime functions and tensors into Python.
|
|
17
|
+
5. Register the native backend with the dispatcher only when all required ops
|
|
18
|
+
are implemented.
|
|
19
|
+
6. Ship as an optional package such as `modelstudio-cuda`.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_backend.hpp"
|
|
2
|
+
|
|
3
|
+
#include "backends/cuda/cuda_kernels.hpp"
|
|
4
|
+
#include "core/error.hpp"
|
|
5
|
+
|
|
6
|
+
namespace modelstudio::cuda {
|
|
7
|
+
|
|
8
|
+
Tensor CUDABackend::empty(const Shape&, DType) {
|
|
9
|
+
throw Error("CUDA tensor allocation is scaffolded but not wired into Python yet");
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
Tensor CUDABackend::add(const Tensor& lhs, const Tensor& rhs) {
|
|
13
|
+
return add_kernel(lhs, rhs);
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
Tensor CUDABackend::mul(const Tensor& lhs, const Tensor& rhs) {
|
|
17
|
+
return mul_kernel(lhs, rhs);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
Tensor CUDABackend::matmul(const Tensor& lhs, const Tensor& rhs) {
|
|
21
|
+
return matmul_cublas(lhs, rhs);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
Tensor CUDABackend::relu(const Tensor& input) {
|
|
25
|
+
return relu_kernel(input);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_context.hpp"
|
|
2
|
+
|
|
3
|
+
#include <cuda_runtime_api.h>
|
|
4
|
+
|
|
5
|
+
#include <string>
|
|
6
|
+
|
|
7
|
+
#include "core/error.hpp"
|
|
8
|
+
|
|
9
|
+
namespace modelstudio::cuda {
|
|
10
|
+
|
|
11
|
+
void check_cuda(int status, const char* operation) {
|
|
12
|
+
if (status != cudaSuccess) {
|
|
13
|
+
throw Error(std::string(operation) + " failed: " + cudaGetErrorString(static_cast<cudaError_t>(status)));
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
int device_count() {
|
|
18
|
+
int count = 0;
|
|
19
|
+
auto status = cudaGetDeviceCount(&count);
|
|
20
|
+
if (status == cudaErrorNoDevice || status == cudaErrorInsufficientDriver) {
|
|
21
|
+
return 0;
|
|
22
|
+
}
|
|
23
|
+
check_cuda(status, "cudaGetDeviceCount");
|
|
24
|
+
return count;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
int current_device() {
|
|
28
|
+
int device = 0;
|
|
29
|
+
check_cuda(cudaGetDevice(&device), "cudaGetDevice");
|
|
30
|
+
return device;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
void set_device(int index) {
|
|
34
|
+
check_cuda(cudaSetDevice(index), "cudaSetDevice");
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "core/tensor.hpp"
|
|
4
|
+
|
|
5
|
+
namespace modelstudio::cuda {
|
|
6
|
+
|
|
7
|
+
Tensor add_kernel(const Tensor& lhs, const Tensor& rhs);
|
|
8
|
+
Tensor sub_kernel(const Tensor& lhs, const Tensor& rhs);
|
|
9
|
+
Tensor mul_kernel(const Tensor& lhs, const Tensor& rhs);
|
|
10
|
+
Tensor div_kernel(const Tensor& lhs, const Tensor& rhs);
|
|
11
|
+
Tensor relu_kernel(const Tensor& input);
|
|
12
|
+
Tensor sum_kernel(const Tensor& input);
|
|
13
|
+
Tensor mean_kernel(const Tensor& input);
|
|
14
|
+
Tensor matmul_cublas(const Tensor& lhs, const Tensor& rhs);
|
|
15
|
+
|
|
16
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_memory.hpp"
|
|
2
|
+
|
|
3
|
+
#include <cuda_runtime_api.h>
|
|
4
|
+
|
|
5
|
+
#include <atomic>
|
|
6
|
+
#include <cstddef>
|
|
7
|
+
|
|
8
|
+
#include "backends/cuda/cuda_context.hpp"
|
|
9
|
+
|
|
10
|
+
namespace modelstudio::cuda {
|
|
11
|
+
|
|
12
|
+
namespace {
|
|
13
|
+
std::atomic<unsigned long long> g_allocated_bytes{0};
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
void* CUDAMemoryAllocator::allocate(unsigned long long bytes) {
|
|
17
|
+
void* ptr = nullptr;
|
|
18
|
+
check_cuda(cudaMalloc(&ptr, static_cast<std::size_t>(bytes)), "cudaMalloc");
|
|
19
|
+
g_allocated_bytes.fetch_add(bytes, std::memory_order_relaxed);
|
|
20
|
+
return ptr;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
void CUDAMemoryAllocator::deallocate(void* ptr) {
|
|
24
|
+
if (ptr == nullptr) {
|
|
25
|
+
return;
|
|
26
|
+
}
|
|
27
|
+
check_cuda(cudaFree(ptr), "cudaFree");
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
unsigned long long allocated_bytes() {
|
|
31
|
+
return g_allocated_bytes.load(std::memory_order_relaxed);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_stream.hpp"
|
|
2
|
+
|
|
3
|
+
#include <cuda_runtime_api.h>
|
|
4
|
+
|
|
5
|
+
#include "backends/cuda/cuda_context.hpp"
|
|
6
|
+
|
|
7
|
+
namespace modelstudio::cuda {
|
|
8
|
+
|
|
9
|
+
void synchronize_device() {
|
|
10
|
+
check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_kernels.hpp"
|
|
2
|
+
|
|
3
|
+
#include "core/error.hpp"
|
|
4
|
+
|
|
5
|
+
namespace modelstudio::cuda {
|
|
6
|
+
|
|
7
|
+
Tensor add_kernel(const Tensor&, const Tensor&) {
|
|
8
|
+
throw Error("CUDA add kernel is scaffolded but not wired into Python yet");
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
Tensor sub_kernel(const Tensor&, const Tensor&) {
|
|
12
|
+
throw Error("CUDA sub kernel is scaffolded but not wired into Python yet");
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
Tensor mul_kernel(const Tensor&, const Tensor&) {
|
|
16
|
+
throw Error("CUDA mul kernel is scaffolded but not wired into Python yet");
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
Tensor div_kernel(const Tensor&, const Tensor&) {
|
|
20
|
+
throw Error("CUDA div kernel is scaffolded but not wired into Python yet");
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
Tensor relu_kernel(const Tensor&) {
|
|
24
|
+
throw Error("CUDA relu kernel is scaffolded but not wired into Python yet");
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_kernels.hpp"
|
|
2
|
+
|
|
3
|
+
#include <cublas_v2.h>
|
|
4
|
+
|
|
5
|
+
#include "core/error.hpp"
|
|
6
|
+
|
|
7
|
+
namespace modelstudio::cuda {
|
|
8
|
+
|
|
9
|
+
Tensor matmul_cublas(const Tensor&, const Tensor&) {
|
|
10
|
+
throw Error("CUDA cuBLAS matmul is scaffolded but not wired into Python yet");
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_kernels.hpp"
|
|
2
|
+
|
|
3
|
+
#include "core/error.hpp"
|
|
4
|
+
|
|
5
|
+
namespace modelstudio::cuda {
|
|
6
|
+
|
|
7
|
+
Tensor sum_kernel(const Tensor&) {
|
|
8
|
+
throw Error("CUDA sum reduction is scaffolded but not wired into Python yet");
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
Tensor mean_kernel(const Tensor&) {
|
|
12
|
+
throw Error("CUDA mean reduction is scaffolded but not wired into Python yet");
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
} // namespace modelstudio::cuda
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#include "backends/cuda/cuda_context.hpp"
|
|
2
|
+
#include "backends/cuda/cuda_memory.hpp"
|
|
3
|
+
#include "backends/cuda/cuda_stream.hpp"
|
|
4
|
+
|
|
5
|
+
namespace modelstudio::bindings {

// Reserved hook for registering the CUDA API with the Python extension.
// This file is not part of the CPU-only wheel build. CUDA bindings are meant
// to be enabled only when MODELSTUDIO_ENABLE_CUDA=ON and once a binding layer
// exists. Until then the function is intentionally a no-op.
void register_cuda_bindings_placeholder() {
}

}  // namespace modelstudio::bindings
|
|
@@ -16,7 +16,7 @@ Tensor API
|
|
|
16
16
|
| Backend | Status |
|
|
17
17
|
| --- | --- |
|
|
18
18
|
| CPU | Working MVP backed by NumPy |
|
|
19
|
-
| CUDA |
|
|
19
|
+
| CUDA | Public status namespace plus native scaffold; no tensor execution in the CPU wheel |
|
|
20
20
|
| ROCm | Scaffold only |
|
|
21
21
|
| oneAPI | Scaffold only |
|
|
22
22
|
|
|
@@ -46,7 +46,7 @@ The native scaffolding under `csrc/` mirrors the Python runtime:
|
|
|
46
46
|
- `csrc/core`: dtype, device, shape, tensor metadata, storage
|
|
47
47
|
- `csrc/dispatcher`: backend interface and operator registry
|
|
48
48
|
- `csrc/backends/cpu`: native CPU backend and kernels
|
|
49
|
-
- `csrc/backends/cuda`: CUDA
|
|
49
|
+
- `csrc/backends/cuda`: CUDA context, memory, stream, and kernel scaffolds
|
|
50
50
|
- `csrc/backends/rocm`: ROCm/HIP placeholders
|
|
51
51
|
- `csrc/backends/oneapi`: oneAPI/SYCL placeholders
|
|
52
52
|
|
|
@@ -58,7 +58,7 @@ Python API stays stable.
|
|
|
58
58
|
The CPU MVP stores arrays in NumPy. Future native backends should introduce:
|
|
59
59
|
|
|
60
60
|
- CPU allocator abstractions for native storage
|
|
61
|
-
- CUDA device allocator and stream support
|
|
61
|
+
- Complete CUDA device allocator and stream support
|
|
62
62
|
- HIP allocator and stream support
|
|
63
63
|
- SYCL allocator and queue support
|
|
64
64
|
|