gpufl 1.0.2__tar.gz → 1.1.0rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/.dockerignore +4 -4
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/.github/workflows/build.yml +20 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/.github/workflows/release.yml +1 -1
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/.gitignore +28 -1
- gpufl-1.1.0rc2/CHANGELOG.md +219 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/CMakeLists.txt +52 -30
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.monitor.amd +3 -11
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/PKG-INFO +104 -70
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/README.md +103 -69
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/README.md +3 -3
- gpufl-1.1.0rc2/benchmark/profile_pytorch_kernels.py +193 -0
- gpufl-1.1.0rc2/benchmark/profile_pytorch_via_gpufl.py +304 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/run_benchmark.py +5 -8
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/daemon/monitor/main.cpp +6 -2
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/gpufl_scope_demo.cpp +1 -1
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/CMakeLists.txt +11 -1
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/block_style_example.cu +2 -2
- gpufl-1.1.0rc2/example/cuda/manykernel_benchmark.cu +549 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/memory_coalescing_demo.cu +2 -2
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/occupancy_demo.cu +1 -1
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/sass_divergence_demo.cu +37 -9
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/system_monitor.cu +1 -1
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/vector_add_benchmark.cu +2 -2
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/02_numba_cuda.py +31 -8
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/03_pytorch_benchmark.py +27 -10
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocm_collector.cpp +2 -1
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocprofiler_backend.cpp +8 -2
- gpufl-1.1.0rc2/include/gpufl/backends/nvidia/cuda_cleanup_handler.cpp +93 -0
- gpufl-1.1.0rc2/include/gpufl/backends/nvidia/cuda_cleanup_handler.hpp +27 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_backend.cpp +505 -56
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_backend.hpp +94 -6
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_common.hpp +43 -10
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_utils.cpp +10 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_utils.hpp +6 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp +80 -137
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.hpp +6 -0
- gpufl-1.1.0rc2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +194 -0
- gpufl-1.1.0rc2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +93 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/profiling_engine.hpp +15 -0
- gpufl-1.1.0rc2/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +734 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.hpp +31 -4
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/kernel_launch_handler.cpp +148 -21
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/mem_transfer_handler.cpp +9 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/resource_handler.cpp +3 -3
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/synchronization_handler.cpp +8 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/config_file_loader.cpp +9 -6
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/debug_logger.hpp +12 -10
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/dictionary_manager.cpp +94 -24
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/events.hpp +29 -3
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/gpufl.cpp +282 -123
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/host_info.hpp +3 -3
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/json/json.cpp +33 -3
- gpufl-1.1.0rc2/include/gpufl/core/logger/file_compressor.cpp +55 -0
- gpufl-1.1.0rc2/include/gpufl/core/logger/file_log_sink.cpp +231 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/file_log_sink.hpp +15 -4
- gpufl-1.1.0rc2/include/gpufl/core/logger/log_rotator.cpp +105 -0
- gpufl-1.1.0rc2/include/gpufl/core/logger/log_rotator.hpp +65 -0
- gpufl-1.1.0rc2/include/gpufl/core/logger/logger.cpp +60 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/logger.hpp +19 -3
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/batch_models.cpp +8 -3
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/lifecycle_model.cpp +22 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/lifecycle_model.hpp +8 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor.cpp +12 -7
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor.hpp +91 -10
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor_adapter.cpp +11 -0
- gpufl-1.1.0rc2/include/gpufl/core/remote_config.cpp +118 -0
- gpufl-1.1.0rc2/include/gpufl/core/remote_config.hpp +39 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/sampler.cpp +69 -29
- gpufl-1.1.0rc2/include/gpufl/core/sampler.hpp +117 -0
- gpufl-1.1.0rc2/include/gpufl/gpufl.hpp +466 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/text_report.cpp +105 -8
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/text_report.hpp +13 -0
- gpufl-1.1.0rc2/include/gpufl/upload/upload_logs.cpp +1503 -0
- gpufl-1.1.0rc2/include/gpufl/upload/upload_logs.hpp +243 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/pyproject.toml +8 -1
- gpufl-1.1.0rc2/python/bindings.cpp +329 -0
- gpufl-1.1.0rc2/python/gpufl/__init__.py +1073 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/analyzer/analyzer.py +143 -24
- gpufl-1.1.0rc2/python/gpufl/cli.py +178 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/report/text_report.py +141 -33
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/reader.py +36 -2
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/visualizer.py +4 -4
- gpufl-1.1.0rc2/run_tests.py +309 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/CMakeLists.txt +56 -14
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_engine_coverage.cpp +35 -13
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/log_utils.cpp +76 -19
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_analyzer.cpp +2 -2
- gpufl-1.1.0rc2/tests/core/test_api_path_routing.cpp +50 -0
- gpufl-1.1.0rc2/tests/core/test_bench_invoker.cpp +139 -0
- gpufl-1.1.0rc2/tests/core/test_disabled.cpp +171 -0
- gpufl-1.1.0rc2/tests/core/test_sampler.cpp +195 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_wire_contract.cpp +45 -2
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/python/test_bindings.py +13 -6
- gpufl-1.1.0rc2/tests/python/test_continuous_system_sampling.py +192 -0
- gpufl-1.1.0rc2/tests/python/test_disabled.py +178 -0
- gpufl-1.1.0rc2/tests/python/test_scope_iterable.py +308 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/run_engine_coverage.ps1 +3 -3
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/run_engine_coverage.sh +3 -3
- gpufl-1.1.0rc2/tests/upload/test_upload_logs.cpp +1029 -0
- gpufl-1.1.0rc2/tests/verify_pipeline.py +114 -0
- gpufl-1.0.2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +0 -70
- gpufl-1.0.2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +0 -65
- gpufl-1.0.2/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +0 -421
- gpufl-1.0.2/include/gpufl/core/logger/file_compressor.cpp +0 -44
- gpufl-1.0.2/include/gpufl/core/logger/file_log_sink.cpp +0 -151
- gpufl-1.0.2/include/gpufl/core/logger/http_log_sink.cpp +0 -476
- gpufl-1.0.2/include/gpufl/core/logger/http_log_sink.hpp +0 -206
- gpufl-1.0.2/include/gpufl/core/logger/log_rotator.cpp +0 -65
- gpufl-1.0.2/include/gpufl/core/logger/log_rotator.hpp +0 -32
- gpufl-1.0.2/include/gpufl/core/logger/logger.cpp +0 -47
- gpufl-1.0.2/include/gpufl/core/remote_config.cpp +0 -276
- gpufl-1.0.2/include/gpufl/core/remote_config.hpp +0 -60
- gpufl-1.0.2/include/gpufl/core/sampler.hpp +0 -63
- gpufl-1.0.2/include/gpufl/gpufl.hpp +0 -244
- gpufl-1.0.2/python/bindings.cpp +0 -189
- gpufl-1.0.2/python/gpufl/__init__.py +0 -211
- gpufl-1.0.2/tests/core/test_api_path_routing.cpp +0 -213
- gpufl-1.0.2/tests/core/test_http_log_sink.cpp +0 -409
- gpufl-1.0.2/tests/python/test_remote_upload_smoke.py +0 -185
- gpufl-1.0.2/tests/verify_pipeline.py +0 -102
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/.clang-format +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/.github/pull_request_template.md +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/CONTRIBUTING.md +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.demo +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.monitor +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.monitor.supervisord.conf +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/LICENSE +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/THIRD-PARTY-NOTICES.txt +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/cuda_gemm.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/pytorch_train.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/build.sh +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/daemon/README.md +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/daemon/monitor/CMakeLists.txt +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/docker-compose.monitor.amd.yml +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/docker-compose.monitor.yml +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/CMakeLists.txt +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/README.md +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/check_device.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/vector_add_benchmark.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/check_conflict.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/check_device.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/cupti_basic.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/cupti_pc_sampling.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/list_sass_metrics.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/test_occupancy.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/01_basic.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/analyzer/01_analyzer_sample.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/requirements.txt +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/viz/01_plot_memory_timeline.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/viz/02_plot_stress_timeline.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/images/Screenshot1.png +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/images/Screenshot2.png +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/engine/amd_profiling_engine.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/engine/dispatch_counter_engine.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/hip_static_collector.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/hip_static_collector.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/monitor_adapter_amd.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/monitor_adapter_amd.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocm_collector.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocprofiler_backend.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/host_collector.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cuda_collector.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cuda_collector.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cuda_feature_guards.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/range_profiler_engine.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/kernel_launch_handler.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/mem_transfer_handler.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/nvml_collector.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/nvml_collector.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/resource_handler.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/sampler/cupti_sass.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/sampler/cupti_sass.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/synchronization_handler.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/activity_record.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/backend_factory.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/backend_factory.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/backend_interfaces.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/batch_buffer.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/common.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/common.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/config_file_loader.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/debug_logger.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/dictionary_manager.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/host_info.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/itanium_demangle.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/itanium_demangle.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/json/json.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/file_compressor.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/log_sink.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/batch_models.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/graph_launch_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/graph_launch_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/kernel_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/kernel_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memcpy_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memcpy_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memory_alloc_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memory_alloc_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/model_utils.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/nvtx_marker_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/nvtx_marker_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/perf_metric_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/perf_metric_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/profile_sample_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/profile_sample_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/scope_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/scope_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/serializable.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/synchronization_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/synchronization_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/system_event_model.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/system_event_model.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor_adapter.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor_backend.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/ring_buffer.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/runtime.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/runtime.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/sass_compressor.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/sass_compressor.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/scope_registry.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/scope_registry.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stack_registry.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stack_trace.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stack_trace.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stream_handle.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/trace_type.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/version.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/hint_engine.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/hint_engine.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/.gitignore +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/analyzer/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/cupy/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/jax/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/numba/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/report/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/dispatch.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/profile.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/stack.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/trace_import.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/triton/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/utils.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/__init__.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/timeline.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/scripts/docker-demo-loop.sh +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/scripts/windows/run-monitor-local.bat +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/amd/test_rocm_collector.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_cuda_collector.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_nvidia_backend.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_nvml_collector.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/log_utils.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/test_kernel.cu +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/test_kernel.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/test_utils.hpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_batch_models.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_itanium_demangle.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_monitor.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/main_test_runner.cpp +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/python/conftest.py +0 -0
- {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/python/test_analyzer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
# Python / notebooks —
|
|
2
|
-
python/
|
|
3
|
-
example/python/
|
|
1
|
+
# Python / notebooks — uncomment if building Python wheels inside Docker
|
|
2
|
+
# python/
|
|
3
|
+
# example/python/
|
|
4
4
|
**/.Trash-*
|
|
5
5
|
**/__pycache__/
|
|
6
6
|
**/*.pyc
|
|
@@ -15,4 +15,4 @@ build/
|
|
|
15
15
|
.git/
|
|
16
16
|
.idea/
|
|
17
17
|
.vscode/
|
|
18
|
-
*.md
|
|
18
|
+
# *.md
|
|
@@ -3,8 +3,28 @@ name: Build GPUFl Client
|
|
|
3
3
|
on:
|
|
4
4
|
push:
|
|
5
5
|
branches: [ "main" ]
|
|
6
|
+
paths-ignore:
|
|
7
|
+
- '**.md'
|
|
8
|
+
- 'docs/**'
|
|
9
|
+
- 'Dockerfile*'
|
|
10
|
+
- '**/Dockerfile'
|
|
11
|
+
- '**/Dockerfile.*'
|
|
12
|
+
- 'LICENSE'
|
|
13
|
+
- 'THIRD-PARTY-NOTICES.txt'
|
|
14
|
+
- '.gitignore'
|
|
15
|
+
- 'images/**'
|
|
6
16
|
pull_request:
|
|
7
17
|
branches: [ "main" ]
|
|
18
|
+
paths-ignore:
|
|
19
|
+
- '**.md'
|
|
20
|
+
- 'docs/**'
|
|
21
|
+
- 'Dockerfile*'
|
|
22
|
+
- '**/Dockerfile'
|
|
23
|
+
- '**/Dockerfile.*'
|
|
24
|
+
- 'LICENSE'
|
|
25
|
+
- 'THIRD-PARTY-NOTICES.txt'
|
|
26
|
+
- '.gitignore'
|
|
27
|
+
- 'images/**'
|
|
8
28
|
|
|
9
29
|
jobs:
|
|
10
30
|
build:
|
|
@@ -148,7 +148,7 @@ jobs:
|
|
|
148
148
|
# quay.io/pypa/manylinux_2_28_x86_64 image (OpenSSL 3.5.5).
|
|
149
149
|
CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF -DOPENSSL_INCLUDE_DIR=/usr/include/openssl3 -DOPENSSL_SSL_LIBRARY=/usr/lib64/openssl3/libssl.so -DOPENSSL_CRYPTO_LIBRARY=/usr/lib64/openssl3/libcrypto.so'"
|
|
150
150
|
# Windows build needs the OpenSSL install path so find_package(OpenSSL)
|
|
151
|
-
# in CMakeLists.txt succeeds, otherwise HTTPS upload (
|
|
151
|
+
# in CMakeLists.txt succeeds, otherwise HTTPS upload (gpufl::uploadLogs)
|
|
152
152
|
# silently falls back to HTTP-only — see openssl-windows.html in the
|
|
153
153
|
# manual repo for the user-facing story. CIBW_BEFORE_ALL_WINDOWS
|
|
154
154
|
# installs choco's openssl package into this path.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
###
|
|
1
|
+
### ai
|
|
2
2
|
.claude/
|
|
3
|
+
.junie/
|
|
3
4
|
|
|
4
5
|
### idea
|
|
5
6
|
.idea/**
|
|
@@ -14,6 +15,7 @@ wget-log*
|
|
|
14
15
|
|
|
15
16
|
### docker
|
|
16
17
|
example/python/docker/**/
|
|
18
|
+
dist/
|
|
17
19
|
|
|
18
20
|
### C++ template
|
|
19
21
|
# Prerequisites
|
|
@@ -89,3 +91,28 @@ example/python/docker/**/
|
|
|
89
91
|
*.hex
|
|
90
92
|
|
|
91
93
|
*.log
|
|
94
|
+
|
|
95
|
+
### Python
|
|
96
|
+
# Byte-compiled / optimized files
|
|
97
|
+
__pycache__/
|
|
98
|
+
*.py[cod]
|
|
99
|
+
*$py.class
|
|
100
|
+
|
|
101
|
+
# Test / coverage caches
|
|
102
|
+
.pytest_cache/
|
|
103
|
+
.mypy_cache/
|
|
104
|
+
.ruff_cache/
|
|
105
|
+
.coverage
|
|
106
|
+
.coverage.*
|
|
107
|
+
htmlcov/
|
|
108
|
+
coverage.xml
|
|
109
|
+
|
|
110
|
+
# Packaging / distribution
|
|
111
|
+
*.egg-info/
|
|
112
|
+
*.egg
|
|
113
|
+
.eggs/
|
|
114
|
+
|
|
115
|
+
# Virtualenvs
|
|
116
|
+
.venv/
|
|
117
|
+
venv/
|
|
118
|
+
env/
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `gpufl-client` are documented here. Format
|
|
4
|
+
inspired by [Keep a Changelog](https://keepachangelog.com/en/1.1.0/);
|
|
5
|
+
versioning follows PEP 440 for the Python wheel and semver-style
|
|
6
|
+
`MAJOR.MINOR.PATCH` for the C++ library.
|
|
7
|
+
|
|
8
|
+
## [1.1.0] — Unreleased
|
|
9
|
+
|
|
10
|
+
Currently validating as **`1.1.0rc2`**. Once it survives a full smoke
|
|
11
|
+
cycle in dev + a sample PyTorch run + the example Dockerfile build,
|
|
12
|
+
the `rc2` suffix gets dropped to ship as `1.1.0`.
|
|
13
|
+
|
|
14
|
+
### Breaking changes
|
|
15
|
+
|
|
16
|
+
#### `HttpLogSink` removed — upload is now a separate post-shutdown step
|
|
17
|
+
|
|
18
|
+
The in-process `HttpLogSink` that POSTed every NDJSON event live
|
|
19
|
+
during a session has been deleted. Network failures during the
|
|
20
|
+
workload could leak into the GPU job's exit code, and per-event HTTP
|
|
21
|
+
added measurable jitter to PyTorch training runs. Upload now happens
|
|
22
|
+
as an explicit step after `gpufl::shutdown()` returns.
|
|
23
|
+
|
|
24
|
+
For Python customers, the migration is **soft** — `remote_upload=True`
|
|
25
|
+
still works in v1.1 as a deprecation shim (see Deprecations below).
|
|
26
|
+
For pure-C++ customers who `#include`'d the header directly, the
|
|
27
|
+
break is a compile error.
|
|
28
|
+
|
|
29
|
+
| Surface | Before (v1.0.x) | New (v1.1+) | v1.1 backward-compat behavior |
|
|
30
|
+
|---|---|---|---|
|
|
31
|
+
| Python `init(remote_upload=True)` | Live HttpLogSink during session | `with gpufl.session(...)` or `gpufl.upload_logs(...)` after shutdown | **Still works** — `DeprecationWarning` at init + `atexit` handler that calls `upload_logs()` at interpreter exit |
|
|
32
|
+
| C++ `opts.remote_upload = true;` | Live HttpLogSink during session | `gpufl::uploadLogs(uopts)` after `shutdown()` | **Still works** — deprecation log at init + auto-call to `gpufl::uploadLogs()` at the end of `gpufl::shutdown()` (shutdown now blocks until upload completes) |
|
|
33
|
+
| Env var `GPUFL_REMOTE_UPLOAD=1` | Live HttpLogSink during session | `gpufl.upload_logs()` post-shutdown | **Still works** — routes through the Python shim above |
|
|
34
|
+
| `#include "gpufl/core/logger/http_log_sink.hpp"` | The header | gone | **Compile error** — drop the include |
|
|
35
|
+
|
|
36
|
+
See [docs/getting-started/sending-data.md](docs/getting-started/sending-data.md)
|
|
37
|
+
for the full migration guide.
|
|
38
|
+
|
|
39
|
+
### Deprecations (scheduled for v1.2 removal)
|
|
40
|
+
|
|
41
|
+
| Field / kwarg | Status in v1.1 | What to use instead |
|
|
42
|
+
|---|---|---|
|
|
43
|
+
| `InitOptions::remote_upload` (Python kwarg + C++ field) | DeprecationWarning + atexit shim that calls `upload_logs()` at interpreter exit | `with gpufl.session(...)` or call `gpufl.upload_logs()` explicitly after `shutdown()` |
|
|
44
|
+
| `InitOptions::backend_url` | Still functional; read by the version-discovery probe and stored for `upload_logs()` to read back | Pass `backend_url` directly to `UploadOptions` / `gpufl.upload_logs()` |
|
|
45
|
+
| `InitOptions::api_key` | Same as `backend_url` | Pass `api_key` directly to `UploadOptions` / `gpufl.upload_logs()` |
|
|
46
|
+
| `GPUFL_REMOTE_UPLOAD` env var | Still read; routes to the Python atexit shim | Drop from container manifests / start scripts |
|
|
47
|
+
|
|
48
|
+
All three `InitOptions` fields and the env var ship in v1.1 to keep the migration painless and will
|
|
49
|
+
be removed together in v1.2 — at which point creds live exclusively on
|
|
50
|
+
`UploadOptions` and `gpufl::init()` stops touching network config
|
|
51
|
+
entirely.
|
|
52
|
+
|
|
53
|
+
### Breaking changes (cont.)
|
|
54
|
+
|
|
55
|
+
#### `sampling_auto_start` renamed to `continuous_system_sampling`
|
|
56
|
+
|
|
57
|
+
The old name only described init-time behavior. The new flag covers
|
|
58
|
+
the full policy — the semantics also got fixed (see Bug fixes).
|
|
59
|
+
|
|
60
|
+
- **Python**: old kwarg still accepted for this release with a
|
|
61
|
+
`DeprecationWarning`. Will be removed in the next release.
|
|
62
|
+
- **C++**: hard rename. Compile error points at the call site with
|
|
63
|
+
a clear "no member named 'sampling_auto_start'" message.
|
|
64
|
+
|
|
65
|
+
### Added
|
|
66
|
+
|
|
67
|
+
#### Deferred upload — `gpufl.upload_logs()` / `gpufl::uploadLogs()`
|
|
68
|
+
|
|
69
|
+
A new module under `include/gpufl/upload/`. Reads the session's
|
|
70
|
+
NDJSON files post-shutdown, POSTs each event to the existing
|
|
71
|
+
`/api/v1/events/{eventType}` backend endpoints. Never throws on
|
|
72
|
+
network errors; returns an `UploadResult` with `.success`,
|
|
73
|
+
`.events_uploaded`, `.warnings`, etc.
|
|
74
|
+
|
|
75
|
+
Python orchestration via context manager:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
with gpufl.session(app_name="train",
|
|
79
|
+
backend_url="https://api.gpuflight.com",
|
|
80
|
+
api_key="gpfl_xxxxx"):
|
|
81
|
+
train_one_epoch()
|
|
82
|
+
# On __exit__: shutdown() then upload_logs() — automatic.
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
#### `gpufl upload` CLI
|
|
86
|
+
|
|
87
|
+
Post-mortem / ad-hoc shipping tool. Registered via
|
|
88
|
+
`[project.scripts]` in `pyproject.toml`:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
gpufl upload /tmp/runs/train --backend-url ... --api-key ...
|
|
92
|
+
gpufl upload /tmp/runs/train --session-id <uuid>
|
|
93
|
+
gpufl upload /tmp/runs/train --all-sessions
|
|
94
|
+
gpufl upload /tmp/runs/train --force # bypass cursor check
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Default behavior uploads only the **latest** session found in the
|
|
98
|
+
directory (most recent `job_start.ts_ns`). `--session-id` picks a
|
|
99
|
+
specific one; `--all-sessions` ships every session present.
|
|
100
|
+
|
|
101
|
+
#### Session-aware cursor file
|
|
102
|
+
|
|
103
|
+
`.gpufl-upload-cursor.json` (in the log directory) tracks which
|
|
104
|
+
sessions have completed a successful upload. Re-running `gpufl
|
|
105
|
+
upload` on a completed session refuses with a clear message
|
|
106
|
+
suggesting `--force`; `--all-sessions` mode silently skips completed
|
|
107
|
+
sessions and uploads the rest. The cursor persists across runs to skip
|
|
108
|
+
already-uploaded rotated files.
|
|
109
|
+
|
|
110
|
+
#### `ProfilingEngine` — clarified names
|
|
111
|
+
|
|
112
|
+
The engine enum was reworked into a single, plainly-named ladder
|
|
113
|
+
(no aliases). New default is `Monitor` (telemetry only, no CUPTI).
|
|
114
|
+
|
|
115
|
+
| Name | What it captures |
|
|
116
|
+
|---|---|
|
|
117
|
+
| `Monitor` | GPU/host health metrics only — no CUPTI. The default. |
|
|
118
|
+
| `Trace` | + activity trace: kernels, memcpy, sync (no sampling) |
|
|
119
|
+
| `PcSampling` | + PC stall-reason sampling |
|
|
120
|
+
| `SassMetrics` | + per-instruction SASS counters |
|
|
121
|
+
| `RangeProfiler` | + hardware throughput counters |
|
|
122
|
+
| `Deep` | `PcSampling` + `SassMetrics` in one run |
|
|
123
|
+
|
|
124
|
+
Replaces the earlier `None` / `KernelTrace` / `Continuous` / `Range` /
|
|
125
|
+
`PcSamplingWithSass` names. Pre-1.0, no deprecation shim — the old
|
|
126
|
+
names are gone.
|
|
127
|
+
|
|
128
|
+
#### Ref-counted system-metric sampler
|
|
129
|
+
|
|
130
|
+
`Sampler::configure()` / `activate()` / `deactivate()` / `shutdown()`
|
|
131
|
+
replace the old `start()` / `stop()`. Activation count composes
|
|
132
|
+
across `continuous_system_sampling` baseline, `GFL_SCOPE` enter/exit,
|
|
133
|
+
and explicit `systemStart()` / `systemStop()` calls — the sampler
|
|
134
|
+
runs while any activator is in flight.
|
|
135
|
+
|
|
136
|
+
### Bug fixes
|
|
137
|
+
|
|
138
|
+
#### Scope-driven system sampling now works
|
|
139
|
+
|
|
140
|
+
Before: setting `sampling_auto_start=false` silently disabled all
|
|
141
|
+
system metrics, even inside `GFL_SCOPE` regions. The flag's name
|
|
142
|
+
suggested "wait for explicit start" semantics but the code disabled
|
|
143
|
+
sampling entirely. Now, under the renamed `continuous_system_sampling
|
|
144
|
+
= false`, the sampler activates while inside any scope or between
|
|
145
|
+
`systemStart` / `systemStop` calls, then idles outside that window.
|
|
146
|
+
|
|
147
|
+
#### EventWrapper envelope on upload POSTs
|
|
148
|
+
|
|
149
|
+
The initial `uploadLogs()` draft POSTed bare NDJSON event lines.
|
|
150
|
+
The backend's `EventIngestionController` deserialized those into an
|
|
151
|
+
`EventWrapper` with every field null, the inner `objectMapper.readValue
|
|
152
|
+
(null, ...)` threw, the exception was caught and swallowed, and the
|
|
153
|
+
controller returned 200 OK anyway — silent data loss. Every event is
|
|
154
|
+
now correctly wrapped in `{data, agentSendingTime, hostname, ipAddr}`.
|
|
155
|
+
|
|
156
|
+
Regression test added in `tests/upload/test_upload_logs.cpp`.
|
|
157
|
+
|
|
158
|
+
### Tests added
|
|
159
|
+
|
|
160
|
+
- `tests/core/test_sampler.cpp` — 8 scenarios for the ref-counted
|
|
161
|
+
Sampler (activate/deactivate, nesting, force-shutdown, unbalanced
|
|
162
|
+
deactivate clamping).
|
|
163
|
+
- `tests/upload/test_upload_logs.cpp` — 12 scenarios for the upload
|
|
164
|
+
path (happy path, headers, cursor refusal + force override, auth
|
|
165
|
+
failure, malformed lines, session-id filter, all-sessions,
|
|
166
|
+
lifecycle ordering, EventWrapper envelope regression guard).
|
|
167
|
+
- `tests/python/test_continuous_system_sampling.py` — 5 integration
|
|
168
|
+
scenarios for the three sampling modes plus deprecation behavior.
|
|
169
|
+
|
|
170
|
+
### Internal / build
|
|
171
|
+
|
|
172
|
+
- Removed `include/gpufl/core/logger/http_log_sink.{hpp,cpp}`.
|
|
173
|
+
- Added `include/gpufl/upload/upload_logs.{hpp,cpp}` to the CMake
|
|
174
|
+
target sources.
|
|
175
|
+
- `CMakeLists.txt` `project(VERSION)` bumped to 1.1.0; new
|
|
176
|
+
`GPUFL_VERSION_SUFFIX` variable layers the PEP 440 pre-release
|
|
177
|
+
token onto `GPUFL_CLIENT_VERSION` (currently `"rc2"`; set to `""`
|
|
178
|
+
to promote to 1.1.0 final).
|
|
179
|
+
|
|
180
|
+
### Migration checklist for 1.0.x → 1.1.0rc2
|
|
181
|
+
|
|
182
|
+
**Optional in v1.1, required by v1.2:**
|
|
183
|
+
|
|
184
|
+
- [ ] Python: replace every `gpufl.init(remote_upload=True, ...)` call
|
|
185
|
+
with `with gpufl.session(backend_url=..., api_key=...):` or an
|
|
186
|
+
explicit `gpufl.upload_logs(...)` after `shutdown()`. The old form
|
|
187
|
+
still works in v1.1 with a `DeprecationWarning`; v1.2 will remove it.
|
|
188
|
+
- [ ] C++: replace `opts.remote_upload = true;` with an explicit
|
|
189
|
+
`gpufl::uploadLogs(uopts)` after `gpufl::shutdown()`. The field
|
|
190
|
+
still works in v1.1 (deprecation log at init, then an automatic `gpufl::uploadLogs()` at the end of `gpufl::shutdown()`); v1.2 will delete it.
|
|
191
|
+
- [ ] Container manifests: prefer dropping `GPUFL_REMOTE_UPLOAD` and
|
|
192
|
+
driving upload via your app code (or the `gpufl upload` CLI in a
|
|
193
|
+
lifecycle hook). The env var still routes through the Python shim
|
|
194
|
+
in v1.1; v1.2 stops reading it.
|
|
195
|
+
- [ ] Future-proof: start passing `backend_url` and `api_key` directly
|
|
196
|
+
to `gpufl::uploadLogs()` / `gpufl.upload_logs()` rather than relying
|
|
197
|
+
on the InitOptions fields. Those InitOptions fields will move to
|
|
198
|
+
UploadOptions only in v1.2.
|
|
199
|
+
|
|
200
|
+
**Required in v1.1 (no grace period):**
|
|
201
|
+
|
|
202
|
+
- [ ] Python: rename `sampling_auto_start` → `continuous_system_sampling`.
|
|
203
|
+
The old name still works with a `DeprecationWarning` (removed in v1.2).
|
|
204
|
+
- [ ] C++: rename `opts.sampling_auto_start` → `opts.continuous_system_sampling`
|
|
205
|
+
(compile-time error otherwise — no grace period for C++).
|
|
206
|
+
- [ ] If you `#include`'d `http_log_sink.hpp` directly anywhere,
|
|
207
|
+
drop the include — the header is gone.
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Releases prior to 1.1.0
|
|
212
|
+
|
|
213
|
+
See git tags for the historical record. Highlights:
|
|
214
|
+
|
|
215
|
+
- **1.0.3** — `ScopeMeta` benchmark-iteration helper, scope iterator
|
|
216
|
+
form, `gpufl.report` text summary improvements.
|
|
217
|
+
- **1.0.2** — first version published to PyPI; "Stable" status.
|
|
218
|
+
- **1.0.1** — `kernel_sample_rate_ms` deprecated (no-op).
|
|
219
|
+
- **1.0.0** — first stable contract.
|
|
@@ -1,11 +1,18 @@
|
|
|
1
1
|
cmake_minimum_required(VERSION 3.31)
|
|
2
2
|
|
|
3
3
|
project(gpufl_client
|
|
4
|
-
VERSION 1.0
|
|
4
|
+
VERSION 1.1.0
|
|
5
5
|
LANGUAGES CXX
|
|
6
6
|
DESCRIPTION "Header-only GPU monitoring client library"
|
|
7
7
|
)
|
|
8
8
|
|
|
9
|
+
# Pre-release suffix appended to GPUFL_CLIENT_VERSION below. PEP 440
|
|
10
|
+
# pre-release tokens (`rc1`, `a1`, `b1`, …) aren't valid in CMake's
|
|
11
|
+
# `project(... VERSION ...)`, so we layer them on top here. To promote
|
|
12
|
+
# 1.1.0rc2 → 1.1.0 final, set this to the empty string. To bump rc2 →
|
|
13
|
+
# rc3 mid-validation, change just this line.
|
|
14
|
+
set(GPUFL_VERSION_SUFFIX "rc2")
|
|
15
|
+
|
|
9
16
|
# -----------------------
|
|
10
17
|
# CUDA Architectures (CI Friendly)
|
|
11
18
|
# -----------------------
|
|
@@ -22,6 +29,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
|
|
|
22
29
|
# -----------------------
|
|
23
30
|
option(GPUFL_ENABLE_NVIDIA "Enable NVIDIA backends (CUDA + NVML when available)" ON)
|
|
24
31
|
option(GPUFL_ENABLE_AMD "Enable AMD backends (ROCm when available)" OFF)
|
|
32
|
+
option(GPUFL_ENABLE_AMD_ROCPROFILER "Enable AMD rocprofiler-sdk tracing backend when available" ON)
|
|
25
33
|
|
|
26
34
|
option(BUILD_GPUFL_EXAMPLE "Build gpufl example application" ON)
|
|
27
35
|
option(BUILD_GPUFL_MONITOR "Build gpufl-monitor standalone daemon" ON)
|
|
@@ -56,7 +64,7 @@ target_compile_features(gpufl INTERFACE cxx_std_17)
|
|
|
56
64
|
# inline the literal at call sites and the mismatch becomes visible
|
|
57
65
|
# as comparison failures (e.g. test asserting on User-Agent).
|
|
58
66
|
target_compile_definitions(gpufl PUBLIC
|
|
59
|
-
GPUFL_CLIENT_VERSION="${PROJECT_VERSION}"
|
|
67
|
+
GPUFL_CLIENT_VERSION="${PROJECT_VERSION}${GPUFL_VERSION_SUFFIX}"
|
|
60
68
|
)
|
|
61
69
|
|
|
62
70
|
# Enable PIC for static library (required when linking into shared libraries like Python modules)
|
|
@@ -68,7 +76,7 @@ target_sources(gpufl PRIVATE
|
|
|
68
76
|
include/gpufl/core/logger/logger.cpp
|
|
69
77
|
include/gpufl/core/logger/log_rotator.cpp
|
|
70
78
|
include/gpufl/core/logger/file_log_sink.cpp
|
|
71
|
-
include/gpufl/
|
|
79
|
+
include/gpufl/upload/upload_logs.cpp
|
|
72
80
|
include/gpufl/core/host_info.cpp
|
|
73
81
|
include/gpufl/core/remote_config.cpp
|
|
74
82
|
include/gpufl/core/model/batch_models.cpp
|
|
@@ -147,7 +155,9 @@ target_sources(gpufl PRIVATE include/gpufl/core/logger/file_compressor.cpp)
|
|
|
147
155
|
|
|
148
156
|
|
|
149
157
|
# -----------------------
|
|
150
|
-
# cpp-httplib — HTTP client
|
|
158
|
+
# cpp-httplib — HTTP client used by gpufl::uploadLogs (deferred upload
|
|
159
|
+
# of session NDJSON files to the backend, called after gpufl::shutdown).
|
|
160
|
+
# Also used by remote_config.cpp for the post-init version probe.
|
|
151
161
|
#
|
|
152
162
|
# Single-header library. Fetched once via FetchContent so every build gets
|
|
153
163
|
# the same version regardless of the host system. HTTPS support is gated
|
|
@@ -167,30 +177,37 @@ FetchContent_MakeAvailable(httplib)
|
|
|
167
177
|
|
|
168
178
|
find_package(OpenSSL QUIET)
|
|
169
179
|
if(OpenSSL_FOUND)
|
|
170
|
-
message(STATUS "Found OpenSSL: ${OPENSSL_VERSION} —
|
|
180
|
+
message(STATUS "Found OpenSSL: ${OPENSSL_VERSION} — HTTPS upload enabled")
|
|
171
181
|
target_compile_definitions(gpufl PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT=1)
|
|
172
182
|
target_link_libraries(gpufl PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
|
173
183
|
set(GPUFL_HTTPLIB_TLS 1)
|
|
174
184
|
else()
|
|
175
185
|
message(WARNING
|
|
176
|
-
"OpenSSL not found —
|
|
177
|
-
"
|
|
178
|
-
"
|
|
179
|
-
"
|
|
186
|
+
"OpenSSL not found — gpufl::uploadLogs will support HTTP only. "
|
|
187
|
+
"Pointing backend_url at an https:// endpoint will fail to verify. "
|
|
188
|
+
"Install OpenSSL (apt: libssl-dev, vcpkg: openssl, brew: openssl) "
|
|
189
|
+
"to enable TLS.")
|
|
180
190
|
set(GPUFL_HTTPLIB_TLS 0)
|
|
181
191
|
endif()
|
|
182
192
|
target_compile_definitions(gpufl PRIVATE GPUFL_HTTPLIB_TLS=${GPUFL_HTTPLIB_TLS})
|
|
183
|
-
#
|
|
184
|
-
#
|
|
185
|
-
#
|
|
186
|
-
#
|
|
187
|
-
#
|
|
188
|
-
#
|
|
189
|
-
#
|
|
193
|
+
# Enable cpp-httplib's gzip path. PUBLIC so any consumer that includes
|
|
194
|
+
# httplib.h via gpufl's headers (notably the test target's embedded
|
|
195
|
+
# httplib::Server) gets the same wire-format support — without this,
|
|
196
|
+
# server::Post handlers return 415 on incoming Content-Encoding: gzip
|
|
197
|
+
# requests, which is exactly what uploadLogs sends in v1.2+.
|
|
198
|
+
#
|
|
199
|
+
# In production, the Spring Boot backend handles gzip natively, so
|
|
200
|
+
# this define affects the test target more than the client. We still
|
|
201
|
+
# define it on gpufl PUBLIC because:
|
|
202
|
+
# - It enables future use of httplib::Client::set_compress(true)
|
|
203
|
+
# to compress the body in-place instead of our manual gzipString
|
|
204
|
+
# (no current need, but a free optimization if we ever want it).
|
|
205
|
+
# - It propagates to tests, which is the immediate motivation.
|
|
190
206
|
#
|
|
191
|
-
# ZLIB is
|
|
192
|
-
#
|
|
193
|
-
#
|
|
207
|
+
# ZLIB is already linked above — file_compressor.cpp uses it to gzip
|
|
208
|
+
# rotated NDJSON files, and upload_logs.cpp uses it both to read those
|
|
209
|
+
# files back and to gzip outgoing stream-chunks.
|
|
210
|
+
target_compile_definitions(gpufl PUBLIC CPPHTTPLIB_ZLIB_SUPPORT=1)
|
|
194
211
|
target_link_libraries(gpufl PRIVATE httplib::httplib)
|
|
195
212
|
|
|
196
213
|
|
|
@@ -225,6 +242,7 @@ if(GPUFL_ENABLE_NVIDIA)
|
|
|
225
242
|
include/gpufl/backends/nvidia/sampler/cupti_sass.hpp
|
|
226
243
|
include/gpufl/backends/nvidia/cuda_collector.cpp
|
|
227
244
|
include/gpufl/backends/nvidia/cupti_utils.cpp
|
|
245
|
+
include/gpufl/backends/nvidia/cuda_cleanup_handler.cpp
|
|
228
246
|
include/gpufl/backends/nvidia/resource_handler.cpp
|
|
229
247
|
include/gpufl/backends/nvidia/kernel_launch_handler.cpp
|
|
230
248
|
include/gpufl/backends/nvidia/mem_transfer_handler.cpp
|
|
@@ -481,18 +499,22 @@ if(GPUFL_ENABLE_AMD)
|
|
|
481
499
|
endif()
|
|
482
500
|
endif()
|
|
483
501
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
502
|
+
if(GPUFL_ENABLE_AMD_ROCPROFILER)
|
|
503
|
+
find_package(rocprofiler-sdk QUIET CONFIG HINTS /opt/rocm)
|
|
504
|
+
if(TARGET rocprofiler-sdk::rocprofiler-sdk)
|
|
505
|
+
set(GPUFL_HAS_ROCPROFILER_SDK 1)
|
|
506
|
+
target_link_libraries(gpufl PRIVATE rocprofiler-sdk::rocprofiler-sdk)
|
|
507
|
+
target_sources(gpufl PRIVATE
|
|
508
|
+
include/gpufl/backends/amd/monitor_adapter_amd.cpp
|
|
509
|
+
include/gpufl/backends/amd/rocprofiler_backend.cpp
|
|
510
|
+
include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp
|
|
511
|
+
)
|
|
512
|
+
message(STATUS "Found ROCprofiler-SDK support")
|
|
513
|
+
else()
|
|
514
|
+
message(STATUS "ROCprofiler-SDK not found; AMD kernel tracing disabled")
|
|
515
|
+
endif()
|
|
494
516
|
else()
|
|
495
|
-
message(STATUS "ROCprofiler-SDK
|
|
517
|
+
message(STATUS "ROCprofiler-SDK support disabled by GPUFL_ENABLE_AMD_ROCPROFILER=OFF")
|
|
496
518
|
endif()
|
|
497
519
|
|
|
498
520
|
if(GPUFL_HAS_ROCM_SMI OR GPUFL_HAS_HIP)
|
|
@@ -44,6 +44,7 @@ RUN cmake -S . -B /build \
|
|
|
44
44
|
-DBUILD_GPUFL_MONITOR=ON \
|
|
45
45
|
-DGPUFL_ENABLE_NVIDIA=OFF \
|
|
46
46
|
-DGPUFL_ENABLE_AMD=ON \
|
|
47
|
+
-DGPUFL_ENABLE_AMD_ROCPROFILER=OFF \
|
|
47
48
|
&& cmake --build /build --target gpufl-monitor --parallel
|
|
48
49
|
|
|
49
50
|
# ── Stage 2: pull the pre-built Java agent ────────────────────────────────────
|
|
@@ -53,22 +54,13 @@ FROM ghcr.io/gpu-flight/gpufl-agent:latest AS agent-jar
|
|
|
53
54
|
FROM eclipse-temurin:25-jre AS jre
|
|
54
55
|
|
|
55
56
|
# ── Stage 4: runtime image ─────────────────────────────────────────────────────
|
|
56
|
-
|
|
57
|
+
# Use ROCm base runtime to avoid fragile manual ROCm apt repo/package wiring.
|
|
58
|
+
FROM rocm/dev-ubuntu-24.04:6.4-complete
|
|
57
59
|
|
|
58
60
|
ENV DEBIAN_FRONTEND=noninteractive
|
|
59
61
|
|
|
60
|
-
# Add ROCm apt repository
|
|
61
62
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
62
|
-
ca-certificates \
|
|
63
|
-
wget \
|
|
64
|
-
gnupg \
|
|
65
|
-
&& wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/keyrings/rocm.gpg \
|
|
66
|
-
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.4 noble main' \
|
|
67
|
-
> /etc/apt/sources.list.d/rocm.list \
|
|
68
|
-
&& apt-get update && apt-get install -y --no-install-recommends \
|
|
69
63
|
supervisor \
|
|
70
|
-
rocm-smi-lib \
|
|
71
|
-
rocm-hip-runtime \
|
|
72
64
|
&& rm -rf /var/lib/apt/lists/*
|
|
73
65
|
|
|
74
66
|
# Copy Java 25 JRE from Temurin (Ubuntu 24.04 repos only ship up to openjdk-21)
|