gpufl 0.1.4__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpufl-0.1.4 → gpufl-1.0.0}/.github/workflows/release.yml +84 -3
- {gpufl-0.1.4 → gpufl-1.0.0}/CMakeLists.txt +1 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.monitor +5 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.monitor.amd +5 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.monitor.supervisord.conf +3 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/PKG-INFO +69 -10
- {gpufl-0.1.4 → gpufl-1.0.0}/README.md +66 -7
- {gpufl-0.1.4 → gpufl-1.0.0}/daemon/README.md +6 -5
- {gpufl-0.1.4 → gpufl-1.0.0}/docker-compose.monitor.amd.yml +6 -2
- {gpufl-0.1.4 → gpufl-1.0.0}/docker-compose.monitor.yml +6 -2
- {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/gpufl_scope_demo.cpp +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/block_style_example.cu +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/memory_coalescing_demo.cu +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/occupancy_demo.cu +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/sass_divergence_demo.cu +2 -5
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/vector_add_benchmark.cu +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/02_numba_cuda.py +42 -2
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/03_pytorch_benchmark.py +2 -4
- gpufl-1.0.0/images/Screenshot2.png +0 -0
- gpufl-1.0.0/include/gpufl/backends/nvidia/cuda_feature_guards.hpp +39 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_backend.hpp +17 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/kernel_launch_handler.cpp +86 -76
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/resource_handler.cpp +9 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/config_file_loader.cpp +4 -2
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/events.hpp +17 -5
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/gpufl.cpp +1 -12
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/itanium_demangle.cpp +58 -4
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/itanium_demangle.hpp +5 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/http_log_sink.cpp +80 -12
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/http_log_sink.hpp +28 -3
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/lifecycle_model.cpp +0 -8
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor.hpp +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/remote_config.cpp +0 -3
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stack_trace.cpp +17 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/gpufl.hpp +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/pyproject.toml +14 -3
- {gpufl-0.1.4 → gpufl-1.0.0}/python/bindings.cpp +39 -55
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/__init__.py +3 -19
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/analyzer/analyzer.py +52 -3
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/report/__init__.py +2 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/report/text_report.py +8 -0
- gpufl-1.0.0/python/gpufl/viz/reader.py +239 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/viz/timeline.py +194 -78
- {gpufl-0.1.4 → gpufl-1.0.0}/scripts/windows/run-monitor-local.bat +3 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_engine_coverage.cpp +0 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_http_log_sink.cpp +109 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_wire_contract.cpp +16 -1
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/test_bindings.py +0 -3
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/verify_pipeline.py +4 -3
- gpufl-0.1.4/python/gpufl/viz/reader.py +0 -48
- {gpufl-0.1.4 → gpufl-1.0.0}/.clang-format +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/.dockerignore +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/.github/pull_request_template.md +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/.github/workflows/build.yml +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/.gitignore +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/CONTRIBUTING.md +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.demo +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/LICENSE +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/README.md +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/cuda_gemm.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/pytorch_train.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/run_benchmark.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/build.sh +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/daemon/monitor/CMakeLists.txt +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/daemon/monitor/main.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/CMakeLists.txt +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/README.md +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/check_device.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/vector_add_benchmark.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/CMakeLists.txt +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/check_conflict.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/check_device.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/cupti_basic.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/cupti_pc_sampling.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/list_sass_metrics.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/system_monitor.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/test_occupancy.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/01_basic.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/analyzer/01_analyzer_sample.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/requirements.txt +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/viz/01_plot_memory_timeline.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/example/python/viz/02_plot_stress_timeline.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/images/Screenshot1.png +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/engine/amd_profiling_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/engine/dispatch_counter_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/hip_static_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/hip_static_collector.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/monitor_adapter_amd.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/monitor_adapter_amd.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocm_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocm_collector.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocprofiler_backend.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocprofiler_backend.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/host_collector.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cuda_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cuda_collector.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_backend.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_common.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_utils.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_utils.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/profiling_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/range_profiler_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/kernel_launch_handler.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/mem_transfer_handler.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/mem_transfer_handler.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/nvml_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/nvml_collector.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/resource_handler.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/sampler/cupti_sass.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/sampler/cupti_sass.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/synchronization_handler.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/synchronization_handler.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/activity_record.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/backend_factory.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/backend_factory.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/backend_interfaces.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/batch_buffer.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/common.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/common.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/config_file_loader.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/debug_logger.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/debug_logger.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/dictionary_manager.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/dictionary_manager.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/host_info.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/host_info.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/json/json.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/json/json.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_compressor.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_compressor.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_log_sink.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_log_sink.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/log_rotator.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/log_rotator.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/log_sink.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/logger.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/logger.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/batch_models.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/batch_models.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/graph_launch_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/graph_launch_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/kernel_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/kernel_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/lifecycle_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memcpy_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memcpy_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memory_alloc_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memory_alloc_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/model_utils.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/nvtx_marker_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/nvtx_marker_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/perf_metric_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/perf_metric_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/profile_sample_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/profile_sample_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/scope_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/scope_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/serializable.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/synchronization_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/synchronization_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/system_event_model.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/system_event_model.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor_adapter.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor_adapter.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor_backend.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/remote_config.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/ring_buffer.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/runtime.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/runtime.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sampler.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sampler.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sass_compressor.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sass_compressor.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/scope_registry.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/scope_registry.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stack_registry.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stack_trace.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stream_handle.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/trace_type.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/version.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/hint_engine.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/hint_engine.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/text_report.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/text_report.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/.gitignore +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/analyzer/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/cupy/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/jax/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/numba/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/dispatch.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/profile.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/stack.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/trace_import.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/triton/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/utils.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/viz/__init__.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/viz/visualizer.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/scripts/docker-demo-loop.sh +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/CMakeLists.txt +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/amd/test_rocm_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_cuda_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_nvidia_backend.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_nvml_collector.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/log_utils.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/log_utils.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/test_kernel.cu +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/test_kernel.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/test_utils.hpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_analyzer.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_api_path_routing.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_batch_models.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_itanium_demangle.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_monitor.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/main_test_runner.cpp +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/conftest.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/test_analyzer.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/test_remote_upload_smoke.py +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/run_engine_coverage.ps1 +0 -0
- {gpufl-0.1.4 → gpufl-1.0.0}/tests/run_engine_coverage.sh +0 -0
|
@@ -136,16 +136,65 @@ jobs:
|
|
|
136
136
|
uses: pypa/cibuildwheel@v2.22.0
|
|
137
137
|
env:
|
|
138
138
|
CIBW_VIRTUALENV_VERSION: "20.27.1"
|
|
139
|
-
|
|
139
|
+
# OpenSSL hints: manylinux_2_28 (AlmaLinux 8) ships OpenSSL 1.1.1
|
|
140
|
+
# as the system default, but cpp-httplib v0.18.5 requires >= 3.0.0
|
|
141
|
+
# (SSL_get1_peer_certificate). We install EL8's `openssl3-devel`
|
|
142
|
+
# compat package (see CIBW_BEFORE_ALL_LINUX) which lays OpenSSL 3.x
|
|
143
|
+
# down under NON-standard prefixes — headers in /usr/include/openssl3,
|
|
144
|
+
# dev symlinks in /usr/lib64/openssl3 — so find_package(OpenSSL)
|
|
145
|
+
# won't see it without these explicit cache vars. The runtime
|
|
146
|
+
# SONAME is still libssl.so.3 in /usr/lib64, so auditwheel bundles
|
|
147
|
+
# it into gpufl.libs/ as before. Verified against the actual
|
|
148
|
+
# quay.io/pypa/manylinux_2_28_x86_64 image (OpenSSL 3.5.5).
|
|
149
|
+
CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF -DOPENSSL_INCLUDE_DIR=/usr/include/openssl3 -DOPENSSL_SSL_LIBRARY=/usr/lib64/openssl3/libssl.so -DOPENSSL_CRYPTO_LIBRARY=/usr/lib64/openssl3/libcrypto.so'"
|
|
150
|
+
# Windows build needs the OpenSSL install path so find_package(OpenSSL)
|
|
151
|
+
# in CMakeLists.txt succeeds, otherwise HTTPS upload (HttpLogSink)
|
|
152
|
+
# silently falls back to HTTP-only — see openssl-windows.html in the
|
|
153
|
+
# manual repo for the user-facing story. CIBW_BEFORE_ALL_WINDOWS
|
|
154
|
+
# installs choco's openssl package into this path.
|
|
155
|
+
CIBW_ENVIRONMENT_WINDOWS: >-
|
|
156
|
+
OPENSSL_ROOT_DIR="C:/Program Files/OpenSSL-Win64"
|
|
157
|
+
CMAKE_ARGS="-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF"
|
|
140
158
|
# cuda-nvml-devel-13-1 ships the libnvidia-ml.so stub under
|
|
141
159
|
# targets/x86_64-linux/lib/stubs/ — without it CMake's NVML probe
|
|
142
160
|
# finds nothing and (since v0.1.1) fails the build loudly. Every
|
|
143
161
|
# release before v0.1.1 silently shipped wheels without NVML
|
|
144
162
|
# because this package was missing here.
|
|
163
|
+
#
|
|
164
|
+
# openssl3-devel (NOT openssl-devel — that's 1.1.1 on EL8) provides
|
|
165
|
+
# OpenSSL 3.x headers + .so symlinks so cpp-httplib's
|
|
166
|
+
# CPPHTTPLIB_OPENSSL_SUPPORT path compiles (it #errors on < 3.0.0).
|
|
167
|
+
# It installs under /usr/include/openssl3 + /usr/lib64/openssl3 —
|
|
168
|
+
# see the OPENSSL_* hints in CIBW_ENVIRONMENT_LINUX. auditwheel
|
|
169
|
+
# bundles the resulting libssl.so.3 / libcrypto.so.3 (from
|
|
170
|
+
# /usr/lib64) into the wheel under gpufl.libs/ automatically
|
|
171
|
+
# (they're not on the manylinux_2_28 whitelist or our --exclude list).
|
|
145
172
|
CIBW_BEFORE_ALL_LINUX: >-
|
|
146
173
|
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo > /etc/yum.repos.d/cuda.repo &&
|
|
147
|
-
dnf install -y --nogpgcheck cuda-nvcc-13-1 cuda-cudart-devel-13-1 cuda-cupti-13-1 cuda-driver-devel-13-1 cuda-nvml-devel-13-1
|
|
148
|
-
|
|
174
|
+
dnf install -y --nogpgcheck cuda-nvcc-13-1 cuda-cudart-devel-13-1 cuda-cupti-13-1 cuda-driver-devel-13-1 cuda-nvml-devel-13-1 openssl3-devel
|
|
175
|
+
# Install OpenSSL on the Windows runner so find_package(OpenSSL)
|
|
176
|
+
# in CMakeLists.txt succeeds and cpp-httplib gets compiled with
|
|
177
|
+
# CPPHTTPLIB_OPENSSL_SUPPORT=1. Chocolatey is pre-installed on
|
|
178
|
+
# GitHub's windows-latest runners; the openssl package installs
|
|
179
|
+
# to C:\Program Files\OpenSSL-Win64\ (matches OPENSSL_ROOT_DIR
|
|
180
|
+
# in CIBW_ENVIRONMENT_WINDOWS above).
|
|
181
|
+
CIBW_BEFORE_ALL_WINDOWS: choco install -y openssl --no-progress
|
|
182
|
+
# cibuildwheel ships NO default Windows repair command and only
|
|
183
|
+
# auto-installs delvewheel for that (nonexistent) default. Because
|
|
184
|
+
# we override CIBW_REPAIR_WHEEL_COMMAND_WINDOWS below (to pass
|
|
185
|
+
# --add-path for the OpenSSL DLLs), we must install delvewheel
|
|
186
|
+
# ourselves. The repair step runs in this same build env, so the
|
|
187
|
+
# tool lands on PATH. Without this the repair dies with
|
|
188
|
+
# "'delvewheel' is not recognized as an internal or external command".
|
|
189
|
+
CIBW_BEFORE_BUILD_WINDOWS: pip install delvewheel
|
|
190
|
+
# Pin a recent manylinux_2_28 image (AlmaLinux 8.10). cibuildwheel
|
|
191
|
+
# 2.22.0's default pin (2024.11.16-1) is an AlmaLinux 8.6 snapshot
|
|
192
|
+
# whose repos only carry OpenSSL 1.1.1 — there is NO openssl3
|
|
193
|
+
# package — so the build died with "No match for argument:
|
|
194
|
+
# openssl3-devel". 8.10 ships openssl3 (3.5.5). Same glibc-2.28 /
|
|
195
|
+
# manylinux_2_28 ABI; verified to have cp312+cp313, openssl3-devel,
|
|
196
|
+
# and libssl.so.3 in /usr/lib64 (so auditwheel still bundles it).
|
|
197
|
+
CIBW_MANYLINUX_X86_64_IMAGE: quay.io/pypa/manylinux_2_28_x86_64:2026.05.17-1
|
|
149
198
|
CIBW_BUILD: "cp312-manylinux_x86_64 cp313-manylinux_x86_64 cp312-win_amd64 cp313-win_amd64"
|
|
150
199
|
# libnvidia-ml.so.1 is excluded for the same reason as libcuda.so.1:
|
|
151
200
|
# it ships with the NVIDIA driver, not the CUDA toolkit, and is
|
|
@@ -156,7 +205,39 @@ jobs:
|
|
|
156
205
|
# could not be located"). The toolkit's `libnvidia-ml.so` stub is
|
|
157
206
|
# only the unversioned link-time placeholder — the versioned
|
|
158
207
|
# `.so.1` the SONAME chains to lives on the user's machine.
|
|
208
|
+
#
|
|
209
|
+
# libssl/libcrypto are NOT excluded — auditwheel bundles them
|
|
210
|
+
# under gpufl.libs/ so the wheel ships its own OpenSSL and
|
|
211
|
+
# HTTPS works on user machines without any system install.
|
|
159
212
|
CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --plat manylinux_2_28_x86_64 --exclude libcuda.so.1 --exclude libnvidia-ml.so.1 -w {dest_dir} {wheel}"
|
|
213
|
+
# On Windows, cibuildwheel has no default repair command, so we run `delvewheel repair` explicitly.
|
|
214
|
+
# We need delvewheel to find the OpenSSL DLLs (libssl-3-x64.dll,
|
|
215
|
+
# libcrypto-3-x64.dll) so it copies them into the wheel. The
|
|
216
|
+
# choco install puts them under C:\Program Files\OpenSSL-Win64\bin\
|
|
217
|
+
# — give that to delvewheel via --add-path. Without this, the
|
|
218
|
+
# rebuilt wheel imports cleanly on a system that already has
|
|
219
|
+
# OpenSSL on PATH but fails on a clean machine.
|
|
220
|
+
# delvewheel vendors the wheel's DLL deps. We must:
|
|
221
|
+
# * --add-path the dirs holding the DLLs to bundle. OpenSSL is in
|
|
222
|
+
# its choco bin; cudart64_*.dll is in CUDA\vX.Y\bin (on PATH, but
|
|
223
|
+
# listed for safety); cupti64_*.dll lives in CUDA's
|
|
224
|
+
# extras\CUPTI\lib64, which is NOT on PATH — without it delvewheel
|
|
225
|
+
# fails with "Unable to find library: cupti64_2025.4.0.dll". We
|
|
226
|
+
# bundle cudart+cupti so the wheel is self-contained, matching the
|
|
227
|
+
# Linux wheel (auditwheel bundles libcudart/libcupti). The CUPTI
|
|
228
|
+
# version-suffixed DLL name also makes excluding it fragile across
|
|
229
|
+
# CUDA point releases, so bundling is the robust choice.
|
|
230
|
+
# * --exclude the driver DLLs that ship with the user's driver, not
|
|
231
|
+
# the toolkit, and are absent on the GPU-less runner: nvcuda.dll
|
|
232
|
+
# (== libcuda.so.1) and nvml.dll (== libnvidia-ml.so.1). Mirrors
|
|
233
|
+
# the Linux auditwheel --exclude flags.
|
|
234
|
+
# Paths use the pinned CUDA 13.1 location (see the Jimver cuda-toolkit
|
|
235
|
+
# step). Both --add-path and --exclude are ';'-delimited.
|
|
236
|
+
CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: >-
|
|
237
|
+
delvewheel repair
|
|
238
|
+
--add-path "C:\\Program Files\\OpenSSL-Win64\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.1\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.1\\extras\\CUPTI\\lib64"
|
|
239
|
+
--exclude "nvcuda.dll;nvml.dll"
|
|
240
|
+
-w {dest_dir} {wheel}
|
|
160
241
|
|
|
161
242
|
- uses: actions/upload-artifact@v4
|
|
162
243
|
with:
|
|
@@ -5,10 +5,14 @@
|
|
|
5
5
|
#
|
|
6
6
|
# Run:
|
|
7
7
|
# docker run --gpus all \
|
|
8
|
-
# -e
|
|
8
|
+
# -e GPUFL_HTTP_HOST=https://api.gpuflight.com \
|
|
9
9
|
# -e GPUFL_HTTP_TOKEN=gfl_... \
|
|
10
10
|
# gpufl/monitor:latest
|
|
11
11
|
#
|
|
12
|
+
# GPUFL_HTTP_HOST is just the scheme+host. The agent appends the
|
|
13
|
+
# /api/{version}/events/<type> path automatically; override the
|
|
14
|
+
# version with GPUFL_HTTP_API_VERSION when the backend cuts v2 etc.
|
|
15
|
+
#
|
|
12
16
|
# The Java agent JAR is pulled from the pre-built ghcr.io/gpu-flight/gpufl-agent image.
|
|
13
17
|
# No local gpufl-agent checkout is required.
|
|
14
18
|
|
|
@@ -7,10 +7,14 @@
|
|
|
7
7
|
# docker run -d \
|
|
8
8
|
# --device /dev/kfd --device /dev/dri \
|
|
9
9
|
# --group-add video --group-add render \
|
|
10
|
-
# -e
|
|
10
|
+
# -e GPUFL_HTTP_HOST=https://api.gpuflight.com \
|
|
11
11
|
# -e GPUFL_HTTP_TOKEN=gfl_... \
|
|
12
12
|
# gpufl/monitor-amd:latest
|
|
13
13
|
#
|
|
14
|
+
# GPUFL_HTTP_HOST is just the scheme+host. The agent appends the
|
|
15
|
+
# /api/{version}/events/<type> path automatically; override the
|
|
16
|
+
# version with GPUFL_HTTP_API_VERSION when the backend cuts v2 etc.
|
|
17
|
+
#
|
|
14
18
|
# The Java agent JAR is pulled from the pre-built ghcr.io/gpu-flight/gpufl-agent image.
|
|
15
19
|
# No local gpufl-agent checkout is required.
|
|
16
20
|
|
|
@@ -21,7 +21,9 @@ stderr_logfile_maxbytes=0
|
|
|
21
21
|
; GPUFL_SOURCE_FOLDER — must match the log dir used by gpufl-monitor
|
|
22
22
|
; GPUFL_SOURCE_PREFIX — must match GPUFL_MONITOR_LOG_DIR base name (default: session)
|
|
23
23
|
; GPUFL_PUBLISHER_TYPE — http or kafka
|
|
24
|
-
;
|
|
24
|
+
; GPUFL_HTTP_HOST — scheme+host, e.g. https://api.gpuflight.com
|
|
25
|
+
; (agent appends /api/{version}/events/<type> automatically)
|
|
26
|
+
; GPUFL_HTTP_API_VERSION — optional; defaults to v1
|
|
25
27
|
; GPUFL_HTTP_TOKEN — Bearer token
|
|
26
28
|
; GPUFL_LOG_TYPES — default: device,scope,system (override to restrict channels)
|
|
27
29
|
; GPUFL_CURSOR_FILE — default: ./cursor.json (override for persistence across restarts)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: gpufl
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: GPU Monitoring Client
|
|
5
5
|
Author-Email: Myoungho Shin <myounghoshin84@gmail.com>
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -22,7 +22,7 @@ Requires-Dist: jax>=0.4; extra == "jax"
|
|
|
22
22
|
Provides-Extra: triton
|
|
23
23
|
Requires-Dist: triton>=2.1; extra == "triton"
|
|
24
24
|
Provides-Extra: numba
|
|
25
|
-
Requires-Dist: numba; extra == "numba"
|
|
25
|
+
Requires-Dist: numba-cuda; extra == "numba"
|
|
26
26
|
Requires-Dist: numpy; extra == "numba"
|
|
27
27
|
Provides-Extra: viz
|
|
28
28
|
Requires-Dist: pandas>=1.5; extra == "viz"
|
|
@@ -36,7 +36,7 @@ Requires-Dist: requests>=2.28; extra == "all"
|
|
|
36
36
|
Requires-Dist: cupy-cuda12x>=12; extra == "all"
|
|
37
37
|
Requires-Dist: jax>=0.4; extra == "all"
|
|
38
38
|
Requires-Dist: triton>=2.1; extra == "all"
|
|
39
|
-
Requires-Dist: numba; extra == "all"
|
|
39
|
+
Requires-Dist: numba-cuda; extra == "all"
|
|
40
40
|
Requires-Dist: numpy; extra == "all"
|
|
41
41
|
Requires-Dist: pandas>=1.5; extra == "all"
|
|
42
42
|
Requires-Dist: matplotlib>=3.7; extra == "all"
|
|
@@ -63,7 +63,7 @@ To keep the initial design coherent, **we are not currently accepting major feat
|
|
|
63
63
|
|
|
64
64
|
Try the portal with real session data — no sign-up required:
|
|
65
65
|
|
|
66
|
-
**[
|
|
66
|
+
**[Demo Link](https://demo.gpuflight.com)**
|
|
67
67
|
|
|
68
68
|
## Key Features
|
|
69
69
|
|
|
@@ -128,7 +128,6 @@ import gpufl
|
|
|
128
128
|
gpufl.init("my-app",
|
|
129
129
|
log_path="./my_logs",
|
|
130
130
|
sampling_auto_start=True,
|
|
131
|
-
enable_kernel_details=True,
|
|
132
131
|
enable_stack_trace=True)
|
|
133
132
|
|
|
134
133
|
a = torch.randn(1024, 1024, device="cuda")
|
|
@@ -150,8 +149,7 @@ from gpufl import ProfilingEngine
|
|
|
150
149
|
|
|
151
150
|
gpufl.init("my-app",
|
|
152
151
|
log_path="./logs",
|
|
153
|
-
profiling_engine=ProfilingEngine.PcSampling
|
|
154
|
-
enable_kernel_details=True)
|
|
152
|
+
profiling_engine=ProfilingEngine.PcSampling)
|
|
155
153
|
```
|
|
156
154
|
|
|
157
155
|
| Engine | What it collects | Analyzer method | Best for |
|
|
@@ -167,7 +165,6 @@ gpufl.init("my-app",
|
|
|
167
165
|
gpufl::InitOptions opts;
|
|
168
166
|
opts.app_name = "my_app";
|
|
169
167
|
opts.log_path = "my_logs";
|
|
170
|
-
opts.enable_kernel_details = true;
|
|
171
168
|
opts.enable_stack_trace = true;
|
|
172
169
|
opts.sampling_auto_start = true;
|
|
173
170
|
opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
|
|
@@ -305,6 +302,60 @@ viz.show()
|
|
|
305
302
|
|
|
306
303
|
---
|
|
307
304
|
|
|
305
|
+
## Report Generation
|
|
306
|
+
|
|
307
|
+
For a quick, shareable text summary of a session — session metadata, kernel
|
|
308
|
+
hotspots, duration percentiles, and system metrics — generate a **text report**.
|
|
309
|
+
It's the fastest way to see "what happened" without opening the dashboard, and
|
|
310
|
+
it drops cleanly into CI logs, PR comments, or a plain terminal.
|
|
311
|
+
|
|
312
|
+

|
|
313
|
+
|
|
314
|
+
The report includes:
|
|
315
|
+
- **Session Summary** — app name, session ID, duration, GPU device + SM count.
|
|
316
|
+
- **Kernel Execution Summary** — total / unique kernels, GPU-busy %, and
|
|
317
|
+
duration statistics (avg / median / P90 / P99 / min / max). When a SASS
|
|
318
|
+
profiling engine was active, kernel durations include instrumentation
|
|
319
|
+
overhead and the report labels them accordingly.
|
|
320
|
+
- **Top kernels by total GPU time** — with per-kernel call counts.
|
|
321
|
+
- **Per-kernel details** — grid/block dimensions, occupancy, registers,
|
|
322
|
+
shared memory (static + dynamic), register spills, and Waves/SM.
|
|
323
|
+
|
|
324
|
+
### From C++
|
|
325
|
+
|
|
326
|
+
Call `generateReport()` after `shutdown()` — it reads the NDJSON logs written
|
|
327
|
+
during the session:
|
|
328
|
+
|
|
329
|
+
```cpp
|
|
330
|
+
gpufl::init(opts);
|
|
331
|
+
// ... your CUDA / HIP work ...
|
|
332
|
+
gpufl::shutdown();
|
|
333
|
+
|
|
334
|
+
gpufl::generateReport(); // print to stdout
|
|
335
|
+
gpufl::generateReport("report.txt"); // or save to a file
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### From Python
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from gpufl.report import generate_report
|
|
342
|
+
|
|
343
|
+
# Print the report — wrap in print() so newlines render. In a Jupyter
|
|
344
|
+
# notebook this also keeps the table columns aligned (stdout renders in
|
|
345
|
+
# a monospace font). A bare `generate_report(...)` as a cell's last
|
|
346
|
+
# expression shows an escaped one-line string, so always print() it.
|
|
347
|
+
text = generate_report("./logs", log_prefix="my_app", top_n=10)
|
|
348
|
+
print(text)
|
|
349
|
+
|
|
350
|
+
# Or save it straight to a file
|
|
351
|
+
generate_report("./logs", log_prefix="my_app", top_n=10, output_path="report.txt")
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
The Python version reads the same NDJSON logs the analyzer uses — no GPU
|
|
355
|
+
required, so you can generate reports from logs copied off another machine.
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
308
359
|
## Testing
|
|
309
360
|
|
|
310
361
|
### C++ Tests
|
|
@@ -345,5 +396,13 @@ To allow non-root users to profile GPU kernels (using CUPTI/PC Sampling) on Linu
|
|
|
345
396
|
|
|
346
397
|
---
|
|
347
398
|
|
|
348
|
-
|
|
349
|
-
|
|
399
|
+
## Where your logs go
|
|
400
|
+
|
|
401
|
+
By default the client writes NDJSON logs to disk. To stream them to a hosted
|
|
402
|
+
dashboard, set `backend_url` + `api_key` (or the `GPUFL_BACKEND_URL` /
|
|
403
|
+
`GPUFL_API_KEY` env vars) and they're delivered live to
|
|
404
|
+
[app.gpuflight.com](https://app.gpuflight.com). Create a workspace at
|
|
405
|
+
[gpuflight.com](https://gpuflight.com).
|
|
406
|
+
|
|
407
|
+
This client (gpufl-client) is open source. The ingestion service and
|
|
408
|
+
the dashboard UI are proprietary and managed-only today.
|
|
@@ -18,7 +18,7 @@ To keep the initial design coherent, **we are not currently accepting major feat
|
|
|
18
18
|
|
|
19
19
|
Try the portal with real session data — no sign-up required:
|
|
20
20
|
|
|
21
|
-
**[
|
|
21
|
+
**[Demo Link](https://demo.gpuflight.com)**
|
|
22
22
|
|
|
23
23
|
## Key Features
|
|
24
24
|
|
|
@@ -83,7 +83,6 @@ import gpufl
|
|
|
83
83
|
gpufl.init("my-app",
|
|
84
84
|
log_path="./my_logs",
|
|
85
85
|
sampling_auto_start=True,
|
|
86
|
-
enable_kernel_details=True,
|
|
87
86
|
enable_stack_trace=True)
|
|
88
87
|
|
|
89
88
|
a = torch.randn(1024, 1024, device="cuda")
|
|
@@ -105,8 +104,7 @@ from gpufl import ProfilingEngine
|
|
|
105
104
|
|
|
106
105
|
gpufl.init("my-app",
|
|
107
106
|
log_path="./logs",
|
|
108
|
-
profiling_engine=ProfilingEngine.PcSampling
|
|
109
|
-
enable_kernel_details=True)
|
|
107
|
+
profiling_engine=ProfilingEngine.PcSampling)
|
|
110
108
|
```
|
|
111
109
|
|
|
112
110
|
| Engine | What it collects | Analyzer method | Best for |
|
|
@@ -122,7 +120,6 @@ gpufl.init("my-app",
|
|
|
122
120
|
gpufl::InitOptions opts;
|
|
123
121
|
opts.app_name = "my_app";
|
|
124
122
|
opts.log_path = "my_logs";
|
|
125
|
-
opts.enable_kernel_details = true;
|
|
126
123
|
opts.enable_stack_trace = true;
|
|
127
124
|
opts.sampling_auto_start = true;
|
|
128
125
|
opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
|
|
@@ -260,6 +257,60 @@ viz.show()
|
|
|
260
257
|
|
|
261
258
|
---
|
|
262
259
|
|
|
260
|
+
## Report Generation
|
|
261
|
+
|
|
262
|
+
For a quick, shareable text summary of a session — session metadata, kernel
|
|
263
|
+
hotspots, duration percentiles, and system metrics — generate a **text report**.
|
|
264
|
+
It's the fastest way to see "what happened" without opening the dashboard, and
|
|
265
|
+
it drops cleanly into CI logs, PR comments, or a plain terminal.
|
|
266
|
+
|
|
267
|
+

|
|
268
|
+
|
|
269
|
+
The report includes:
|
|
270
|
+
- **Session Summary** — app name, session ID, duration, GPU device + SM count.
|
|
271
|
+
- **Kernel Execution Summary** — total / unique kernels, GPU-busy %, and
|
|
272
|
+
duration statistics (avg / median / P90 / P99 / min / max). When a SASS
|
|
273
|
+
profiling engine was active, kernel durations include instrumentation
|
|
274
|
+
overhead and the report labels them accordingly.
|
|
275
|
+
- **Top kernels by total GPU time** — with per-kernel call counts.
|
|
276
|
+
- **Per-kernel details** — grid/block dimensions, occupancy, registers,
|
|
277
|
+
shared memory (static + dynamic), register spills, and Waves/SM.
|
|
278
|
+
|
|
279
|
+
### From C++
|
|
280
|
+
|
|
281
|
+
Call `generateReport()` after `shutdown()` — it reads the NDJSON logs written
|
|
282
|
+
during the session:
|
|
283
|
+
|
|
284
|
+
```cpp
|
|
285
|
+
gpufl::init(opts);
|
|
286
|
+
// ... your CUDA / HIP work ...
|
|
287
|
+
gpufl::shutdown();
|
|
288
|
+
|
|
289
|
+
gpufl::generateReport(); // print to stdout
|
|
290
|
+
gpufl::generateReport("report.txt"); // or save to a file
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### From Python
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
from gpufl.report import generate_report
|
|
297
|
+
|
|
298
|
+
# Print the report — wrap in print() so newlines render. In a Jupyter
|
|
299
|
+
# notebook this also keeps the table columns aligned (stdout renders in
|
|
300
|
+
# a monospace font). A bare `generate_report(...)` as a cell's last
|
|
301
|
+
# expression shows an escaped one-line string, so always print() it.
|
|
302
|
+
text = generate_report("./logs", log_prefix="my_app", top_n=10)
|
|
303
|
+
print(text)
|
|
304
|
+
|
|
305
|
+
# Or save it straight to a file
|
|
306
|
+
generate_report("./logs", log_prefix="my_app", top_n=10, output_path="report.txt")
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
The Python version reads the same NDJSON logs the analyzer uses — no GPU
|
|
310
|
+
required, so you can generate reports from logs copied off another machine.
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
263
314
|
## Testing
|
|
264
315
|
|
|
265
316
|
### C++ Tests
|
|
@@ -300,5 +351,13 @@ To allow non-root users to profile GPU kernels (using CUPTI/PC Sampling) on Linu
|
|
|
300
351
|
|
|
301
352
|
---
|
|
302
353
|
|
|
303
|
-
|
|
304
|
-
|
|
354
|
+
## Where your logs go
|
|
355
|
+
|
|
356
|
+
By default the client writes NDJSON logs to disk. To stream them to a hosted
|
|
357
|
+
dashboard, set `backend_url` + `api_key` (or the `GPUFL_BACKEND_URL` /
|
|
358
|
+
`GPUFL_API_KEY` env vars) and they're delivered live to
|
|
359
|
+
[app.gpuflight.com](https://app.gpuflight.com). Create a workspace at
|
|
360
|
+
[gpuflight.com](https://gpuflight.com).
|
|
361
|
+
|
|
362
|
+
This client (gpufl-client) is open source. The ingestion service and
|
|
363
|
+
the dashboard UI are proprietary and managed-only today.
|
|
@@ -79,7 +79,7 @@ docker build \
|
|
|
79
79
|
Copy `.env.example` to `.env` and set the required variables, then:
|
|
80
80
|
|
|
81
81
|
```bash
|
|
82
|
-
|
|
82
|
+
GPUFL_HTTP_HOST=https://your-backend \
|
|
83
83
|
GPUFL_HTTP_TOKEN=gfl_your_token_here \
|
|
84
84
|
docker compose -f docker-compose.monitor.yml up -d
|
|
85
85
|
```
|
|
@@ -99,7 +99,7 @@ docker compose -f docker-compose.monitor.yml down
|
|
|
99
99
|
### AMD
|
|
100
100
|
|
|
101
101
|
```bash
|
|
102
|
-
|
|
102
|
+
GPUFL_HTTP_HOST=https://your-backend \
|
|
103
103
|
GPUFL_HTTP_TOKEN=gfl_your_token_here \
|
|
104
104
|
docker compose -f docker-compose.monitor.amd.yml up -d
|
|
105
105
|
```
|
|
@@ -127,7 +127,7 @@ docker run -d \
|
|
|
127
127
|
--name gpufl-monitor \
|
|
128
128
|
--gpus all \
|
|
129
129
|
--restart unless-stopped \
|
|
130
|
-
-e
|
|
130
|
+
-e GPUFL_HTTP_HOST=https://your-backend \
|
|
131
131
|
-e GPUFL_HTTP_TOKEN=gfl_your_token_here \
|
|
132
132
|
-v gpufl-cursor:/var/gpufl/monitor \
|
|
133
133
|
gpufl/monitor:latest
|
|
@@ -141,7 +141,7 @@ docker run -d \
|
|
|
141
141
|
--device /dev/kfd --device /dev/dri \
|
|
142
142
|
--group-add video --group-add render \
|
|
143
143
|
--restart unless-stopped \
|
|
144
|
-
-e
|
|
144
|
+
-e GPUFL_HTTP_HOST=https://your-backend \
|
|
145
145
|
-e GPUFL_HTTP_TOKEN=gfl_your_token_here \
|
|
146
146
|
-v gpufl-cursor-amd:/var/gpufl/monitor \
|
|
147
147
|
gpufl/monitor-amd:latest
|
|
@@ -175,7 +175,8 @@ The named volume persists the agent's read cursor so it resumes from where it le
|
|
|
175
175
|
| Variable | Default | Description |
|
|
176
176
|
|---|---|---|
|
|
177
177
|
| `GPUFL_PUBLISHER_TYPE` | `http` | Publisher backend: `http` or `kafka` |
|
|
178
|
-
| `
|
|
178
|
+
| `GPUFL_HTTP_HOST` | *(required)* | Backend scheme+host, e.g. `https://api.gpuflight.com`. The agent appends `/api/{version}/events/<type>` automatically. |
|
|
179
|
+
| `GPUFL_HTTP_API_VERSION` | `v1` | Backend API version. Change it when the backend moves to a newer API version (e.g. `v2`). |
|
|
179
180
|
| `GPUFL_HTTP_TOKEN` | *(empty)* | Bearer token for the backend API |
|
|
180
181
|
| `GPUFL_HTTP_TIMEOUT_SEC` | `10` | HTTP request timeout in seconds |
|
|
181
182
|
|
|
@@ -22,9 +22,13 @@ services:
|
|
|
22
22
|
GPUFL_LOG_TYPES: ${GPUFL_LOG_TYPES:-device,scope,system}
|
|
23
23
|
GPUFL_CURSOR_FILE: ${GPUFL_CURSOR_FILE:-/var/gpufl/monitor/cursor.json}
|
|
24
24
|
|
|
25
|
-
# Java agent — publisher (HTTP)
|
|
25
|
+
# Java agent — publisher (HTTP). Set GPUFL_HTTP_HOST to just the
|
|
26
|
+
# scheme+host (e.g. https://api.gpuflight.com); the agent builds
|
|
27
|
+
# the /api/{version}/events/<type> path itself. Override the
|
|
28
|
+
# version via GPUFL_HTTP_API_VERSION when the backend bumps to v2.
|
|
26
29
|
GPUFL_PUBLISHER_TYPE: ${GPUFL_PUBLISHER_TYPE:-http}
|
|
27
|
-
|
|
30
|
+
GPUFL_HTTP_HOST: ${GPUFL_HTTP_HOST}
|
|
31
|
+
GPUFL_HTTP_API_VERSION: ${GPUFL_HTTP_API_VERSION:-v1}
|
|
28
32
|
GPUFL_HTTP_TOKEN: ${GPUFL_HTTP_TOKEN:-}
|
|
29
33
|
GPUFL_HTTP_TIMEOUT_SEC: ${GPUFL_HTTP_TIMEOUT_SEC:-10}
|
|
30
34
|
|
|
@@ -39,9 +39,13 @@ services:
|
|
|
39
39
|
GPUFL_SOURCE_FOLDERS: ${GPUFL_SOURCE_FOLDERS:-/var/gpufl/monitor,/var/gpufl/demo}
|
|
40
40
|
GPUFL_CURSOR_FILE: ${GPUFL_CURSOR_FILE:-/var/gpufl/monitor/cursor.json}
|
|
41
41
|
|
|
42
|
-
# Java agent — publisher (HTTP)
|
|
42
|
+
# Java agent — publisher (HTTP). Set GPUFL_HTTP_HOST to just the
|
|
43
|
+
# scheme+host (e.g. https://api.gpuflight.com); the agent builds
|
|
44
|
+
# the /api/{version}/events/<type> path itself. Override the
|
|
45
|
+
# version via GPUFL_HTTP_API_VERSION when the backend bumps to v2.
|
|
43
46
|
GPUFL_PUBLISHER_TYPE: ${GPUFL_PUBLISHER_TYPE:-http}
|
|
44
|
-
|
|
47
|
+
GPUFL_HTTP_HOST: ${GPUFL_HTTP_HOST}
|
|
48
|
+
GPUFL_HTTP_API_VERSION: ${GPUFL_HTTP_API_VERSION:-v1}
|
|
45
49
|
GPUFL_HTTP_TOKEN: ${GPUFL_HTTP_TOKEN:-}
|
|
46
50
|
GPUFL_HTTP_TIMEOUT_SEC: ${GPUFL_HTTP_TIMEOUT_SEC:-10}
|
|
47
51
|
|
|
@@ -130,7 +130,6 @@ int main() {
|
|
|
130
130
|
opts.system_sample_rate_ms = 50;
|
|
131
131
|
opts.kernel_sample_rate_ms = 0;
|
|
132
132
|
opts.sampling_auto_start = true;
|
|
133
|
-
opts.enable_kernel_details = true;
|
|
134
133
|
opts.enable_debug_output = true;
|
|
135
134
|
opts.enable_stack_trace = false;
|
|
136
135
|
opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
|
|
@@ -38,7 +38,6 @@ int main() {
|
|
|
38
38
|
opts.log_path = "gfl_block";
|
|
39
39
|
opts.system_sample_rate_ms = 50;
|
|
40
40
|
opts.kernel_sample_rate_ms = 50;
|
|
41
|
-
opts.enable_kernel_details = true;
|
|
42
41
|
opts.sampling_auto_start = true;
|
|
43
42
|
opts.enable_debug_output = true;
|
|
44
43
|
opts.enable_source_collection = true;
|
|
@@ -61,7 +61,6 @@ int main() {
|
|
|
61
61
|
opts.log_path = "memory_coalescing_demo";
|
|
62
62
|
opts.system_sample_rate_ms = 10;
|
|
63
63
|
opts.kernel_sample_rate_ms = 10;
|
|
64
|
-
opts.enable_kernel_details = true;
|
|
65
64
|
opts.sampling_auto_start = true;
|
|
66
65
|
opts.enable_debug_output = true;
|
|
67
66
|
opts.profiling_engine = gpufl::ProfilingEngine::PcSamplingWithSass;
|
|
@@ -79,7 +79,6 @@ int main()
|
|
|
79
79
|
gpufl::InitOptions opts;
|
|
80
80
|
opts.app_name = "occupancy_demo";
|
|
81
81
|
opts.log_path = "occupancy_demo.log";
|
|
82
|
-
opts.enable_kernel_details = true; // required for occupancy breakdown fields
|
|
83
82
|
opts.sampling_auto_start = true;
|
|
84
83
|
opts.enable_debug_output = false;
|
|
85
84
|
|
|
@@ -144,15 +144,12 @@ int main() {
|
|
|
144
144
|
gpufl::InitOptions opts;
|
|
145
145
|
opts.app_name = "sass_divergence_demo";
|
|
146
146
|
opts.log_path = "sass_divergence";
|
|
147
|
-
|
|
148
|
-
opts.backend_url = "http://localhost:8080";
|
|
149
|
-
opts.remote_upload = true;
|
|
147
|
+
opts.remote_upload = false;
|
|
150
148
|
opts.system_sample_rate_ms = 10;
|
|
151
|
-
opts.enable_kernel_details = true;
|
|
152
149
|
opts.enable_debug_output = true;
|
|
153
150
|
opts.sampling_auto_start = true;
|
|
154
151
|
opts.enable_stack_trace = true;
|
|
155
|
-
opts.profiling_engine = gpufl::ProfilingEngine::
|
|
152
|
+
opts.profiling_engine = gpufl::ProfilingEngine::PcSamplingWithSass;
|
|
156
153
|
|
|
157
154
|
if (!gpufl::init(opts)) {
|
|
158
155
|
std::cerr << "Failed to initialize gpufl" << std::endl;
|
|
@@ -27,7 +27,6 @@ int main() {
|
|
|
27
27
|
opts.log_path = "vector_add_benchmark";
|
|
28
28
|
opts.system_sample_rate_ms = 50;
|
|
29
29
|
opts.kernel_sample_rate_ms = 50;
|
|
30
|
-
opts.enable_kernel_details = true;
|
|
31
30
|
opts.sampling_auto_start = true;
|
|
32
31
|
opts.enable_debug_output = true;
|
|
33
32
|
opts.enable_source_collection = true;
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import gpufl as gfl
|
|
2
|
+
from gpufl.report import generate_report
|
|
2
3
|
import numpy as np
|
|
3
4
|
from numba import cuda
|
|
4
5
|
import math
|
|
6
|
+
import os
|
|
5
7
|
import time
|
|
6
8
|
|
|
7
9
|
# --- 1. Define a Real CUDA Kernel (Matrix Mul) ---
|
|
@@ -20,9 +22,32 @@ def matmul_kernel(A, B, C):
|
|
|
20
22
|
|
|
21
23
|
def run_benchmark():
|
|
22
24
|
# --- 2. Initialize GPUFL ---
|
|
23
|
-
#
|
|
25
|
+
# LOG_PATH is the file prefix the FileLogSink writes to — it produces
|
|
26
|
+
# <LOG_PATH>.device.log / .scope.log / .system.log. We reuse it below
|
|
27
|
+
# to point generate_report() at the same files.
|
|
28
|
+
LOG_PATH = "./gfl_logs"
|
|
29
|
+
|
|
30
|
+
BACKEND_URL = os.environ.get("GPUFL_BACKEND_URL", "api.gpuflight.com")
|
|
31
|
+
API_KEY = os.environ.get("GPUFL_API_KEY", "")
|
|
32
|
+
REMOTE_UPLOAD = bool(API_KEY)
|
|
33
|
+
|
|
24
34
|
print("[GPUFL] Initializing...")
|
|
25
|
-
|
|
35
|
+
if REMOTE_UPLOAD:
|
|
36
|
+
print(f"[GPUFL] Live upload ON -> {BACKEND_URL}")
|
|
37
|
+
else:
|
|
38
|
+
print("[GPUFL] Live upload OFF (set GPUFL_API_KEY to enable). Local files only.")
|
|
39
|
+
|
|
40
|
+
gfl.init(
|
|
41
|
+
app_name="Numba_App",
|
|
42
|
+
log_path=LOG_PATH,
|
|
43
|
+
sampling_auto_start=True,
|
|
44
|
+
system_sample_rate_ms=100,
|
|
45
|
+
enable_debug_output=True,
|
|
46
|
+
profiling_engine=gfl.ProfilingEngine.PcSamplingWithSass,
|
|
47
|
+
backend_url=BACKEND_URL,
|
|
48
|
+
api_key=API_KEY,
|
|
49
|
+
remote_upload=REMOTE_UPLOAD,
|
|
50
|
+
)
|
|
26
51
|
|
|
27
52
|
try:
|
|
28
53
|
# --- 3. Setup Data (Heavy Load) ---
|
|
@@ -70,6 +95,21 @@ def run_benchmark():
|
|
|
70
95
|
print("[GPUFL] Shutting down...")
|
|
71
96
|
gfl.shutdown()
|
|
72
97
|
|
|
98
|
+
# --- 6. Generate a text report from the logs we just wrote ---
|
|
99
|
+
# shutdown() above flushes and closes the NDJSON channels, so the
|
|
100
|
+
# report reflects the full session. generate_report reads the same
|
|
101
|
+
# logs the analyzer uses — no GPU required for this step. We split
|
|
102
|
+
# LOG_PATH into (dir, prefix) the way GpuFlightSession expects:
|
|
103
|
+
# "./gfl_logs" -> dir=".", prefix="gfl_logs"
|
|
104
|
+
# -> reads ./gfl_logs.{device,scope,system}.log
|
|
105
|
+
# Wrap in print() so the report renders with real newlines (and,
|
|
106
|
+
# in a Jupyter notebook, in the monospace stdout area so the
|
|
107
|
+
# kernel tables stay aligned).
|
|
108
|
+
log_dir = os.path.dirname(LOG_PATH) or "."
|
|
109
|
+
log_prefix = os.path.basename(LOG_PATH)
|
|
110
|
+
print("\n[GPUFL] Session report:\n")
|
|
111
|
+
print(generate_report(log_dir, log_prefix=log_prefix, top_n=10))
|
|
112
|
+
|
|
73
113
|
if __name__ == "__main__":
|
|
74
114
|
if cuda.is_available():
|
|
75
115
|
run_benchmark()
|
|
@@ -31,9 +31,7 @@ def run_stress_test():
|
|
|
31
31
|
sampling_auto_start=True,
|
|
32
32
|
system_sample_rate_ms=50,
|
|
33
33
|
kernel_sample_rate_ms=50,
|
|
34
|
-
|
|
35
|
-
enable_debug_output=True,
|
|
36
|
-
enable_profiling=True,
|
|
34
|
+
enable_debug_output=False,
|
|
37
35
|
enable_stack_trace=True,
|
|
38
36
|
# opt-in to memory tracking. Default-off in v1
|
|
39
37
|
# because TF eager and similar workloads can produce
|
|
@@ -49,7 +47,7 @@ def run_stress_test():
|
|
|
49
47
|
remote_upload=remote_upload,
|
|
50
48
|
api_key=api_key,
|
|
51
49
|
backend_url=backend_url,
|
|
52
|
-
profiling_engine=gpufl.ProfilingEngine.
|
|
50
|
+
profiling_engine=gpufl.ProfilingEngine.PcSampling)
|
|
53
51
|
|
|
54
52
|
try:
|
|
55
53
|
# 2. Allocate (Uses approx 3GB VRAM)
|
|
Binary file
|