gpufl 1.0.2__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275) hide show
  1. {gpufl-1.0.2 → gpufl-1.1.0}/.dockerignore +4 -4
  2. {gpufl-1.0.2 → gpufl-1.1.0}/.github/workflows/build.yml +20 -0
  3. {gpufl-1.0.2 → gpufl-1.1.0}/.github/workflows/release.yml +1 -1
  4. {gpufl-1.0.2 → gpufl-1.1.0}/.gitignore +28 -1
  5. gpufl-1.1.0/CHANGELOG.md +214 -0
  6. {gpufl-1.0.2 → gpufl-1.1.0}/CMakeLists.txt +52 -30
  7. {gpufl-1.0.2 → gpufl-1.1.0}/Dockerfile.monitor.amd +3 -11
  8. {gpufl-1.0.2 → gpufl-1.1.0}/PKG-INFO +191 -72
  9. gpufl-1.1.0/README.md +482 -0
  10. {gpufl-1.0.2 → gpufl-1.1.0}/benchmark/README.md +3 -3
  11. gpufl-1.1.0/benchmark/profile_pytorch_kernels.py +193 -0
  12. gpufl-1.1.0/benchmark/profile_pytorch_via_gpufl.py +304 -0
  13. {gpufl-1.0.2 → gpufl-1.1.0}/benchmark/run_benchmark.py +5 -8
  14. gpufl-1.1.0/build-ubuntu.sh +104 -0
  15. gpufl-1.1.0/build-windows.ps1 +105 -0
  16. gpufl-1.1.0/build.sh +23 -0
  17. {gpufl-1.0.2 → gpufl-1.1.0}/daemon/monitor/CMakeLists.txt +7 -1
  18. {gpufl-1.0.2 → gpufl-1.1.0}/daemon/monitor/main.cpp +6 -2
  19. {gpufl-1.0.2 → gpufl-1.1.0}/docker-compose.monitor.amd.yml +2 -2
  20. {gpufl-1.0.2 → gpufl-1.1.0}/docker-compose.monitor.yml +2 -1
  21. gpufl-1.1.0/docs/compatibility/nvidia-deep-mode-matrix.md +316 -0
  22. {gpufl-1.0.2 → gpufl-1.1.0}/example/amd/gpufl_scope_demo.cpp +1 -1
  23. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/CMakeLists.txt +30 -1
  24. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/block_style_example.cu +2 -2
  25. gpufl-1.1.0/example/cuda/deep_deadlock_repro.cu +341 -0
  26. gpufl-1.1.0/example/cuda/manykernel_benchmark.cu +549 -0
  27. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/memory_coalescing_demo.cu +2 -2
  28. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/occupancy_demo.cu +1 -1
  29. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/sass_divergence_demo.cu +37 -9
  30. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/system_monitor.cu +1 -1
  31. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/vector_add_benchmark.cu +2 -2
  32. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/02_numba_cuda.py +31 -8
  33. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/03_pytorch_benchmark.py +27 -10
  34. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/rocm_collector.cpp +2 -1
  35. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/rocprofiler_backend.cpp +8 -2
  36. gpufl-1.1.0/include/gpufl/backends/nvidia/cuda_cleanup_handler.cpp +93 -0
  37. gpufl-1.1.0/include/gpufl/backends/nvidia/cuda_cleanup_handler.hpp +27 -0
  38. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cupti_backend.cpp +546 -56
  39. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cupti_backend.hpp +54 -10
  40. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cupti_common.hpp +43 -10
  41. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cupti_utils.cpp +10 -0
  42. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cupti_utils.hpp +6 -0
  43. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp +80 -137
  44. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.hpp +6 -0
  45. gpufl-1.1.0/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +204 -0
  46. gpufl-1.1.0/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +97 -0
  47. gpufl-1.1.0/include/gpufl/backends/nvidia/engine/pm_sampling_engine.cpp +417 -0
  48. gpufl-1.1.0/include/gpufl/backends/nvidia/engine/pm_sampling_engine.hpp +80 -0
  49. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/engine/profiling_engine.hpp +15 -0
  50. gpufl-1.1.0/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +750 -0
  51. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.hpp +31 -4
  52. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/kernel_launch_handler.cpp +148 -21
  53. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/mem_transfer_handler.cpp +9 -0
  54. gpufl-1.1.0/include/gpufl/backends/nvidia/profiling_plan.hpp +160 -0
  55. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/resource_handler.cpp +3 -3
  56. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/synchronization_handler.cpp +8 -0
  57. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/config_file_loader.cpp +18 -6
  58. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/debug_logger.hpp +12 -10
  59. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/dictionary_manager.cpp +94 -24
  60. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/events.hpp +48 -3
  61. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/gpufl.cpp +288 -123
  62. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/host_info.hpp +3 -3
  63. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/json/json.cpp +33 -3
  64. gpufl-1.1.0/include/gpufl/core/logger/file_compressor.cpp +55 -0
  65. gpufl-1.1.0/include/gpufl/core/logger/file_log_sink.cpp +231 -0
  66. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/logger/file_log_sink.hpp +15 -4
  67. gpufl-1.1.0/include/gpufl/core/logger/log_rotator.cpp +105 -0
  68. gpufl-1.1.0/include/gpufl/core/logger/log_rotator.hpp +65 -0
  69. gpufl-1.1.0/include/gpufl/core/logger/logger.cpp +60 -0
  70. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/logger/logger.hpp +19 -3
  71. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/batch_models.cpp +53 -3
  72. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/batch_models.hpp +27 -0
  73. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/lifecycle_model.cpp +22 -0
  74. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/lifecycle_model.hpp +8 -0
  75. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/monitor.cpp +116 -10
  76. gpufl-1.1.0/include/gpufl/core/monitor.hpp +325 -0
  77. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/monitor_adapter.cpp +11 -0
  78. gpufl-1.1.0/include/gpufl/core/remote_config.cpp +118 -0
  79. gpufl-1.1.0/include/gpufl/core/remote_config.hpp +39 -0
  80. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/sampler.cpp +69 -29
  81. gpufl-1.1.0/include/gpufl/core/sampler.hpp +117 -0
  82. gpufl-1.1.0/include/gpufl/gpufl.hpp +474 -0
  83. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/report/text_report.cpp +174 -8
  84. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/report/text_report.hpp +23 -0
  85. gpufl-1.1.0/include/gpufl/upload/upload_logs.cpp +1503 -0
  86. gpufl-1.1.0/include/gpufl/upload/upload_logs.hpp +243 -0
  87. {gpufl-1.0.2 → gpufl-1.1.0}/pyproject.toml +9 -2
  88. gpufl-1.1.0/python/bindings.cpp +351 -0
  89. gpufl-1.1.0/python/gpufl/__init__.py +1079 -0
  90. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/analyzer/analyzer.py +215 -26
  91. gpufl-1.1.0/python/gpufl/cli.py +178 -0
  92. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/report/text_report.py +213 -33
  93. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/viz/reader.py +36 -2
  94. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/viz/visualizer.py +4 -4
  95. gpufl-1.1.0/run_tests.py +309 -0
  96. {gpufl-1.0.2 → gpufl-1.1.0}/tests/CMakeLists.txt +56 -14
  97. {gpufl-1.0.2 → gpufl-1.1.0}/tests/backends/nvidia/test_engine_coverage.cpp +35 -13
  98. {gpufl-1.0.2 → gpufl-1.1.0}/tests/common/log_utils.cpp +76 -19
  99. {gpufl-1.0.2 → gpufl-1.1.0}/tests/core/test_analyzer.cpp +2 -2
  100. gpufl-1.1.0/tests/core/test_api_path_routing.cpp +50 -0
  101. gpufl-1.1.0/tests/core/test_bench_invoker.cpp +139 -0
  102. gpufl-1.1.0/tests/core/test_disabled.cpp +171 -0
  103. gpufl-1.1.0/tests/core/test_sampler.cpp +195 -0
  104. {gpufl-1.0.2 → gpufl-1.1.0}/tests/core/test_wire_contract.cpp +45 -2
  105. {gpufl-1.0.2 → gpufl-1.1.0}/tests/python/test_bindings.py +13 -6
  106. gpufl-1.1.0/tests/python/test_continuous_system_sampling.py +192 -0
  107. gpufl-1.1.0/tests/python/test_disabled.py +178 -0
  108. gpufl-1.1.0/tests/python/test_scope_iterable.py +308 -0
  109. {gpufl-1.0.2 → gpufl-1.1.0}/tests/run_engine_coverage.ps1 +3 -3
  110. {gpufl-1.0.2 → gpufl-1.1.0}/tests/run_engine_coverage.sh +3 -3
  111. gpufl-1.1.0/tests/upload/test_upload_logs.cpp +1029 -0
  112. gpufl-1.1.0/tests/verify_pipeline.py +114 -0
  113. gpufl-1.0.2/README.md +0 -363
  114. gpufl-1.0.2/build.sh +0 -1
  115. gpufl-1.0.2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +0 -70
  116. gpufl-1.0.2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +0 -65
  117. gpufl-1.0.2/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +0 -421
  118. gpufl-1.0.2/include/gpufl/core/logger/file_compressor.cpp +0 -44
  119. gpufl-1.0.2/include/gpufl/core/logger/file_log_sink.cpp +0 -151
  120. gpufl-1.0.2/include/gpufl/core/logger/http_log_sink.cpp +0 -476
  121. gpufl-1.0.2/include/gpufl/core/logger/http_log_sink.hpp +0 -206
  122. gpufl-1.0.2/include/gpufl/core/logger/log_rotator.cpp +0 -65
  123. gpufl-1.0.2/include/gpufl/core/logger/log_rotator.hpp +0 -32
  124. gpufl-1.0.2/include/gpufl/core/logger/logger.cpp +0 -47
  125. gpufl-1.0.2/include/gpufl/core/monitor.hpp +0 -203
  126. gpufl-1.0.2/include/gpufl/core/remote_config.cpp +0 -276
  127. gpufl-1.0.2/include/gpufl/core/remote_config.hpp +0 -60
  128. gpufl-1.0.2/include/gpufl/core/sampler.hpp +0 -63
  129. gpufl-1.0.2/include/gpufl/gpufl.hpp +0 -244
  130. gpufl-1.0.2/python/bindings.cpp +0 -189
  131. gpufl-1.0.2/python/gpufl/__init__.py +0 -211
  132. gpufl-1.0.2/tests/core/test_api_path_routing.cpp +0 -213
  133. gpufl-1.0.2/tests/core/test_http_log_sink.cpp +0 -409
  134. gpufl-1.0.2/tests/python/test_remote_upload_smoke.py +0 -185
  135. gpufl-1.0.2/tests/verify_pipeline.py +0 -102
  136. {gpufl-1.0.2 → gpufl-1.1.0}/.clang-format +0 -0
  137. {gpufl-1.0.2 → gpufl-1.1.0}/.github/pull_request_template.md +0 -0
  138. {gpufl-1.0.2 → gpufl-1.1.0}/CONTRIBUTING.md +0 -0
  139. {gpufl-1.0.2 → gpufl-1.1.0}/Dockerfile.demo +0 -0
  140. {gpufl-1.0.2 → gpufl-1.1.0}/Dockerfile.monitor +0 -0
  141. {gpufl-1.0.2 → gpufl-1.1.0}/Dockerfile.monitor.supervisord.conf +0 -0
  142. {gpufl-1.0.2 → gpufl-1.1.0}/LICENSE +0 -0
  143. {gpufl-1.0.2 → gpufl-1.1.0}/THIRD-PARTY-NOTICES.txt +0 -0
  144. {gpufl-1.0.2 → gpufl-1.1.0}/benchmark/cuda_gemm.py +0 -0
  145. {gpufl-1.0.2 → gpufl-1.1.0}/benchmark/pytorch_train.py +0 -0
  146. {gpufl-1.0.2 → gpufl-1.1.0}/daemon/README.md +0 -0
  147. {gpufl-1.0.2 → gpufl-1.1.0}/example/amd/CMakeLists.txt +0 -0
  148. {gpufl-1.0.2 → gpufl-1.1.0}/example/amd/README.md +0 -0
  149. {gpufl-1.0.2 → gpufl-1.1.0}/example/amd/check_device.cpp +0 -0
  150. {gpufl-1.0.2 → gpufl-1.1.0}/example/amd/vector_add_benchmark.cpp +0 -0
  151. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/check_conflict.cu +0 -0
  152. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/check_device.cu +0 -0
  153. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/cupti_basic.cu +0 -0
  154. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/cupti_pc_sampling.cu +0 -0
  155. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/list_sass_metrics.cu +0 -0
  156. {gpufl-1.0.2 → gpufl-1.1.0}/example/cuda/test_occupancy.cu +0 -0
  157. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/01_basic.py +0 -0
  158. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/analyzer/01_analyzer_sample.py +0 -0
  159. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/requirements.txt +0 -0
  160. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/viz/01_plot_memory_timeline.py +0 -0
  161. {gpufl-1.0.2 → gpufl-1.1.0}/example/python/viz/02_plot_stress_timeline.py +0 -0
  162. {gpufl-1.0.2 → gpufl-1.1.0}/images/Screenshot1.png +0 -0
  163. {gpufl-1.0.2 → gpufl-1.1.0}/images/Screenshot2.png +0 -0
  164. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/engine/amd_profiling_engine.hpp +0 -0
  165. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp +0 -0
  166. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/engine/dispatch_counter_engine.hpp +0 -0
  167. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/hip_static_collector.cpp +0 -0
  168. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/hip_static_collector.hpp +0 -0
  169. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/monitor_adapter_amd.cpp +0 -0
  170. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/monitor_adapter_amd.hpp +0 -0
  171. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/rocm_collector.hpp +0 -0
  172. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/amd/rocprofiler_backend.hpp +0 -0
  173. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/host_collector.hpp +0 -0
  174. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cuda_collector.cpp +0 -0
  175. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cuda_collector.hpp +0 -0
  176. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/cuda_feature_guards.hpp +0 -0
  177. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp +0 -0
  178. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/engine/range_profiler_engine.hpp +0 -0
  179. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/kernel_launch_handler.hpp +0 -0
  180. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/mem_transfer_handler.hpp +0 -0
  181. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.cpp +0 -0
  182. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.hpp +0 -0
  183. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/nvml_collector.cpp +0 -0
  184. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/nvml_collector.hpp +0 -0
  185. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/resource_handler.hpp +0 -0
  186. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/sampler/cupti_sass.cpp +0 -0
  187. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/sampler/cupti_sass.hpp +0 -0
  188. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/backends/nvidia/synchronization_handler.hpp +0 -0
  189. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/activity_record.hpp +0 -0
  190. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/backend_factory.cpp +0 -0
  191. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/backend_factory.hpp +0 -0
  192. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/backend_interfaces.hpp +0 -0
  193. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/batch_buffer.hpp +0 -0
  194. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/common.cpp +0 -0
  195. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/common.hpp +0 -0
  196. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/config_file_loader.hpp +0 -0
  197. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/debug_logger.cpp +0 -0
  198. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/dictionary_manager.hpp +0 -0
  199. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/host_info.cpp +0 -0
  200. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/itanium_demangle.cpp +0 -0
  201. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/itanium_demangle.hpp +0 -0
  202. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/json/json.hpp +0 -0
  203. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/logger/file_compressor.hpp +0 -0
  204. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/logger/log_sink.hpp +0 -0
  205. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/graph_launch_event_model.cpp +0 -0
  206. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/graph_launch_event_model.hpp +0 -0
  207. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/kernel_event_model.cpp +0 -0
  208. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/kernel_event_model.hpp +0 -0
  209. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/memcpy_event_model.cpp +0 -0
  210. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/memcpy_event_model.hpp +0 -0
  211. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/memory_alloc_event_model.cpp +0 -0
  212. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/memory_alloc_event_model.hpp +0 -0
  213. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/model_utils.hpp +0 -0
  214. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/nvtx_marker_model.cpp +0 -0
  215. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/nvtx_marker_model.hpp +0 -0
  216. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/perf_metric_model.cpp +0 -0
  217. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/perf_metric_model.hpp +0 -0
  218. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/profile_sample_model.cpp +0 -0
  219. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/profile_sample_model.hpp +0 -0
  220. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/scope_event_model.cpp +0 -0
  221. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/scope_event_model.hpp +0 -0
  222. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/serializable.hpp +0 -0
  223. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/synchronization_event_model.cpp +0 -0
  224. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/synchronization_event_model.hpp +0 -0
  225. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/system_event_model.cpp +0 -0
  226. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/model/system_event_model.hpp +0 -0
  227. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/monitor_adapter.hpp +0 -0
  228. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/monitor_backend.hpp +0 -0
  229. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/ring_buffer.hpp +0 -0
  230. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/runtime.cpp +0 -0
  231. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/runtime.hpp +0 -0
  232. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/sass_compressor.cpp +0 -0
  233. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/sass_compressor.hpp +0 -0
  234. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/scope_registry.cpp +0 -0
  235. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/scope_registry.hpp +0 -0
  236. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/stack_registry.hpp +0 -0
  237. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/stack_trace.cpp +0 -0
  238. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/stack_trace.hpp +0 -0
  239. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/stream_handle.hpp +0 -0
  240. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/trace_type.hpp +0 -0
  241. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/core/version.hpp +0 -0
  242. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/report/hint_engine.cpp +0 -0
  243. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl/report/hint_engine.hpp +0 -0
  244. {gpufl-1.0.2 → gpufl-1.1.0}/include/gpufl.hpp +0 -0
  245. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/.gitignore +0 -0
  246. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/analyzer/__init__.py +0 -0
  247. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/cupy/__init__.py +0 -0
  248. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/jax/__init__.py +0 -0
  249. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/numba/__init__.py +0 -0
  250. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/report/__init__.py +0 -0
  251. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/torch/__init__.py +0 -0
  252. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/torch/dispatch.py +0 -0
  253. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/torch/profile.py +0 -0
  254. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/torch/stack.py +0 -0
  255. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/torch/trace_import.py +0 -0
  256. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/triton/__init__.py +0 -0
  257. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/utils.py +0 -0
  258. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/viz/__init__.py +0 -0
  259. {gpufl-1.0.2 → gpufl-1.1.0}/python/gpufl/viz/timeline.py +0 -0
  260. {gpufl-1.0.2 → gpufl-1.1.0}/scripts/docker-demo-loop.sh +0 -0
  261. {gpufl-1.0.2 → gpufl-1.1.0}/scripts/windows/run-monitor-local.bat +0 -0
  262. {gpufl-1.0.2 → gpufl-1.1.0}/tests/backends/amd/test_rocm_collector.cpp +0 -0
  263. {gpufl-1.0.2 → gpufl-1.1.0}/tests/backends/nvidia/test_cuda_collector.cpp +0 -0
  264. {gpufl-1.0.2 → gpufl-1.1.0}/tests/backends/nvidia/test_nvidia_backend.cpp +0 -0
  265. {gpufl-1.0.2 → gpufl-1.1.0}/tests/backends/nvidia/test_nvml_collector.cpp +0 -0
  266. {gpufl-1.0.2 → gpufl-1.1.0}/tests/common/log_utils.hpp +0 -0
  267. {gpufl-1.0.2 → gpufl-1.1.0}/tests/common/test_kernel.cu +0 -0
  268. {gpufl-1.0.2 → gpufl-1.1.0}/tests/common/test_kernel.hpp +0 -0
  269. {gpufl-1.0.2 → gpufl-1.1.0}/tests/common/test_utils.hpp +0 -0
  270. {gpufl-1.0.2 → gpufl-1.1.0}/tests/core/test_batch_models.cpp +0 -0
  271. {gpufl-1.0.2 → gpufl-1.1.0}/tests/core/test_itanium_demangle.cpp +0 -0
  272. {gpufl-1.0.2 → gpufl-1.1.0}/tests/core/test_monitor.cpp +0 -0
  273. {gpufl-1.0.2 → gpufl-1.1.0}/tests/main_test_runner.cpp +0 -0
  274. {gpufl-1.0.2 → gpufl-1.1.0}/tests/python/conftest.py +0 -0
  275. {gpufl-1.0.2 → gpufl-1.1.0}/tests/python/test_analyzer.py +0 -0
@@ -1,6 +1,6 @@
1
- # Python / notebooks — not needed for the C++ daemon build
2
- python/
3
- example/python/
1
+ # Python / notebooks — uncomment if building Python wheels inside Docker
2
+ # python/
3
+ # example/python/
4
4
  **/.Trash-*
5
5
  **/__pycache__/
6
6
  **/*.pyc
@@ -15,4 +15,4 @@ build/
15
15
  .git/
16
16
  .idea/
17
17
  .vscode/
18
- *.md
18
+ # *.md
@@ -3,8 +3,28 @@ name: Build GPUFl Client
3
3
  on:
4
4
  push:
5
5
  branches: [ "main" ]
6
+ paths-ignore:
7
+ - '**.md'
8
+ - 'docs/**'
9
+ - 'Dockerfile*'
10
+ - '**/Dockerfile'
11
+ - '**/Dockerfile.*'
12
+ - 'LICENSE'
13
+ - 'THIRD-PARTY-NOTICES.txt'
14
+ - '.gitignore'
15
+ - 'images/**'
6
16
  pull_request:
7
17
  branches: [ "main" ]
18
+ paths-ignore:
19
+ - '**.md'
20
+ - 'docs/**'
21
+ - 'Dockerfile*'
22
+ - '**/Dockerfile'
23
+ - '**/Dockerfile.*'
24
+ - 'LICENSE'
25
+ - 'THIRD-PARTY-NOTICES.txt'
26
+ - '.gitignore'
27
+ - 'images/**'
8
28
 
9
29
  jobs:
10
30
  build:
@@ -148,7 +148,7 @@ jobs:
148
148
  # quay.io/pypa/manylinux_2_28_x86_64 image (OpenSSL 3.5.5).
149
149
  CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF -DOPENSSL_INCLUDE_DIR=/usr/include/openssl3 -DOPENSSL_SSL_LIBRARY=/usr/lib64/openssl3/libssl.so -DOPENSSL_CRYPTO_LIBRARY=/usr/lib64/openssl3/libcrypto.so'"
150
150
  # Windows build needs the OpenSSL install path so find_package(OpenSSL)
151
- # in CMakeLists.txt succeeds, otherwise HTTPS upload (HttpLogSink)
151
+ # in CMakeLists.txt succeeds, otherwise HTTPS upload (gpufl::uploadLogs)
152
152
  # silently falls back to HTTP-only — see openssl-windows.html in the
153
153
  # manual repo for the user-facing story. CIBW_BEFORE_ALL_WINDOWS
154
154
  # installs choco's openssl package into this path.
@@ -1,5 +1,6 @@
1
- ### claude
1
+ ### ai
2
2
  .claude/
3
+ .junie/
3
4
 
4
5
  ### idea
5
6
  .idea/**
@@ -14,6 +15,7 @@ wget-log*
14
15
 
15
16
  ### docker
16
17
  example/python/docker/**/
18
+ dist/
17
19
 
18
20
  ### C++ template
19
21
  # Prerequisites
@@ -89,3 +91,28 @@ example/python/docker/**/
89
91
  *.hex
90
92
 
91
93
  *.log
94
+
95
+ ### Python
96
+ # Byte-compiled / optimized files
97
+ __pycache__/
98
+ *.py[cod]
99
+ *$py.class
100
+
101
+ # Test / coverage caches
102
+ .pytest_cache/
103
+ .mypy_cache/
104
+ .ruff_cache/
105
+ .coverage
106
+ .coverage.*
107
+ htmlcov/
108
+ coverage.xml
109
+
110
+ # Packaging / distribution
111
+ *.egg-info/
112
+ *.egg
113
+ .eggs/
114
+
115
+ # Virtualenvs
116
+ .venv/
117
+ venv/
118
+ env/
@@ -0,0 +1,214 @@
1
+ # Changelog
2
+
3
+ All notable changes to `gpufl-client` are documented here. Format
4
+ inspired by [Keep a Changelog](https://keepachangelog.com/en/1.1.0/);
5
+ versioning follows PEP 440 for the Python wheel and semver-style
6
+ `MAJOR.MINOR.PATCH` for the C++ library.
7
+
8
+ ## [1.1.0] — 2026-06-03
9
+
10
+ ### Breaking changes
11
+
12
+ #### `HttpLogSink` removed — upload is now a separate post-shutdown step
13
+
14
+ The in-process `HttpLogSink` that POSTed every NDJSON event live
15
+ during a session has been deleted. Network failures during the
16
+ workload could leak into the GPU job's exit code, and per-event HTTP
17
+ added measurable jitter to PyTorch training runs. Upload now happens
18
+ as an explicit step after `gpufl::shutdown()` returns.
19
+
20
+ For Python customers, the migration is **soft** — `remote_upload=True`
21
+ still works in v1.1 as a deprecation shim (see Deprecations below).
22
+ For pure-C++ customers who `#include`'d the header directly, the
23
+ break is a compile error.
24
+
25
+ | Surface | Before (v1.0.x) | New (v1.1+) | v1.1 backward-compat behavior |
26
+ |---|---|---|---|
27
+ | Python `init(remote_upload=True)` | Live HttpLogSink during session | `with gpufl.session(...)` or `gpufl.upload_logs(...)` after shutdown | **Still works** — `DeprecationWarning` at init + `atexit` handler that calls `upload_logs()` at interpreter exit |
28
+ | C++ `opts.remote_upload = true;` | Live HttpLogSink during session | `gpufl::uploadLogs(uopts)` after `shutdown()` | **Still works** — deprecation log at init + auto-call to `gpufl::uploadLogs()` at the end of `gpufl::shutdown()` (shutdown now blocks until upload completes) |
29
+ | Env var `GPUFL_REMOTE_UPLOAD=1` | Live HttpLogSink during session | `gpufl.upload_logs()` post-shutdown | **Still works** — routes through the Python shim above |
30
+ | `#include "gpufl/core/logger/http_log_sink.hpp"` | The header | gone | **Compile error** — drop the include |
31
+
32
+ See [docs/getting-started/sending-data.md](docs/getting-started/sending-data.md)
33
+ for the full migration guide.
34
+
35
+ ### Deprecations (scheduled for v1.2 removal)
36
+
37
+ | Field / kwarg | Status in v1.1 | What to use instead |
38
+ |---|---|---|
39
+ | `InitOptions::remote_upload` (Python kwarg + C++ field) | DeprecationWarning + atexit shim that calls `upload_logs()` at interpreter exit | `with gpufl.session(...)` or call `gpufl.upload_logs()` explicitly after `shutdown()` |
40
+ | `InitOptions::backend_url` | Still functional; read by the version-discovery probe and stored for `upload_logs()` to read back | Pass `backend_url` directly to `UploadOptions` / `gpufl.upload_logs()` |
41
+ | `InitOptions::api_key` | Same as `backend_url` | Pass `api_key` directly to `UploadOptions` / `gpufl.upload_logs()` |
42
+ | `GPUFL_REMOTE_UPLOAD` env var | Still read; routes to the Python atexit shim | Drop from container manifests / start scripts |
43
+
44
+ All three `InitOptions` fields (plus the env var) ship in v1.1 to keep the migration painless and will
45
+ be removed together in v1.2 — at which point creds live exclusively on
46
+ `UploadOptions` and `gpufl::init()` stops touching network config
47
+ entirely.
48
+
49
+ ### Breaking changes (cont.)
50
+
51
+ #### `sampling_auto_start` renamed to `continuous_system_sampling`
52
+
53
+ The old name only described init-time behavior. The new flag covers
54
+ the full policy — the semantics also got fixed (see Bug fixes).
55
+
56
+ - **Python**: old kwarg still accepted for this release with a
57
+ `DeprecationWarning`. Will be removed in the next release.
58
+ - **C++**: hard rename. Compile error points at the call site with
59
+ a clear "no member named 'sampling_auto_start'" message.
60
+
61
+ ### Added
62
+
63
+ #### Deferred upload — `gpufl.upload_logs()` / `gpufl::uploadLogs()`
64
+
65
+ A new module under `include/gpufl/upload/`. Reads the session's
66
+ NDJSON files post-shutdown and POSTs each event to the existing
67
+ `/api/v1/events/{eventType}` backend endpoints. Never throws on
68
+ network errors; returns an `UploadResult` with `.success`,
69
+ `.events_uploaded`, `.warnings`, etc.
70
+
71
+ Python orchestration via context manager:
72
+
73
+ ```python
74
+ with gpufl.session(app_name="train",
75
+ backend_url="https://api.gpuflight.com",
76
+ api_key="gpfl_xxxxx"):
77
+ train_one_epoch()
78
+ # On __exit__: shutdown() then upload_logs() — automatic.
79
+ ```
80
+
81
+ #### `gpufl upload` CLI
82
+
83
+ Post-mortem / ad-hoc shipping tool. Registered via
84
+ `[project.scripts]` in `pyproject.toml`:
85
+
86
+ ```bash
87
+ gpufl upload /tmp/runs/train --backend-url ... --api-key ...
88
+ gpufl upload /tmp/runs/train --session-id <uuid>
89
+ gpufl upload /tmp/runs/train --all-sessions
90
+ gpufl upload /tmp/runs/train --force # bypass cursor check
91
+ ```
92
+
93
+ Default behavior uploads only the **latest** session found in the
94
+ directory (most recent `job_start.ts_ns`). `--session-id` picks a
95
+ specific one; `--all-sessions` ships every session present.
96
+
97
+ #### Session-aware cursor file
98
+
99
+ `.gpufl-upload-cursor.json` (in the log directory) tracks which
100
+ sessions have completed a successful upload. Re-running `gpufl
101
+ upload` on a completed session refuses with a clear message
102
+ suggesting `--force`; `--all-sessions` mode silently skips completed
103
+ sessions and uploads the rest. The cursor survives across runs to skip
104
+ already-uploaded rotated files.
105
+
106
+ #### `ProfilingEngine` — clarified names
107
+
108
+ The engine enum was reworked into a single, plainly-named ladder
109
+ (no aliases). New default is `Monitor` (telemetry only, no CUPTI).
110
+
111
+ | Name | What it captures |
112
+ |---|---|
113
+ | `Monitor` | GPU/host health metrics only — no CUPTI. The default. |
114
+ | `Trace` | + activity trace: kernels, memcpy, sync (no sampling) |
115
+ | `PcSampling` | + PC stall-reason sampling |
116
+ | `SassMetrics` | + per-instruction SASS counters |
117
+ | `RangeProfiler` | + hardware throughput counters |
118
+ | `Deep` | `PcSampling` + `SassMetrics` in one run |
119
+
120
+ Replaces the earlier `None` / `KernelTrace` / `Continuous` / `Range` /
121
+ `PcSamplingWithSass` names. Pre-1.0, no deprecation shim — the old
122
+ names are gone.
123
+
124
+ #### Ref-counted system-metric sampler
125
+
126
+ `Sampler::configure()` / `activate()` / `deactivate()` / `shutdown()`
127
+ replaces the old `start()` / `stop()`. Activation count composes
128
+ across `continuous_system_sampling` baseline, `GFL_SCOPE` enter/exit,
129
+ and explicit `systemStart()` / `systemStop()` calls — the sampler
130
+ runs while any activator is in flight.
131
+
132
+ ### Bug fixes
133
+
134
+ #### Scope-driven system sampling now works
135
+
136
+ Before: setting `sampling_auto_start=false` silently disabled all
137
+ system metrics, even inside `GFL_SCOPE` regions. The flag's name
138
+ suggested "wait for explicit start" semantics but the code disabled
139
+ sampling entirely. Now, under the renamed `continuous_system_sampling
140
+ = false`, the sampler activates while inside any scope or between
141
+ `systemStart` / `systemStop` calls, then idles outside that window.
142
+
143
+ #### EventWrapper envelope on upload POSTs
144
+
145
+ The initial `uploadLogs()` draft POSTed bare NDJSON event lines.
146
+ The backend's `EventIngestionController` deserialized those into an
147
+ `EventWrapper` with every field null, the inner `objectMapper.readValue
148
+ (null, ...)` threw, the exception was caught and swallowed, and the
149
+ controller returned 200 OK anyway — silent data loss. Every event is
150
+ now correctly wrapped in `{data, agentSendingTime, hostname, ipAddr}`.
151
+
152
+ Regression test added in `tests/upload/test_upload_logs.cpp`.
153
+
154
+ ### Tests added
155
+
156
+ - `tests/core/test_sampler.cpp` — 8 scenarios for the ref-counted
157
+ Sampler (activate/deactivate, nesting, force-shutdown, unbalanced
158
+ deactivate clamping).
159
+ - `tests/upload/test_upload_logs.cpp` — 12 scenarios for the upload
160
+ path (happy path, headers, cursor refusal + force override, auth
161
+ failure, malformed lines, session-id filter, all-sessions,
162
+ lifecycle ordering, EventWrapper envelope regression guard).
163
+ - `tests/python/test_continuous_system_sampling.py` — 5 integration
164
+ scenarios for the three sampling modes plus deprecation behavior.
165
+
166
+ ### Internal / build
167
+
168
+ - Removed `include/gpufl/core/logger/http_log_sink.{hpp,cpp}`.
169
+ - Added `include/gpufl/upload/upload_logs.{hpp,cpp}` to the CMake
170
+ target sources.
171
+ - `CMakeLists.txt` `project(VERSION)` bumped to 1.1.0; new
172
+ `GPUFL_VERSION_SUFFIX` variable layers optional PEP 440 pre-release
173
+ tokens onto `GPUFL_CLIENT_VERSION`.
174
+
175
+ ### Migration checklist for 1.0.x → 1.1.0
176
+
177
+ **Optional in v1.1, required by v1.2:**
178
+
179
+ - [ ] Python: replace every `gpufl.init(remote_upload=True, ...)` call
180
+ with `with gpufl.session(backend_url=..., api_key=...):` or an
181
+ explicit `gpufl.upload_logs(...)` after `shutdown()`. The old form
182
+ still works in v1.1 with a `DeprecationWarning`; v1.2 will remove it.
183
+ - [ ] C++: replace `opts.remote_upload = true;` with an explicit
184
+ `gpufl::uploadLogs(uopts)` after `gpufl::shutdown()`. The field
185
+ still compiles in v1.1 and, per the compatibility table above, auto-calls `gpufl::uploadLogs()` at the end of `shutdown()`; v1.2 will delete it.
186
+ - [ ] Container manifests: prefer dropping `GPUFL_REMOTE_UPLOAD` and
187
+ driving upload via your app code (or the `gpufl upload` CLI in a
188
+ lifecycle hook). The env var still routes through the Python shim
189
+ in v1.1; v1.2 stops reading it.
190
+ - [ ] Future-proof: start passing `backend_url` and `api_key` directly
191
+ to `gpufl::uploadLogs()` / `gpufl.upload_logs()` rather than relying
192
+ on the InitOptions fields. In v1.2 those credentials will live only on
193
+ UploadOptions.
194
+
195
+ **Required in v1.1 (no grace period, except the Python rename noted below):**
196
+
197
+ - [ ] Python: rename `sampling_auto_start` → `continuous_system_sampling`.
198
+ The old name still works with a `DeprecationWarning` (removed in v1.2).
199
+ - [ ] C++: rename `opts.sampling_auto_start` → `opts.continuous_system_sampling`
200
+ (compile-time error otherwise — no grace period for C++).
201
+ - [ ] If you `#include`'d `http_log_sink.hpp` directly anywhere,
202
+ drop the include — the header is gone.
203
+
204
+ ---
205
+
206
+ ## Releases prior to 1.1.0
207
+
208
+ See git tags for the historical record. Highlights:
209
+
210
+ - **1.0.3** — `ScopeMeta` benchmark-iteration helper, scope iterator
211
+ form, `gpufl.report` text summary improvements.
212
+ - **1.0.2** — first version published to PyPI; "Stable" status.
213
+ - **1.0.1** — `kernel_sample_rate_ms` deprecated (no-op).
214
+ - **1.0.0** — first stable contract.
@@ -1,11 +1,17 @@
1
1
  cmake_minimum_required(VERSION 3.31)
2
2
 
3
3
  project(gpufl_client
4
- VERSION 1.0.2
4
+ VERSION 1.1.0
5
5
  LANGUAGES CXX
6
6
  DESCRIPTION "Header-only GPU monitoring client library"
7
7
  )
8
8
 
9
+ # Pre-release suffix appended to GPUFL_CLIENT_VERSION below. PEP 440
10
+ # pre-release tokens (`rc1`, `a1`, `b1`, …) aren't valid in CMake's
11
+ # `project(... VERSION ...)`, so we layer them on top here. Final releases
12
+ # leave this empty.
13
+ set(GPUFL_VERSION_SUFFIX "")
14
+
9
15
  # -----------------------
10
16
  # CUDA Architectures (CI Friendly)
11
17
  # -----------------------
@@ -22,6 +28,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
22
28
  # -----------------------
23
29
  option(GPUFL_ENABLE_NVIDIA "Enable NVIDIA backends (CUDA + NVML when available)" ON)
24
30
  option(GPUFL_ENABLE_AMD "Enable AMD backends (ROCm when available)" OFF)
31
+ option(GPUFL_ENABLE_AMD_ROCPROFILER "Enable AMD rocprofiler-sdk tracing backend when available" ON)
25
32
 
26
33
  option(BUILD_GPUFL_EXAMPLE "Build gpufl example application" ON)
27
34
  option(BUILD_GPUFL_MONITOR "Build gpufl-monitor standalone daemon" ON)
@@ -56,7 +63,7 @@ target_compile_features(gpufl INTERFACE cxx_std_17)
56
63
  # inline the literal at call sites and the mismatch becomes visible
57
64
  # as comparison failures (e.g. test asserting on User-Agent).
58
65
  target_compile_definitions(gpufl PUBLIC
59
- GPUFL_CLIENT_VERSION="${PROJECT_VERSION}"
66
+ GPUFL_CLIENT_VERSION="${PROJECT_VERSION}${GPUFL_VERSION_SUFFIX}"
60
67
  )
61
68
 
62
69
  # Enable PIC for static library (required when linking into shared libraries like Python modules)
@@ -68,7 +75,7 @@ target_sources(gpufl PRIVATE
68
75
  include/gpufl/core/logger/logger.cpp
69
76
  include/gpufl/core/logger/log_rotator.cpp
70
77
  include/gpufl/core/logger/file_log_sink.cpp
71
- include/gpufl/core/logger/http_log_sink.cpp
78
+ include/gpufl/upload/upload_logs.cpp
72
79
  include/gpufl/core/host_info.cpp
73
80
  include/gpufl/core/remote_config.cpp
74
81
  include/gpufl/core/model/batch_models.cpp
@@ -147,7 +154,9 @@ target_sources(gpufl PRIVATE include/gpufl/core/logger/file_compressor.cpp)
147
154
 
148
155
 
149
156
  # -----------------------
150
- # cpp-httplib — HTTP client for direct-to-backend log upload (HttpLogSink).
157
+ # cpp-httplib — HTTP client used by gpufl::uploadLogs (deferred upload
158
+ # of session NDJSON files to the backend, called after gpufl::shutdown).
159
+ # Also used by remote_config.cpp for the post-init version probe.
151
160
  #
152
161
  # Single-header library. Fetched once via FetchContent so every build gets
153
162
  # the same version regardless of the host system. HTTPS support is gated
@@ -167,30 +176,37 @@ FetchContent_MakeAvailable(httplib)
167
176
 
168
177
  find_package(OpenSSL QUIET)
169
178
  if(OpenSSL_FOUND)
170
- message(STATUS "Found OpenSSL: ${OPENSSL_VERSION} — HttpLogSink HTTPS enabled")
179
+ message(STATUS "Found OpenSSL: ${OPENSSL_VERSION} — HTTPS upload enabled")
171
180
  target_compile_definitions(gpufl PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT=1)
172
181
  target_link_libraries(gpufl PRIVATE OpenSSL::SSL OpenSSL::Crypto)
173
182
  set(GPUFL_HTTPLIB_TLS 1)
174
183
  else()
175
184
  message(WARNING
176
- "OpenSSL not found — HttpLogSink will support HTTP only. "
177
- "Users pointing remote_config at an https:// endpoint will see "
178
- "upload failures. Install OpenSSL (apt: libssl-dev, vcpkg: openssl, "
179
- "brew: openssl) to enable TLS.")
185
+ "OpenSSL not found — gpufl::uploadLogs will support HTTP only. "
186
+ "Pointing backend_url at an https:// endpoint will fail to verify. "
187
+ "Install OpenSSL (apt: libssl-dev, vcpkg: openssl, brew: openssl) "
188
+ "to enable TLS.")
180
189
  set(GPUFL_HTTPLIB_TLS 0)
181
190
  endif()
182
191
  target_compile_definitions(gpufl PRIVATE GPUFL_HTTPLIB_TLS=${GPUFL_HTTPLIB_TLS})
183
- # NOTE: cpp-httplib's gzip request-body path (CPPHTTPLIB_ZLIB_SUPPORT)
184
- # is intentionally NOT defined. HttpLogSink sends uncompressed JSON
185
- # per event; bandwidth-conscious deployments use gpufl-agent against
186
- # the rotated NDJSON files (where deflate amortizes its dictionary
187
- # across the whole file for 10-15× compression vs the ~5× we'd get
188
- # per-POST). See the architectural note above HttpLogSink::Options
189
- # in http_log_sink.hpp for the full rationale.
192
+ # Enable cpp-httplib's gzip path. PUBLIC so any consumer that includes
193
+ # httplib.h via gpufl's headers (notably the test target's embedded
194
+ # httplib::Server) gets the same wire-format support. Without this,
195
+ # server::Post handlers return 415 on incoming Content-Encoding: gzip
196
+ # requests, which is exactly what uploadLogs sends in v1.2+.
197
+ #
198
+ # In production, the Spring Boot backend handles gzip natively, so
199
+ # this define affects the test target more than the client. We still
200
+ # define it on gpufl PUBLIC because:
201
+ # - It enables future use of httplib::Client::set_compress(true)
202
+ # to compress the body in-place instead of our manual gzipString
203
+ # (no current need, but a free optimization if we ever want it).
204
+ # - It propagates to tests, which is the immediate motivation.
190
205
  #
191
- # ZLIB is still linked above (around line 96) — file_compressor.cpp
192
- # uses it to gzip rotated NDJSON files written by FileLogSink, which
193
- # is a separate, working feature.
206
+ # ZLIB is already linked above — file_compressor.cpp uses it to gzip
207
+ # rotated NDJSON files, and upload_logs.cpp uses it both to read those
208
+ # files back and to gzip outgoing stream-chunks.
209
+ target_compile_definitions(gpufl PUBLIC CPPHTTPLIB_ZLIB_SUPPORT=1)
194
210
  target_link_libraries(gpufl PRIVATE httplib::httplib)
195
211
 
196
212
 
@@ -225,6 +241,7 @@ if(GPUFL_ENABLE_NVIDIA)
225
241
  include/gpufl/backends/nvidia/sampler/cupti_sass.hpp
226
242
  include/gpufl/backends/nvidia/cuda_collector.cpp
227
243
  include/gpufl/backends/nvidia/cupti_utils.cpp
244
+ include/gpufl/backends/nvidia/cuda_cleanup_handler.cpp
228
245
  include/gpufl/backends/nvidia/resource_handler.cpp
229
246
  include/gpufl/backends/nvidia/kernel_launch_handler.cpp
230
247
  include/gpufl/backends/nvidia/mem_transfer_handler.cpp
@@ -233,6 +250,7 @@ if(GPUFL_ENABLE_NVIDIA)
233
250
  include/gpufl/backends/nvidia/cupti_backend.cpp
234
251
  include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp
235
252
  include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp
253
+ include/gpufl/backends/nvidia/engine/pm_sampling_engine.cpp
236
254
  include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp
237
255
  include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp)
238
256
  target_link_libraries(gpufl PRIVATE CUDA::cudart CUDA::cuda_driver)
@@ -481,18 +499,22 @@ if(GPUFL_ENABLE_AMD)
481
499
  endif()
482
500
  endif()
483
501
 
484
- find_package(rocprofiler-sdk QUIET CONFIG HINTS /opt/rocm)
485
- if(TARGET rocprofiler-sdk::rocprofiler-sdk)
486
- set(GPUFL_HAS_ROCPROFILER_SDK 1)
487
- target_link_libraries(gpufl PRIVATE rocprofiler-sdk::rocprofiler-sdk)
488
- target_sources(gpufl PRIVATE
489
- include/gpufl/backends/amd/monitor_adapter_amd.cpp
490
- include/gpufl/backends/amd/rocprofiler_backend.cpp
491
- include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp
492
- )
493
- message(STATUS "Found ROCprofiler-SDK support")
502
+ if(GPUFL_ENABLE_AMD_ROCPROFILER)
503
+ find_package(rocprofiler-sdk QUIET CONFIG HINTS /opt/rocm)
504
+ if(TARGET rocprofiler-sdk::rocprofiler-sdk)
505
+ set(GPUFL_HAS_ROCPROFILER_SDK 1)
506
+ target_link_libraries(gpufl PRIVATE rocprofiler-sdk::rocprofiler-sdk)
507
+ target_sources(gpufl PRIVATE
508
+ include/gpufl/backends/amd/monitor_adapter_amd.cpp
509
+ include/gpufl/backends/amd/rocprofiler_backend.cpp
510
+ include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp
511
+ )
512
+ message(STATUS "Found ROCprofiler-SDK support")
513
+ else()
514
+ message(STATUS "ROCprofiler-SDK not found; AMD kernel tracing disabled")
515
+ endif()
494
516
  else()
495
- message(STATUS "ROCprofiler-SDK not found; AMD kernel tracing disabled")
517
+ message(STATUS "ROCprofiler-SDK support disabled by GPUFL_ENABLE_AMD_ROCPROFILER=OFF")
496
518
  endif()
497
519
 
498
520
  if(GPUFL_HAS_ROCM_SMI OR GPUFL_HAS_HIP)
@@ -44,6 +44,7 @@ RUN cmake -S . -B /build \
44
44
  -DBUILD_GPUFL_MONITOR=ON \
45
45
  -DGPUFL_ENABLE_NVIDIA=OFF \
46
46
  -DGPUFL_ENABLE_AMD=ON \
47
+ -DGPUFL_ENABLE_AMD_ROCPROFILER=OFF \
47
48
  && cmake --build /build --target gpufl-monitor --parallel
48
49
 
49
50
  # ── Stage 2: pull the pre-built Java agent ────────────────────────────────────
@@ -53,22 +54,13 @@ FROM ghcr.io/gpu-flight/gpufl-agent:latest AS agent-jar
53
54
  FROM eclipse-temurin:25-jre AS jre
54
55
 
55
56
  # ── Stage 4: runtime image ─────────────────────────────────────────────────────
56
- FROM ubuntu:24.04
57
+ # Use ROCm base runtime to avoid fragile manual ROCm apt repo/package wiring.
58
+ FROM rocm/dev-ubuntu-24.04:6.4-complete
57
59
 
58
60
  ENV DEBIAN_FRONTEND=noninteractive
59
61
 
60
- # Add ROCm apt repository
61
62
  RUN apt-get update && apt-get install -y --no-install-recommends \
62
- ca-certificates \
63
- wget \
64
- gnupg \
65
- && wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/keyrings/rocm.gpg \
66
- && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.4 noble main' \
67
- > /etc/apt/sources.list.d/rocm.list \
68
- && apt-get update && apt-get install -y --no-install-recommends \
69
63
  supervisor \
70
- rocm-smi-lib \
71
- rocm-hip-runtime \
72
64
  && rm -rf /var/lib/apt/lists/*
73
65
 
74
66
  # Copy Java 25 JRE from Temurin (Ubuntu 24.04 repos only ship up to openjdk-21)