gpufl 1.0.2__tar.gz → 1.1.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. {gpufl-1.0.2 → gpufl-1.1.0rc2}/.dockerignore +4 -4
  2. {gpufl-1.0.2 → gpufl-1.1.0rc2}/.github/workflows/build.yml +20 -0
  3. {gpufl-1.0.2 → gpufl-1.1.0rc2}/.github/workflows/release.yml +1 -1
  4. {gpufl-1.0.2 → gpufl-1.1.0rc2}/.gitignore +28 -1
  5. gpufl-1.1.0rc2/CHANGELOG.md +219 -0
  6. {gpufl-1.0.2 → gpufl-1.1.0rc2}/CMakeLists.txt +52 -30
  7. {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.monitor.amd +3 -11
  8. {gpufl-1.0.2 → gpufl-1.1.0rc2}/PKG-INFO +104 -70
  9. {gpufl-1.0.2 → gpufl-1.1.0rc2}/README.md +103 -69
  10. {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/README.md +3 -3
  11. gpufl-1.1.0rc2/benchmark/profile_pytorch_kernels.py +193 -0
  12. gpufl-1.1.0rc2/benchmark/profile_pytorch_via_gpufl.py +304 -0
  13. {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/run_benchmark.py +5 -8
  14. {gpufl-1.0.2 → gpufl-1.1.0rc2}/daemon/monitor/main.cpp +6 -2
  15. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/gpufl_scope_demo.cpp +1 -1
  16. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/CMakeLists.txt +11 -1
  17. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/block_style_example.cu +2 -2
  18. gpufl-1.1.0rc2/example/cuda/manykernel_benchmark.cu +549 -0
  19. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/memory_coalescing_demo.cu +2 -2
  20. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/occupancy_demo.cu +1 -1
  21. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/sass_divergence_demo.cu +37 -9
  22. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/system_monitor.cu +1 -1
  23. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/vector_add_benchmark.cu +2 -2
  24. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/02_numba_cuda.py +31 -8
  25. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/03_pytorch_benchmark.py +27 -10
  26. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocm_collector.cpp +2 -1
  27. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocprofiler_backend.cpp +8 -2
  28. gpufl-1.1.0rc2/include/gpufl/backends/nvidia/cuda_cleanup_handler.cpp +93 -0
  29. gpufl-1.1.0rc2/include/gpufl/backends/nvidia/cuda_cleanup_handler.hpp +27 -0
  30. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_backend.cpp +505 -56
  31. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_backend.hpp +94 -6
  32. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_common.hpp +43 -10
  33. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_utils.cpp +10 -0
  34. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cupti_utils.hpp +6 -0
  35. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp +80 -137
  36. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.hpp +6 -0
  37. gpufl-1.1.0rc2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +194 -0
  38. gpufl-1.1.0rc2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +93 -0
  39. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/profiling_engine.hpp +15 -0
  40. gpufl-1.1.0rc2/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +734 -0
  41. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.hpp +31 -4
  42. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/kernel_launch_handler.cpp +148 -21
  43. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/mem_transfer_handler.cpp +9 -0
  44. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/resource_handler.cpp +3 -3
  45. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/synchronization_handler.cpp +8 -0
  46. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/config_file_loader.cpp +9 -6
  47. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/debug_logger.hpp +12 -10
  48. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/dictionary_manager.cpp +94 -24
  49. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/events.hpp +29 -3
  50. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/gpufl.cpp +282 -123
  51. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/host_info.hpp +3 -3
  52. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/json/json.cpp +33 -3
  53. gpufl-1.1.0rc2/include/gpufl/core/logger/file_compressor.cpp +55 -0
  54. gpufl-1.1.0rc2/include/gpufl/core/logger/file_log_sink.cpp +231 -0
  55. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/file_log_sink.hpp +15 -4
  56. gpufl-1.1.0rc2/include/gpufl/core/logger/log_rotator.cpp +105 -0
  57. gpufl-1.1.0rc2/include/gpufl/core/logger/log_rotator.hpp +65 -0
  58. gpufl-1.1.0rc2/include/gpufl/core/logger/logger.cpp +60 -0
  59. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/logger.hpp +19 -3
  60. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/batch_models.cpp +8 -3
  61. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/lifecycle_model.cpp +22 -0
  62. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/lifecycle_model.hpp +8 -0
  63. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor.cpp +12 -7
  64. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor.hpp +91 -10
  65. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor_adapter.cpp +11 -0
  66. gpufl-1.1.0rc2/include/gpufl/core/remote_config.cpp +118 -0
  67. gpufl-1.1.0rc2/include/gpufl/core/remote_config.hpp +39 -0
  68. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/sampler.cpp +69 -29
  69. gpufl-1.1.0rc2/include/gpufl/core/sampler.hpp +117 -0
  70. gpufl-1.1.0rc2/include/gpufl/gpufl.hpp +466 -0
  71. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/text_report.cpp +105 -8
  72. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/text_report.hpp +13 -0
  73. gpufl-1.1.0rc2/include/gpufl/upload/upload_logs.cpp +1503 -0
  74. gpufl-1.1.0rc2/include/gpufl/upload/upload_logs.hpp +243 -0
  75. {gpufl-1.0.2 → gpufl-1.1.0rc2}/pyproject.toml +8 -1
  76. gpufl-1.1.0rc2/python/bindings.cpp +329 -0
  77. gpufl-1.1.0rc2/python/gpufl/__init__.py +1073 -0
  78. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/analyzer/analyzer.py +143 -24
  79. gpufl-1.1.0rc2/python/gpufl/cli.py +178 -0
  80. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/report/text_report.py +141 -33
  81. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/reader.py +36 -2
  82. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/visualizer.py +4 -4
  83. gpufl-1.1.0rc2/run_tests.py +309 -0
  84. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/CMakeLists.txt +56 -14
  85. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_engine_coverage.cpp +35 -13
  86. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/log_utils.cpp +76 -19
  87. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_analyzer.cpp +2 -2
  88. gpufl-1.1.0rc2/tests/core/test_api_path_routing.cpp +50 -0
  89. gpufl-1.1.0rc2/tests/core/test_bench_invoker.cpp +139 -0
  90. gpufl-1.1.0rc2/tests/core/test_disabled.cpp +171 -0
  91. gpufl-1.1.0rc2/tests/core/test_sampler.cpp +195 -0
  92. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_wire_contract.cpp +45 -2
  93. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/python/test_bindings.py +13 -6
  94. gpufl-1.1.0rc2/tests/python/test_continuous_system_sampling.py +192 -0
  95. gpufl-1.1.0rc2/tests/python/test_disabled.py +178 -0
  96. gpufl-1.1.0rc2/tests/python/test_scope_iterable.py +308 -0
  97. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/run_engine_coverage.ps1 +3 -3
  98. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/run_engine_coverage.sh +3 -3
  99. gpufl-1.1.0rc2/tests/upload/test_upload_logs.cpp +1029 -0
  100. gpufl-1.1.0rc2/tests/verify_pipeline.py +114 -0
  101. gpufl-1.0.2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +0 -70
  102. gpufl-1.0.2/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +0 -65
  103. gpufl-1.0.2/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +0 -421
  104. gpufl-1.0.2/include/gpufl/core/logger/file_compressor.cpp +0 -44
  105. gpufl-1.0.2/include/gpufl/core/logger/file_log_sink.cpp +0 -151
  106. gpufl-1.0.2/include/gpufl/core/logger/http_log_sink.cpp +0 -476
  107. gpufl-1.0.2/include/gpufl/core/logger/http_log_sink.hpp +0 -206
  108. gpufl-1.0.2/include/gpufl/core/logger/log_rotator.cpp +0 -65
  109. gpufl-1.0.2/include/gpufl/core/logger/log_rotator.hpp +0 -32
  110. gpufl-1.0.2/include/gpufl/core/logger/logger.cpp +0 -47
  111. gpufl-1.0.2/include/gpufl/core/remote_config.cpp +0 -276
  112. gpufl-1.0.2/include/gpufl/core/remote_config.hpp +0 -60
  113. gpufl-1.0.2/include/gpufl/core/sampler.hpp +0 -63
  114. gpufl-1.0.2/include/gpufl/gpufl.hpp +0 -244
  115. gpufl-1.0.2/python/bindings.cpp +0 -189
  116. gpufl-1.0.2/python/gpufl/__init__.py +0 -211
  117. gpufl-1.0.2/tests/core/test_api_path_routing.cpp +0 -213
  118. gpufl-1.0.2/tests/core/test_http_log_sink.cpp +0 -409
  119. gpufl-1.0.2/tests/python/test_remote_upload_smoke.py +0 -185
  120. gpufl-1.0.2/tests/verify_pipeline.py +0 -102
  121. {gpufl-1.0.2 → gpufl-1.1.0rc2}/.clang-format +0 -0
  122. {gpufl-1.0.2 → gpufl-1.1.0rc2}/.github/pull_request_template.md +0 -0
  123. {gpufl-1.0.2 → gpufl-1.1.0rc2}/CONTRIBUTING.md +0 -0
  124. {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.demo +0 -0
  125. {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.monitor +0 -0
  126. {gpufl-1.0.2 → gpufl-1.1.0rc2}/Dockerfile.monitor.supervisord.conf +0 -0
  127. {gpufl-1.0.2 → gpufl-1.1.0rc2}/LICENSE +0 -0
  128. {gpufl-1.0.2 → gpufl-1.1.0rc2}/THIRD-PARTY-NOTICES.txt +0 -0
  129. {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/cuda_gemm.py +0 -0
  130. {gpufl-1.0.2 → gpufl-1.1.0rc2}/benchmark/pytorch_train.py +0 -0
  131. {gpufl-1.0.2 → gpufl-1.1.0rc2}/build.sh +0 -0
  132. {gpufl-1.0.2 → gpufl-1.1.0rc2}/daemon/README.md +0 -0
  133. {gpufl-1.0.2 → gpufl-1.1.0rc2}/daemon/monitor/CMakeLists.txt +0 -0
  134. {gpufl-1.0.2 → gpufl-1.1.0rc2}/docker-compose.monitor.amd.yml +0 -0
  135. {gpufl-1.0.2 → gpufl-1.1.0rc2}/docker-compose.monitor.yml +0 -0
  136. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/CMakeLists.txt +0 -0
  137. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/README.md +0 -0
  138. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/check_device.cpp +0 -0
  139. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/amd/vector_add_benchmark.cpp +0 -0
  140. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/check_conflict.cu +0 -0
  141. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/check_device.cu +0 -0
  142. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/cupti_basic.cu +0 -0
  143. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/cupti_pc_sampling.cu +0 -0
  144. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/list_sass_metrics.cu +0 -0
  145. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/cuda/test_occupancy.cu +0 -0
  146. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/01_basic.py +0 -0
  147. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/analyzer/01_analyzer_sample.py +0 -0
  148. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/requirements.txt +0 -0
  149. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/viz/01_plot_memory_timeline.py +0 -0
  150. {gpufl-1.0.2 → gpufl-1.1.0rc2}/example/python/viz/02_plot_stress_timeline.py +0 -0
  151. {gpufl-1.0.2 → gpufl-1.1.0rc2}/images/Screenshot1.png +0 -0
  152. {gpufl-1.0.2 → gpufl-1.1.0rc2}/images/Screenshot2.png +0 -0
  153. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/engine/amd_profiling_engine.hpp +0 -0
  154. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp +0 -0
  155. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/engine/dispatch_counter_engine.hpp +0 -0
  156. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/hip_static_collector.cpp +0 -0
  157. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/hip_static_collector.hpp +0 -0
  158. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/monitor_adapter_amd.cpp +0 -0
  159. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/monitor_adapter_amd.hpp +0 -0
  160. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocm_collector.hpp +0 -0
  161. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/amd/rocprofiler_backend.hpp +0 -0
  162. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/host_collector.hpp +0 -0
  163. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cuda_collector.cpp +0 -0
  164. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cuda_collector.hpp +0 -0
  165. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/cuda_feature_guards.hpp +0 -0
  166. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp +0 -0
  167. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/engine/range_profiler_engine.hpp +0 -0
  168. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/kernel_launch_handler.hpp +0 -0
  169. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/mem_transfer_handler.hpp +0 -0
  170. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.cpp +0 -0
  171. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.hpp +0 -0
  172. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/nvml_collector.cpp +0 -0
  173. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/nvml_collector.hpp +0 -0
  174. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/resource_handler.hpp +0 -0
  175. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/sampler/cupti_sass.cpp +0 -0
  176. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/sampler/cupti_sass.hpp +0 -0
  177. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/backends/nvidia/synchronization_handler.hpp +0 -0
  178. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/activity_record.hpp +0 -0
  179. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/backend_factory.cpp +0 -0
  180. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/backend_factory.hpp +0 -0
  181. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/backend_interfaces.hpp +0 -0
  182. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/batch_buffer.hpp +0 -0
  183. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/common.cpp +0 -0
  184. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/common.hpp +0 -0
  185. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/config_file_loader.hpp +0 -0
  186. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/debug_logger.cpp +0 -0
  187. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/dictionary_manager.hpp +0 -0
  188. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/host_info.cpp +0 -0
  189. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/itanium_demangle.cpp +0 -0
  190. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/itanium_demangle.hpp +0 -0
  191. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/json/json.hpp +0 -0
  192. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/file_compressor.hpp +0 -0
  193. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/logger/log_sink.hpp +0 -0
  194. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/batch_models.hpp +0 -0
  195. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/graph_launch_event_model.cpp +0 -0
  196. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/graph_launch_event_model.hpp +0 -0
  197. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/kernel_event_model.cpp +0 -0
  198. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/kernel_event_model.hpp +0 -0
  199. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memcpy_event_model.cpp +0 -0
  200. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memcpy_event_model.hpp +0 -0
  201. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memory_alloc_event_model.cpp +0 -0
  202. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/memory_alloc_event_model.hpp +0 -0
  203. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/model_utils.hpp +0 -0
  204. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/nvtx_marker_model.cpp +0 -0
  205. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/nvtx_marker_model.hpp +0 -0
  206. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/perf_metric_model.cpp +0 -0
  207. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/perf_metric_model.hpp +0 -0
  208. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/profile_sample_model.cpp +0 -0
  209. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/profile_sample_model.hpp +0 -0
  210. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/scope_event_model.cpp +0 -0
  211. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/scope_event_model.hpp +0 -0
  212. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/serializable.hpp +0 -0
  213. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/synchronization_event_model.cpp +0 -0
  214. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/synchronization_event_model.hpp +0 -0
  215. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/system_event_model.cpp +0 -0
  216. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/model/system_event_model.hpp +0 -0
  217. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor_adapter.hpp +0 -0
  218. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/monitor_backend.hpp +0 -0
  219. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/ring_buffer.hpp +0 -0
  220. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/runtime.cpp +0 -0
  221. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/runtime.hpp +0 -0
  222. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/sass_compressor.cpp +0 -0
  223. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/sass_compressor.hpp +0 -0
  224. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/scope_registry.cpp +0 -0
  225. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/scope_registry.hpp +0 -0
  226. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stack_registry.hpp +0 -0
  227. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stack_trace.cpp +0 -0
  228. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stack_trace.hpp +0 -0
  229. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/stream_handle.hpp +0 -0
  230. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/trace_type.hpp +0 -0
  231. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/core/version.hpp +0 -0
  232. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/hint_engine.cpp +0 -0
  233. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl/report/hint_engine.hpp +0 -0
  234. {gpufl-1.0.2 → gpufl-1.1.0rc2}/include/gpufl.hpp +0 -0
  235. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/.gitignore +0 -0
  236. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/analyzer/__init__.py +0 -0
  237. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/cupy/__init__.py +0 -0
  238. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/jax/__init__.py +0 -0
  239. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/numba/__init__.py +0 -0
  240. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/report/__init__.py +0 -0
  241. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/__init__.py +0 -0
  242. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/dispatch.py +0 -0
  243. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/profile.py +0 -0
  244. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/stack.py +0 -0
  245. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/torch/trace_import.py +0 -0
  246. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/triton/__init__.py +0 -0
  247. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/utils.py +0 -0
  248. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/__init__.py +0 -0
  249. {gpufl-1.0.2 → gpufl-1.1.0rc2}/python/gpufl/viz/timeline.py +0 -0
  250. {gpufl-1.0.2 → gpufl-1.1.0rc2}/scripts/docker-demo-loop.sh +0 -0
  251. {gpufl-1.0.2 → gpufl-1.1.0rc2}/scripts/windows/run-monitor-local.bat +0 -0
  252. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/amd/test_rocm_collector.cpp +0 -0
  253. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_cuda_collector.cpp +0 -0
  254. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_nvidia_backend.cpp +0 -0
  255. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/backends/nvidia/test_nvml_collector.cpp +0 -0
  256. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/log_utils.hpp +0 -0
  257. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/test_kernel.cu +0 -0
  258. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/test_kernel.hpp +0 -0
  259. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/common/test_utils.hpp +0 -0
  260. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_batch_models.cpp +0 -0
  261. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_itanium_demangle.cpp +0 -0
  262. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/core/test_monitor.cpp +0 -0
  263. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/main_test_runner.cpp +0 -0
  264. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/python/conftest.py +0 -0
  265. {gpufl-1.0.2 → gpufl-1.1.0rc2}/tests/python/test_analyzer.py +0 -0
@@ -1,6 +1,6 @@
1
- # Python / notebooks — not needed for the C++ daemon build
2
- python/
3
- example/python/
1
+ # Python / notebooks — uncomment if building Python wheels inside Docker
2
+ # python/
3
+ # example/python/
4
4
  **/.Trash-*
5
5
  **/__pycache__/
6
6
  **/*.pyc
@@ -15,4 +15,4 @@ build/
15
15
  .git/
16
16
  .idea/
17
17
  .vscode/
18
- *.md
18
+ # *.md
@@ -3,8 +3,28 @@ name: Build GPUFl Client
3
3
  on:
4
4
  push:
5
5
  branches: [ "main" ]
6
+ paths-ignore:
7
+ - '**.md'
8
+ - 'docs/**'
9
+ - 'Dockerfile*'
10
+ - '**/Dockerfile'
11
+ - '**/Dockerfile.*'
12
+ - 'LICENSE'
13
+ - 'THIRD-PARTY-NOTICES.txt'
14
+ - '.gitignore'
15
+ - 'images/**'
6
16
  pull_request:
7
17
  branches: [ "main" ]
18
+ paths-ignore:
19
+ - '**.md'
20
+ - 'docs/**'
21
+ - 'Dockerfile*'
22
+ - '**/Dockerfile'
23
+ - '**/Dockerfile.*'
24
+ - 'LICENSE'
25
+ - 'THIRD-PARTY-NOTICES.txt'
26
+ - '.gitignore'
27
+ - 'images/**'
8
28
 
9
29
  jobs:
10
30
  build:
@@ -148,7 +148,7 @@ jobs:
148
148
  # quay.io/pypa/manylinux_2_28_x86_64 image (OpenSSL 3.5.5).
149
149
  CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF -DOPENSSL_INCLUDE_DIR=/usr/include/openssl3 -DOPENSSL_SSL_LIBRARY=/usr/lib64/openssl3/libssl.so -DOPENSSL_CRYPTO_LIBRARY=/usr/lib64/openssl3/libcrypto.so'"
150
150
  # Windows build needs the OpenSSL install path so find_package(OpenSSL)
151
- # in CMakeLists.txt succeeds, otherwise HTTPS upload (HttpLogSink)
151
+ # in CMakeLists.txt succeeds, otherwise HTTPS upload (gpufl::uploadLogs)
152
152
  # silently falls back to HTTP-only — see openssl-windows.html in the
153
153
  # manual repo for the user-facing story. CIBW_BEFORE_ALL_WINDOWS
154
154
  # installs choco's openssl package into this path.
@@ -1,5 +1,6 @@
1
- ### claude
1
+ ### ai
2
2
  .claude/
3
+ .junie/
3
4
 
4
5
  ### idea
5
6
  .idea/**
@@ -14,6 +15,7 @@ wget-log*
14
15
 
15
16
  ### docker
16
17
  example/python/docker/**/
18
+ dist/
17
19
 
18
20
  ### C++ template
19
21
  # Prerequisites
@@ -89,3 +91,28 @@ example/python/docker/**/
89
91
  *.hex
90
92
 
91
93
  *.log
94
+
95
+ ### Python
96
+ # Byte-compiled / optimized files
97
+ __pycache__/
98
+ *.py[cod]
99
+ *$py.class
100
+
101
+ # Test / coverage caches
102
+ .pytest_cache/
103
+ .mypy_cache/
104
+ .ruff_cache/
105
+ .coverage
106
+ .coverage.*
107
+ htmlcov/
108
+ coverage.xml
109
+
110
+ # Packaging / distribution
111
+ *.egg-info/
112
+ *.egg
113
+ .eggs/
114
+
115
+ # Virtualenvs
116
+ .venv/
117
+ venv/
118
+ env/
@@ -0,0 +1,219 @@
1
+ # Changelog
2
+
3
+ All notable changes to `gpufl-client` are documented here. Format
4
+ inspired by [Keep a Changelog](https://keepachangelog.com/en/1.1.0/);
5
+ versioning follows PEP 440 for the Python wheel and semver-style
6
+ `MAJOR.MINOR.PATCH` for the C++ library.
7
+
8
+ ## [1.1.0] — Unreleased
9
+
10
+ Currently validating as **`1.1.0rc2`**. Once it survives a full smoke
11
+ cycle in dev + a sample PyTorch run + the example Dockerfile build,
12
+ the `rc2` suffix gets dropped to ship as `1.1.0`.
13
+
14
+ ### Breaking changes
15
+
16
+ #### `HttpLogSink` removed — upload is now a separate post-shutdown step
17
+
18
+ The in-process `HttpLogSink` that POSTed every NDJSON event live
19
+ during a session has been deleted. Network failures during the
20
+ workload could leak into the GPU job's exit code, and per-event HTTP
21
+ added measurable jitter to PyTorch training runs. Upload now happens
22
+ as an explicit step after `gpufl::shutdown()` returns.
23
+
24
+ For Python customers, the migration is **soft** — `remote_upload=True`
25
+ still works in v1.1 as a deprecation shim (see Deprecations below).
26
+ For pure-C++ customers who `#include`'d the header directly, the
27
+ break is a compile error.
28
+
29
+ | Surface | Before (v1.0.x) | New (v1.1+) | v1.1 backward-compat behavior |
30
+ |---|---|---|---|
31
+ | Python `init(remote_upload=True)` | Live HttpLogSink during session | `with gpufl.session(...)` or `gpufl.upload_logs(...)` after shutdown | **Still works** — `DeprecationWarning` at init + `atexit` handler that calls `upload_logs()` at interpreter exit |
32
+ | C++ `opts.remote_upload = true;` | Live HttpLogSink during session | `gpufl::uploadLogs(uopts)` after `shutdown()` | **Still works** — deprecation log at init + auto-call to `gpufl::uploadLogs()` at the end of `gpufl::shutdown()` (shutdown now blocks until upload completes) |
33
+ | Env var `GPUFL_REMOTE_UPLOAD=1` | Live HttpLogSink during session | `gpufl.upload_logs()` post-shutdown | **Still works** — routes through the Python shim above |
34
+ | `#include "gpufl/core/logger/http_log_sink.hpp"` | The header | gone | **Compile error** — drop the include |
35
+
36
+ See [docs/getting-started/sending-data.md](docs/getting-started/sending-data.md)
37
+ for the full migration guide.
38
+
39
+ ### Deprecations (scheduled for v1.2 removal)
40
+
41
+ | Field / kwarg | Status in v1.1 | What to use instead |
42
+ |---|---|---|
43
+ | `InitOptions::remote_upload` (Python kwarg + C++ field) | DeprecationWarning + atexit shim that calls `upload_logs()` at interpreter exit | `with gpufl.session(...)` or call `gpufl.upload_logs()` explicitly after `shutdown()` |
44
+ | `InitOptions::backend_url` | Still functional; read by the version-discovery probe and stored for `upload_logs()` to read back | Pass `backend_url` directly to `UploadOptions` / `gpufl.upload_logs()` |
45
+ | `InitOptions::api_key` | Same as `backend_url` | Pass `api_key` directly to `UploadOptions` / `gpufl.upload_logs()` |
46
+ | `GPUFL_REMOTE_UPLOAD` env var | Still read; routes to the Python atexit shim | Drop from container manifests / start scripts |
47
+
48
+ All three fields ship in v1.1 to keep the migration painless and will
49
+ be removed together in v1.2 — at which point creds live exclusively on
50
+ `UploadOptions` and `gpufl::init()` stops touching network config
51
+ entirely.
52
+
53
+ ### Breaking changes (cont.)
54
+
55
+ #### `sampling_auto_start` renamed to `continuous_system_sampling`
56
+
57
+ The old name only described init-time behavior. The new flag covers
58
+ the full policy — the semantics also got fixed (see Bug fixes).
59
+
60
+ - **Python**: old kwarg still accepted for this release with a
61
+ `DeprecationWarning`. Will be removed in v1.2.
62
+ - **C++**: hard rename. Compile error points at the call site with
63
+ a clear "no member named 'sampling_auto_start'" message.
64
+
65
+ ### Added
66
+
67
+ #### Deferred upload — `gpufl.upload_logs()` / `gpufl::uploadLogs()`
68
+
69
+ A new module under `include/gpufl/upload/`. Reads the session's
70
+ NDJSON files post-shutdown, POSTs each event to the existing
71
+ `/api/v1/events/{eventType}` backend endpoints. Never throws on
72
+ network errors; returns an `UploadResult` with `.success`,
73
+ `.events_uploaded`, `.warnings`, etc.
74
+
75
+ Python orchestration via context manager:
76
+
77
+ ```python
78
+ with gpufl.session(app_name="train",
79
+ backend_url="https://api.gpuflight.com",
80
+ api_key="gpfl_xxxxx"):
81
+ train_one_epoch()
82
+ # On __exit__: shutdown() then upload_logs() — automatic.
83
+ ```
84
+
85
+ #### `gpufl upload` CLI
86
+
87
+ Post-mortem / ad-hoc shipping tool. Registered via
88
+ `[project.scripts]` in `pyproject.toml`:
89
+
90
+ ```bash
91
+ gpufl upload /tmp/runs/train --backend-url ... --api-key ...
92
+ gpufl upload /tmp/runs/train --session-id <uuid>
93
+ gpufl upload /tmp/runs/train --all-sessions
94
+ gpufl upload /tmp/runs/train --force # bypass cursor check
95
+ ```
96
+
97
+ Default behavior uploads only the **latest** session found in the
98
+ directory (most recent `job_start.ts_ns`). `--session-id` picks a
99
+ specific one; `--all-sessions` ships every session present.
100
+
101
+ #### Session-aware cursor file
102
+
103
+ `.gpufl-upload-cursor.json` (in the log directory) tracks which
104
+ sessions have completed a successful upload. Re-running `gpufl
105
+ upload` on a completed session refuses with a clear message
106
+ suggesting `--force`; `--all-sessions` mode silently skips completed
107
+ sessions and uploads the rest. Survives across runs to skip
108
+ already-uploaded rotated files.
109
+
110
+ #### `ProfilingEngine` — clarified names
111
+
112
+ The engine enum was reworked into a single, plainly-named ladder
113
+ (no aliases). New default is `Monitor` (telemetry only, no CUPTI).
114
+
115
+ | Name | What it captures |
116
+ |---|---|
117
+ | `Monitor` | GPU/host health metrics only — no CUPTI. The default. |
118
+ | `Trace` | + activity trace: kernels, memcpy, sync (no sampling) |
119
+ | `PcSampling` | + PC stall-reason sampling |
120
+ | `SassMetrics` | + per-instruction SASS counters |
121
+ | `RangeProfiler` | + hardware throughput counters |
122
+ | `Deep` | `PcSampling` + `SassMetrics` in one run |
123
+
124
+ Replaces the earlier `None` / `KernelTrace` / `Continuous` / `Range` /
125
+ `PcSamplingWithSass` names. Pre-1.0, no deprecation shim — the old
126
+ names are gone.
127
+
128
+ #### Ref-counted system-metric sampler
129
+
130
+ `Sampler::configure()` / `activate()` / `deactivate()` / `shutdown()`
131
+ replaces the old `start()` / `stop()`. Activation count composes
132
+ across `continuous_system_sampling` baseline, `GFL_SCOPE` enter/exit,
133
+ and explicit `systemStart()` / `systemStop()` calls — the sampler
134
+ runs while any activator is in flight.
135
+
136
+ ### Bug fixes
137
+
138
+ #### Scope-driven system sampling now works
139
+
140
+ Before: setting `sampling_auto_start=false` silently disabled all
141
+ system metrics, even inside `GFL_SCOPE` regions. The flag's name
142
+ suggested "wait for explicit start" semantics but the code disabled
143
+ sampling entirely. Now, under the renamed `continuous_system_sampling
144
+ = false`, the sampler activates while inside any scope or between
145
+ `systemStart` / `systemStop` calls, then idles outside that window.
146
+
147
+ #### EventWrapper envelope on upload POSTs
148
+
149
+ The initial `uploadLogs()` draft POSTed bare NDJSON event lines.
150
+ The backend's `EventIngestionController` deserialized those into an
151
+ `EventWrapper` with every field null, the inner `objectMapper.readValue
152
+ (null, ...)` threw, the exception was caught and swallowed, and the
153
+ controller returned 200 OK anyway — silent data loss. Every event is
154
+ now correctly wrapped in `{data, agentSendingTime, hostname, ipAddr}`.
155
+
156
+ Regression test added in `tests/upload/test_upload_logs.cpp`.
157
+
158
+ ### Tests added
159
+
160
+ - `tests/core/test_sampler.cpp` — 8 scenarios for the ref-counted
161
+ Sampler (activate/deactivate, nesting, force-shutdown, unbalanced
162
+ deactivate clamping).
163
+ - `tests/upload/test_upload_logs.cpp` — 12 scenarios for the upload
164
+ path (happy path, headers, cursor refusal + force override, auth
165
+ failure, malformed lines, session-id filter, all-sessions,
166
+ lifecycle ordering, EventWrapper envelope regression guard).
167
+ - `tests/python/test_continuous_system_sampling.py` — 5 integration
168
+ scenarios for the three sampling modes plus deprecation behavior.
169
+
170
+ ### Internal / build
171
+
172
+ - Removed `include/gpufl/core/logger/http_log_sink.{hpp,cpp}`.
173
+ - Added `include/gpufl/upload/upload_logs.{hpp,cpp}` to the CMake
174
+ target sources.
175
+ - `CMakeLists.txt` `project(VERSION)` bumped to 1.1.0; new
176
+ `GPUFL_VERSION_SUFFIX` variable layers the PEP 440 pre-release
177
+ token onto `GPUFL_CLIENT_VERSION` (currently `"rc2"`; set to `""`
178
+ to promote to 1.1.0 final).
179
+
180
+ ### Migration checklist for 1.0.x → 1.1.0rc2
181
+
182
+ **Optional in v1.1, required by v1.2:**
183
+
184
+ - [ ] Python: replace every `gpufl.init(remote_upload=True, ...)` call
185
+ with `with gpufl.session(backend_url=..., api_key=...):` or an
186
+ explicit `gpufl.upload_logs(...)` after `shutdown()`. The old form
187
+ still works in v1.1 with a `DeprecationWarning`; v1.2 will remove it.
188
+ - [ ] C++: replace `opts.remote_upload = true;` with an explicit
189
+ `gpufl::uploadLogs(uopts)` after `gpufl::shutdown()`. The field
190
+ still compiles in v1.1 and, with a deprecation log, auto-calls `gpufl::uploadLogs()` at the end of `gpufl::shutdown()`; v1.2 will delete it.
191
+ - [ ] Container manifests: prefer dropping `GPUFL_REMOTE_UPLOAD` and
192
+ driving upload via your app code (or the `gpufl upload` CLI in a
193
+ lifecycle hook). The env var still routes through the Python shim
194
+ in v1.1; v1.2 stops reading it.
195
+ - [ ] Future-proof: start passing `backend_url` and `api_key` directly
196
+ to `gpufl::uploadLogs()` / `gpufl.upload_logs()` rather than relying
197
+ on the InitOptions fields. Those InitOptions fields will move to
198
+ UploadOptions only in v1.2.
199
+
200
+ **Required in v1.1 (no grace period for C++):**
201
+
202
+ - [ ] Python: rename `sampling_auto_start` → `continuous_system_sampling`.
203
+ The old name still works with a `DeprecationWarning` (removed in v1.2).
204
+ - [ ] C++: rename `opts.sampling_auto_start` → `opts.continuous_system_sampling`
205
+ (compile-time error otherwise — no grace period for C++).
206
+ - [ ] If you `#include`'d `http_log_sink.hpp` directly anywhere,
207
+ drop the include — the header is gone.
208
+
209
+ ---
210
+
211
+ ## Releases prior to 1.1.0
212
+
213
+ See git tags for the historical record. Highlights:
214
+
215
+ - **1.0.3** — `ScopeMeta` benchmark-iteration helper, scope iterator
216
+ form, `gpufl.report` text summary improvements.
217
+ - **1.0.2** — first version published to PyPI; "Stable" status.
218
+ - **1.0.1** — `kernel_sample_rate_ms` deprecated (no-op).
219
+ - **1.0.0** — first stable contract.
@@ -1,11 +1,18 @@
1
1
  cmake_minimum_required(VERSION 3.31)
2
2
 
3
3
  project(gpufl_client
4
- VERSION 1.0.2
4
+ VERSION 1.1.0
5
5
  LANGUAGES CXX
6
6
  DESCRIPTION "Header-only GPU monitoring client library"
7
7
  )
8
8
 
9
+ # Pre-release suffix appended to GPUFL_CLIENT_VERSION below. PEP 440
10
+ # pre-release tokens (`rc1`, `a1`, `b1`, …) aren't valid in CMake's
11
+ # `project(... VERSION ...)`, so we layer them on top here. To promote
12
+ # 1.1.0rc2 → 1.1.0 final, set this to the empty string. To bump rc2 →
13
+ # rc3 mid-validation, change just this line.
14
+ set(GPUFL_VERSION_SUFFIX "rc2")
15
+
9
16
  # -----------------------
10
17
  # CUDA Architectures (CI Friendly)
11
18
  # -----------------------
@@ -22,6 +29,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
22
29
  # -----------------------
23
30
  option(GPUFL_ENABLE_NVIDIA "Enable NVIDIA backends (CUDA + NVML when available)" ON)
24
31
  option(GPUFL_ENABLE_AMD "Enable AMD backends (ROCm when available)" OFF)
32
+ option(GPUFL_ENABLE_AMD_ROCPROFILER "Enable AMD rocprofiler-sdk tracing backend when available" ON)
25
33
 
26
34
  option(BUILD_GPUFL_EXAMPLE "Build gpufl example application" ON)
27
35
  option(BUILD_GPUFL_MONITOR "Build gpufl-monitor standalone daemon" ON)
@@ -56,7 +64,7 @@ target_compile_features(gpufl INTERFACE cxx_std_17)
56
64
  # inline the literal at call sites and the mismatch becomes visible
57
65
  # as comparison failures (e.g. test asserting on User-Agent).
58
66
  target_compile_definitions(gpufl PUBLIC
59
- GPUFL_CLIENT_VERSION="${PROJECT_VERSION}"
67
+ GPUFL_CLIENT_VERSION="${PROJECT_VERSION}${GPUFL_VERSION_SUFFIX}"
60
68
  )
61
69
 
62
70
  # Enable PIC for static library (required when linking into shared libraries like Python modules)
@@ -68,7 +76,7 @@ target_sources(gpufl PRIVATE
68
76
  include/gpufl/core/logger/logger.cpp
69
77
  include/gpufl/core/logger/log_rotator.cpp
70
78
  include/gpufl/core/logger/file_log_sink.cpp
71
- include/gpufl/core/logger/http_log_sink.cpp
79
+ include/gpufl/upload/upload_logs.cpp
72
80
  include/gpufl/core/host_info.cpp
73
81
  include/gpufl/core/remote_config.cpp
74
82
  include/gpufl/core/model/batch_models.cpp
@@ -147,7 +155,9 @@ target_sources(gpufl PRIVATE include/gpufl/core/logger/file_compressor.cpp)
147
155
 
148
156
 
149
157
  # -----------------------
150
- # cpp-httplib — HTTP client for direct-to-backend log upload (HttpLogSink).
158
+ # cpp-httplib — HTTP client used by gpufl::uploadLogs (deferred upload
159
+ # of session NDJSON files to the backend, called after gpufl::shutdown).
160
+ # Also used by remote_config.cpp for the post-init version probe.
151
161
  #
152
162
  # Single-header library. Fetched once via FetchContent so every build gets
153
163
  # the same version regardless of the host system. HTTPS support is gated
@@ -167,30 +177,37 @@ FetchContent_MakeAvailable(httplib)
167
177
 
168
178
  find_package(OpenSSL QUIET)
169
179
  if(OpenSSL_FOUND)
170
- message(STATUS "Found OpenSSL: ${OPENSSL_VERSION} — HttpLogSink HTTPS enabled")
180
+ message(STATUS "Found OpenSSL: ${OPENSSL_VERSION} — HTTPS upload enabled")
171
181
  target_compile_definitions(gpufl PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT=1)
172
182
  target_link_libraries(gpufl PRIVATE OpenSSL::SSL OpenSSL::Crypto)
173
183
  set(GPUFL_HTTPLIB_TLS 1)
174
184
  else()
175
185
  message(WARNING
176
- "OpenSSL not found — HttpLogSink will support HTTP only. "
177
- "Users pointing remote_config at an https:// endpoint will see "
178
- "upload failures. Install OpenSSL (apt: libssl-dev, vcpkg: openssl, "
179
- "brew: openssl) to enable TLS.")
186
+ "OpenSSL not found — gpufl::uploadLogs will support HTTP only. "
187
+ "Pointing backend_url at an https:// endpoint will fail to verify. "
188
+ "Install OpenSSL (apt: libssl-dev, vcpkg: openssl, brew: openssl) "
189
+ "to enable TLS.")
180
190
  set(GPUFL_HTTPLIB_TLS 0)
181
191
  endif()
182
192
  target_compile_definitions(gpufl PRIVATE GPUFL_HTTPLIB_TLS=${GPUFL_HTTPLIB_TLS})
183
- # NOTE: cpp-httplib's gzip request-body path (CPPHTTPLIB_ZLIB_SUPPORT)
184
- # is intentionally NOT defined. HttpLogSink sends uncompressed JSON
185
- # per event; bandwidth-conscious deployments use gpufl-agent against
186
- # the rotated NDJSON files (where deflate amortizes its dictionary
187
- # across the whole file for 10-15× compression vs the ~5× we'd get
188
- # per-POST). See the architectural note above HttpLogSink::Options
189
- # in http_log_sink.hpp for the full rationale.
193
+ # Enable cpp-httplib's gzip path. PUBLIC so any consumer that includes
194
+ # httplib.h via gpufl's headers (notably the test target's embedded
195
+ # httplib::Server) gets the same wire-format support. Without this,
196
+ # server::Post handlers return 415 on incoming Content-Encoding: gzip
197
+ # requests, which is exactly what uploadLogs sends in v1.2+.
198
+ #
199
+ # In production, the Spring Boot backend handles gzip natively, so
200
+ # this define affects the test target more than the client. We still
201
+ # define it on gpufl PUBLIC because:
202
+ # - It enables future use of httplib::Client::set_compress(true)
203
+ # to compress the body in-place instead of our manual gzipString
204
+ # (no current need, but a free optimization if we ever want it).
205
+ # - It propagates to tests, which is the immediate motivation.
190
206
  #
191
- # ZLIB is still linked above (around line 96) — file_compressor.cpp
192
- # uses it to gzip rotated NDJSON files written by FileLogSink, which
193
- # is a separate, working feature.
207
+ # ZLIB is already linked above — file_compressor.cpp uses it to gzip
208
+ # rotated NDJSON files, and upload_logs.cpp uses it both to read those
209
+ # files back and to gzip outgoing stream-chunks.
210
+ target_compile_definitions(gpufl PUBLIC CPPHTTPLIB_ZLIB_SUPPORT=1)
194
211
  target_link_libraries(gpufl PRIVATE httplib::httplib)
195
212
 
196
213
 
@@ -225,6 +242,7 @@ if(GPUFL_ENABLE_NVIDIA)
225
242
  include/gpufl/backends/nvidia/sampler/cupti_sass.hpp
226
243
  include/gpufl/backends/nvidia/cuda_collector.cpp
227
244
  include/gpufl/backends/nvidia/cupti_utils.cpp
245
+ include/gpufl/backends/nvidia/cuda_cleanup_handler.cpp
228
246
  include/gpufl/backends/nvidia/resource_handler.cpp
229
247
  include/gpufl/backends/nvidia/kernel_launch_handler.cpp
230
248
  include/gpufl/backends/nvidia/mem_transfer_handler.cpp
@@ -481,18 +499,22 @@ if(GPUFL_ENABLE_AMD)
481
499
  endif()
482
500
  endif()
483
501
 
484
- find_package(rocprofiler-sdk QUIET CONFIG HINTS /opt/rocm)
485
- if(TARGET rocprofiler-sdk::rocprofiler-sdk)
486
- set(GPUFL_HAS_ROCPROFILER_SDK 1)
487
- target_link_libraries(gpufl PRIVATE rocprofiler-sdk::rocprofiler-sdk)
488
- target_sources(gpufl PRIVATE
489
- include/gpufl/backends/amd/monitor_adapter_amd.cpp
490
- include/gpufl/backends/amd/rocprofiler_backend.cpp
491
- include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp
492
- )
493
- message(STATUS "Found ROCprofiler-SDK support")
502
+ if(GPUFL_ENABLE_AMD_ROCPROFILER)
503
+ find_package(rocprofiler-sdk QUIET CONFIG HINTS /opt/rocm)
504
+ if(TARGET rocprofiler-sdk::rocprofiler-sdk)
505
+ set(GPUFL_HAS_ROCPROFILER_SDK 1)
506
+ target_link_libraries(gpufl PRIVATE rocprofiler-sdk::rocprofiler-sdk)
507
+ target_sources(gpufl PRIVATE
508
+ include/gpufl/backends/amd/monitor_adapter_amd.cpp
509
+ include/gpufl/backends/amd/rocprofiler_backend.cpp
510
+ include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp
511
+ )
512
+ message(STATUS "Found ROCprofiler-SDK support")
513
+ else()
514
+ message(STATUS "ROCprofiler-SDK not found; AMD kernel tracing disabled")
515
+ endif()
494
516
  else()
495
- message(STATUS "ROCprofiler-SDK not found; AMD kernel tracing disabled")
517
+ message(STATUS "ROCprofiler-SDK support disabled by GPUFL_ENABLE_AMD_ROCPROFILER=OFF")
496
518
  endif()
497
519
 
498
520
  if(GPUFL_HAS_ROCM_SMI OR GPUFL_HAS_HIP)
@@ -44,6 +44,7 @@ RUN cmake -S . -B /build \
44
44
  -DBUILD_GPUFL_MONITOR=ON \
45
45
  -DGPUFL_ENABLE_NVIDIA=OFF \
46
46
  -DGPUFL_ENABLE_AMD=ON \
47
+ -DGPUFL_ENABLE_AMD_ROCPROFILER=OFF \
47
48
  && cmake --build /build --target gpufl-monitor --parallel
48
49
 
49
50
  # ── Stage 2: pull the pre-built Java agent ────────────────────────────────────
@@ -53,22 +54,13 @@ FROM ghcr.io/gpu-flight/gpufl-agent:latest AS agent-jar
53
54
  FROM eclipse-temurin:25-jre AS jre
54
55
 
55
56
  # ── Stage 4: runtime image ─────────────────────────────────────────────────────
56
- FROM ubuntu:24.04
57
+ # Use ROCm base runtime to avoid fragile manual ROCm apt repo/package wiring.
58
+ FROM rocm/dev-ubuntu-24.04:6.4-complete
57
59
 
58
60
  ENV DEBIAN_FRONTEND=noninteractive
59
61
 
60
- # Add ROCm apt repository
61
62
  RUN apt-get update && apt-get install -y --no-install-recommends \
62
- ca-certificates \
63
- wget \
64
- gnupg \
65
- && wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/keyrings/rocm.gpg \
66
- && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.4 noble main' \
67
- > /etc/apt/sources.list.d/rocm.list \
68
- && apt-get update && apt-get install -y --no-install-recommends \
69
63
  supervisor \
70
- rocm-smi-lib \
71
- rocm-hip-runtime \
72
64
  && rm -rf /var/lib/apt/lists/*
73
65
 
74
66
  # Copy Java 25 JRE from Temurin (Ubuntu 24.04 repos only ship up to openjdk-21)