gpufl 0.1.4__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. {gpufl-0.1.4 → gpufl-1.0.0}/.github/workflows/release.yml +84 -3
  2. {gpufl-0.1.4 → gpufl-1.0.0}/CMakeLists.txt +1 -1
  3. {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.monitor +5 -1
  4. {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.monitor.amd +5 -1
  5. {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.monitor.supervisord.conf +3 -1
  6. {gpufl-0.1.4 → gpufl-1.0.0}/PKG-INFO +69 -10
  7. {gpufl-0.1.4 → gpufl-1.0.0}/README.md +66 -7
  8. {gpufl-0.1.4 → gpufl-1.0.0}/daemon/README.md +6 -5
  9. {gpufl-0.1.4 → gpufl-1.0.0}/docker-compose.monitor.amd.yml +6 -2
  10. {gpufl-0.1.4 → gpufl-1.0.0}/docker-compose.monitor.yml +6 -2
  11. {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/gpufl_scope_demo.cpp +0 -1
  12. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/block_style_example.cu +0 -1
  13. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/memory_coalescing_demo.cu +0 -1
  14. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/occupancy_demo.cu +0 -1
  15. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/sass_divergence_demo.cu +2 -5
  16. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/vector_add_benchmark.cu +0 -1
  17. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/02_numba_cuda.py +42 -2
  18. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/03_pytorch_benchmark.py +2 -4
  19. gpufl-1.0.0/images/Screenshot2.png +0 -0
  20. gpufl-1.0.0/include/gpufl/backends/nvidia/cuda_feature_guards.hpp +39 -0
  21. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_backend.hpp +17 -0
  22. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/kernel_launch_handler.cpp +86 -76
  23. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/resource_handler.cpp +9 -0
  24. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/config_file_loader.cpp +4 -2
  25. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/events.hpp +17 -5
  26. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/gpufl.cpp +1 -12
  27. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/itanium_demangle.cpp +58 -4
  28. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/itanium_demangle.hpp +5 -0
  29. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/http_log_sink.cpp +80 -12
  30. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/http_log_sink.hpp +28 -3
  31. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/lifecycle_model.cpp +0 -8
  32. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor.hpp +0 -1
  33. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/remote_config.cpp +0 -3
  34. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stack_trace.cpp +17 -1
  35. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/gpufl.hpp +0 -1
  36. {gpufl-0.1.4 → gpufl-1.0.0}/pyproject.toml +14 -3
  37. {gpufl-0.1.4 → gpufl-1.0.0}/python/bindings.cpp +39 -55
  38. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/__init__.py +3 -19
  39. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/analyzer/analyzer.py +52 -3
  40. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/report/__init__.py +2 -0
  41. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/report/text_report.py +8 -0
  42. gpufl-1.0.0/python/gpufl/viz/reader.py +239 -0
  43. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/viz/timeline.py +194 -78
  44. {gpufl-0.1.4 → gpufl-1.0.0}/scripts/windows/run-monitor-local.bat +3 -1
  45. {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_engine_coverage.cpp +0 -1
  46. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_http_log_sink.cpp +109 -0
  47. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_wire_contract.cpp +16 -1
  48. {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/test_bindings.py +0 -3
  49. {gpufl-0.1.4 → gpufl-1.0.0}/tests/verify_pipeline.py +4 -3
  50. gpufl-0.1.4/python/gpufl/viz/reader.py +0 -48
  51. {gpufl-0.1.4 → gpufl-1.0.0}/.clang-format +0 -0
  52. {gpufl-0.1.4 → gpufl-1.0.0}/.dockerignore +0 -0
  53. {gpufl-0.1.4 → gpufl-1.0.0}/.github/pull_request_template.md +0 -0
  54. {gpufl-0.1.4 → gpufl-1.0.0}/.github/workflows/build.yml +0 -0
  55. {gpufl-0.1.4 → gpufl-1.0.0}/.gitignore +0 -0
  56. {gpufl-0.1.4 → gpufl-1.0.0}/CONTRIBUTING.md +0 -0
  57. {gpufl-0.1.4 → gpufl-1.0.0}/Dockerfile.demo +0 -0
  58. {gpufl-0.1.4 → gpufl-1.0.0}/LICENSE +0 -0
  59. {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/README.md +0 -0
  60. {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/cuda_gemm.py +0 -0
  61. {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/pytorch_train.py +0 -0
  62. {gpufl-0.1.4 → gpufl-1.0.0}/benchmark/run_benchmark.py +0 -0
  63. {gpufl-0.1.4 → gpufl-1.0.0}/build.sh +0 -0
  64. {gpufl-0.1.4 → gpufl-1.0.0}/daemon/monitor/CMakeLists.txt +0 -0
  65. {gpufl-0.1.4 → gpufl-1.0.0}/daemon/monitor/main.cpp +0 -0
  66. {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/CMakeLists.txt +0 -0
  67. {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/README.md +0 -0
  68. {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/check_device.cpp +0 -0
  69. {gpufl-0.1.4 → gpufl-1.0.0}/example/amd/vector_add_benchmark.cpp +0 -0
  70. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/CMakeLists.txt +0 -0
  71. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/check_conflict.cu +0 -0
  72. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/check_device.cu +0 -0
  73. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/cupti_basic.cu +0 -0
  74. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/cupti_pc_sampling.cu +0 -0
  75. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/list_sass_metrics.cu +0 -0
  76. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/system_monitor.cu +0 -0
  77. {gpufl-0.1.4 → gpufl-1.0.0}/example/cuda/test_occupancy.cu +0 -0
  78. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/01_basic.py +0 -0
  79. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/analyzer/01_analyzer_sample.py +0 -0
  80. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/requirements.txt +0 -0
  81. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/viz/01_plot_memory_timeline.py +0 -0
  82. {gpufl-0.1.4 → gpufl-1.0.0}/example/python/viz/02_plot_stress_timeline.py +0 -0
  83. {gpufl-0.1.4 → gpufl-1.0.0}/images/Screenshot1.png +0 -0
  84. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/engine/amd_profiling_engine.hpp +0 -0
  85. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/engine/dispatch_counter_engine.cpp +0 -0
  86. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/engine/dispatch_counter_engine.hpp +0 -0
  87. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/hip_static_collector.cpp +0 -0
  88. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/hip_static_collector.hpp +0 -0
  89. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/monitor_adapter_amd.cpp +0 -0
  90. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/monitor_adapter_amd.hpp +0 -0
  91. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocm_collector.cpp +0 -0
  92. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocm_collector.hpp +0 -0
  93. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocprofiler_backend.cpp +0 -0
  94. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/amd/rocprofiler_backend.hpp +0 -0
  95. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/host_collector.hpp +0 -0
  96. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cuda_collector.cpp +0 -0
  97. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cuda_collector.hpp +0 -0
  98. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_backend.cpp +0 -0
  99. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_common.hpp +0 -0
  100. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_utils.cpp +0 -0
  101. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/cupti_utils.hpp +0 -0
  102. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.cpp +0 -0
  103. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_engine.hpp +0 -0
  104. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.cpp +0 -0
  105. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/pc_sampling_with_sass_engine.hpp +0 -0
  106. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/profiling_engine.hpp +0 -0
  107. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/range_profiler_engine.cpp +0 -0
  108. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/range_profiler_engine.hpp +0 -0
  109. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.cpp +0 -0
  110. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/engine/sass_metrics_engine.hpp +0 -0
  111. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/kernel_launch_handler.hpp +0 -0
  112. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/mem_transfer_handler.cpp +0 -0
  113. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/mem_transfer_handler.hpp +0 -0
  114. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.cpp +0 -0
  115. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/monitor_adapter_nvidia.hpp +0 -0
  116. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/nvml_collector.cpp +0 -0
  117. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/nvml_collector.hpp +0 -0
  118. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/resource_handler.hpp +0 -0
  119. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/sampler/cupti_sass.cpp +0 -0
  120. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/sampler/cupti_sass.hpp +0 -0
  121. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/synchronization_handler.cpp +0 -0
  122. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/backends/nvidia/synchronization_handler.hpp +0 -0
  123. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/activity_record.hpp +0 -0
  124. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/backend_factory.cpp +0 -0
  125. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/backend_factory.hpp +0 -0
  126. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/backend_interfaces.hpp +0 -0
  127. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/batch_buffer.hpp +0 -0
  128. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/common.cpp +0 -0
  129. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/common.hpp +0 -0
  130. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/config_file_loader.hpp +0 -0
  131. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/debug_logger.cpp +0 -0
  132. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/debug_logger.hpp +0 -0
  133. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/dictionary_manager.cpp +0 -0
  134. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/dictionary_manager.hpp +0 -0
  135. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/host_info.cpp +0 -0
  136. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/host_info.hpp +0 -0
  137. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/json/json.cpp +0 -0
  138. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/json/json.hpp +0 -0
  139. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_compressor.cpp +0 -0
  140. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_compressor.hpp +0 -0
  141. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_log_sink.cpp +0 -0
  142. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/file_log_sink.hpp +0 -0
  143. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/log_rotator.cpp +0 -0
  144. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/log_rotator.hpp +0 -0
  145. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/log_sink.hpp +0 -0
  146. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/logger.cpp +0 -0
  147. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/logger/logger.hpp +0 -0
  148. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/batch_models.cpp +0 -0
  149. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/batch_models.hpp +0 -0
  150. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/graph_launch_event_model.cpp +0 -0
  151. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/graph_launch_event_model.hpp +0 -0
  152. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/kernel_event_model.cpp +0 -0
  153. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/kernel_event_model.hpp +0 -0
  154. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/lifecycle_model.hpp +0 -0
  155. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memcpy_event_model.cpp +0 -0
  156. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memcpy_event_model.hpp +0 -0
  157. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memory_alloc_event_model.cpp +0 -0
  158. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/memory_alloc_event_model.hpp +0 -0
  159. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/model_utils.hpp +0 -0
  160. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/nvtx_marker_model.cpp +0 -0
  161. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/nvtx_marker_model.hpp +0 -0
  162. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/perf_metric_model.cpp +0 -0
  163. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/perf_metric_model.hpp +0 -0
  164. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/profile_sample_model.cpp +0 -0
  165. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/profile_sample_model.hpp +0 -0
  166. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/scope_event_model.cpp +0 -0
  167. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/scope_event_model.hpp +0 -0
  168. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/serializable.hpp +0 -0
  169. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/synchronization_event_model.cpp +0 -0
  170. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/synchronization_event_model.hpp +0 -0
  171. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/system_event_model.cpp +0 -0
  172. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/model/system_event_model.hpp +0 -0
  173. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor.cpp +0 -0
  174. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor_adapter.cpp +0 -0
  175. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor_adapter.hpp +0 -0
  176. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/monitor_backend.hpp +0 -0
  177. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/remote_config.hpp +0 -0
  178. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/ring_buffer.hpp +0 -0
  179. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/runtime.cpp +0 -0
  180. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/runtime.hpp +0 -0
  181. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sampler.cpp +0 -0
  182. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sampler.hpp +0 -0
  183. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sass_compressor.cpp +0 -0
  184. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/sass_compressor.hpp +0 -0
  185. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/scope_registry.cpp +0 -0
  186. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/scope_registry.hpp +0 -0
  187. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stack_registry.hpp +0 -0
  188. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stack_trace.hpp +0 -0
  189. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/stream_handle.hpp +0 -0
  190. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/trace_type.hpp +0 -0
  191. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/core/version.hpp +0 -0
  192. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/hint_engine.cpp +0 -0
  193. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/hint_engine.hpp +0 -0
  194. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/text_report.cpp +0 -0
  195. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl/report/text_report.hpp +0 -0
  196. {gpufl-0.1.4 → gpufl-1.0.0}/include/gpufl.hpp +0 -0
  197. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/.gitignore +0 -0
  198. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/analyzer/__init__.py +0 -0
  199. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/cupy/__init__.py +0 -0
  200. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/jax/__init__.py +0 -0
  201. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/numba/__init__.py +0 -0
  202. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/__init__.py +0 -0
  203. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/dispatch.py +0 -0
  204. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/profile.py +0 -0
  205. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/stack.py +0 -0
  206. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/torch/trace_import.py +0 -0
  207. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/triton/__init__.py +0 -0
  208. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/utils.py +0 -0
  209. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/viz/__init__.py +0 -0
  210. {gpufl-0.1.4 → gpufl-1.0.0}/python/gpufl/viz/visualizer.py +0 -0
  211. {gpufl-0.1.4 → gpufl-1.0.0}/scripts/docker-demo-loop.sh +0 -0
  212. {gpufl-0.1.4 → gpufl-1.0.0}/tests/CMakeLists.txt +0 -0
  213. {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/amd/test_rocm_collector.cpp +0 -0
  214. {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_cuda_collector.cpp +0 -0
  215. {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_nvidia_backend.cpp +0 -0
  216. {gpufl-0.1.4 → gpufl-1.0.0}/tests/backends/nvidia/test_nvml_collector.cpp +0 -0
  217. {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/log_utils.cpp +0 -0
  218. {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/log_utils.hpp +0 -0
  219. {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/test_kernel.cu +0 -0
  220. {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/test_kernel.hpp +0 -0
  221. {gpufl-0.1.4 → gpufl-1.0.0}/tests/common/test_utils.hpp +0 -0
  222. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_analyzer.cpp +0 -0
  223. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_api_path_routing.cpp +0 -0
  224. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_batch_models.cpp +0 -0
  225. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_itanium_demangle.cpp +0 -0
  226. {gpufl-0.1.4 → gpufl-1.0.0}/tests/core/test_monitor.cpp +0 -0
  227. {gpufl-0.1.4 → gpufl-1.0.0}/tests/main_test_runner.cpp +0 -0
  228. {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/conftest.py +0 -0
  229. {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/test_analyzer.py +0 -0
  230. {gpufl-0.1.4 → gpufl-1.0.0}/tests/python/test_remote_upload_smoke.py +0 -0
  231. {gpufl-0.1.4 → gpufl-1.0.0}/tests/run_engine_coverage.ps1 +0 -0
  232. {gpufl-0.1.4 → gpufl-1.0.0}/tests/run_engine_coverage.sh +0 -0
@@ -136,16 +136,65 @@ jobs:
136
136
  uses: pypa/cibuildwheel@v2.22.0
137
137
  env:
138
138
  CIBW_VIRTUALENV_VERSION: "20.27.1"
139
- CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF'"
139
+ # OpenSSL hints: manylinux_2_28 (AlmaLinux 8) ships OpenSSL 1.1.1
140
+ # as the system default, but cpp-httplib v0.18.5 requires >= 3.0.0
141
+ # (SSL_get1_peer_certificate). We install EL8's `openssl3-devel`
142
+ # compat package (see CIBW_BEFORE_ALL_LINUX) which lays OpenSSL 3.x
143
+ # down under NON-standard prefixes — headers in /usr/include/openssl3,
144
+ # dev symlinks in /usr/lib64/openssl3 — so find_package(OpenSSL)
145
+ # won't see it without these explicit cache vars. The runtime
146
+ # SONAME is still libssl.so.3 in /usr/lib64, so auditwheel bundles
147
+ # it into gpufl.libs/ as before. Verified against the actual
148
+ # quay.io/pypa/manylinux_2_28_x86_64 image (OpenSSL 3.5.5).
149
+ CIBW_ENVIRONMENT_LINUX: "CUDA_HOME=/usr/local/cuda PATH=/usr/local/cuda/bin:$PATH CMAKE_ARGS='-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF -DOPENSSL_INCLUDE_DIR=/usr/include/openssl3 -DOPENSSL_SSL_LIBRARY=/usr/lib64/openssl3/libssl.so -DOPENSSL_CRYPTO_LIBRARY=/usr/lib64/openssl3/libcrypto.so'"
150
+ # Windows build needs the OpenSSL install path so find_package(OpenSSL)
151
+ # in CMakeLists.txt succeeds, otherwise HTTPS upload (HttpLogSink)
152
+ # silently falls back to HTTP-only — see openssl-windows.html in the
153
+ # manual repo for the user-facing story. CIBW_BEFORE_ALL_WINDOWS
154
+ # installs choco's openssl package into this path.
155
+ CIBW_ENVIRONMENT_WINDOWS: >-
156
+ OPENSSL_ROOT_DIR="C:/Program Files/OpenSSL-Win64"
157
+ CMAKE_ARGS="-DGPUFL_ENABLE_NVIDIA=ON -DGPUFL_ENABLE_AMD=OFF -DBUILD_TESTING=OFF"
140
158
  # cuda-nvml-devel-13-1 ships the libnvidia-ml.so stub under
141
159
  # targets/x86_64-linux/lib/stubs/ — without it CMake's NVML probe
142
160
  # finds nothing and (since v0.1.1) fails the build loudly. Every
143
161
  # release before v0.1.1 silently shipped wheels without NVML
144
162
  # because this package was missing here.
163
+ #
164
+ # openssl3-devel (NOT openssl-devel — that's 1.1.1 on EL8) provides
165
+ # OpenSSL 3.x headers + .so symlinks so cpp-httplib's
166
+ # CPPHTTPLIB_OPENSSL_SUPPORT path compiles (it #errors on < 3.0.0).
167
+ # It installs under /usr/include/openssl3 + /usr/lib64/openssl3 —
168
+ # see the OPENSSL_* hints in CIBW_ENVIRONMENT_LINUX. auditwheel
169
+ # bundles the resulting libssl.so.3 / libcrypto.so.3 (from
170
+ # /usr/lib64) into the wheel under gpufl.libs/ automatically
171
+ # (they're not on the manylinux_2_28 whitelist or our --exclude list).
145
172
  CIBW_BEFORE_ALL_LINUX: >-
146
173
  curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo > /etc/yum.repos.d/cuda.repo &&
147
- dnf install -y --nogpgcheck cuda-nvcc-13-1 cuda-cudart-devel-13-1 cuda-cupti-13-1 cuda-driver-devel-13-1 cuda-nvml-devel-13-1
148
- CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
174
+ dnf install -y --nogpgcheck cuda-nvcc-13-1 cuda-cudart-devel-13-1 cuda-cupti-13-1 cuda-driver-devel-13-1 cuda-nvml-devel-13-1 openssl3-devel
175
+ # Install OpenSSL on the Windows runner so find_package(OpenSSL)
176
+ # in CMakeLists.txt succeeds and cpp-httplib gets compiled with
177
+ # CPPHTTPLIB_OPENSSL_SUPPORT=1. Chocolatey is pre-installed on
178
+ # GitHub's windows-latest runners; the openssl package installs
179
+ # to C:\Program Files\OpenSSL-Win64\ (matches OPENSSL_ROOT_DIR
180
+ # in CIBW_ENVIRONMENT_WINDOWS above).
181
+ CIBW_BEFORE_ALL_WINDOWS: choco install -y openssl --no-progress
182
+ # cibuildwheel ships NO default Windows repair command and only
183
+ # auto-installs delvewheel for that (nonexistent) default. Because
184
+ # we override CIBW_REPAIR_WHEEL_COMMAND_WINDOWS below (to pass
185
+ # --add-path for the OpenSSL DLLs), we must install delvewheel
186
+ # ourselves. The repair step runs in this same build env, so the
187
+ # tool lands on PATH. Without this the repair dies with
188
+ # "'delvewheel' is not recognized as an internal or external command".
189
+ CIBW_BEFORE_BUILD_WINDOWS: pip install delvewheel
190
+ # Pin a recent manylinux_2_28 image (AlmaLinux 8.10). cibuildwheel
191
+ # 2.22.0's default pin (2024.11.16-1) is an AlmaLinux 8.6 snapshot
192
+ # whose repos only carry OpenSSL 1.1.1 — there is NO openssl3
193
+ # package — so the build died with "No match for argument:
194
+ # openssl3-devel". 8.10 ships openssl3 (3.5.5). Same glibc-2.28 /
195
+ # manylinux_2_28 ABI; verified to have cp312+cp313, openssl3-devel,
196
+ # and libssl.so.3 in /usr/lib64 (so auditwheel still bundles it).
197
+ CIBW_MANYLINUX_X86_64_IMAGE: quay.io/pypa/manylinux_2_28_x86_64:2026.05.17-1
149
198
  CIBW_BUILD: "cp312-manylinux_x86_64 cp313-manylinux_x86_64 cp312-win_amd64 cp313-win_amd64"
150
199
  # libnvidia-ml.so.1 is excluded for the same reason as libcuda.so.1:
151
200
  # it ships with the NVIDIA driver, not the CUDA toolkit, and is
@@ -156,7 +205,39 @@ jobs:
156
205
  # could not be located"). The toolkit's `libnvidia-ml.so` stub is
157
206
  # only the unversioned link-time placeholder — the versioned
158
207
  # `.so.1` the SONAME chains to lives on the user's machine.
208
+ #
209
+ # libssl/libcrypto are NOT excluded — auditwheel bundles them
210
+ # under gpufl.libs/ so the wheel ships its own OpenSSL and
211
+ # HTTPS works on user machines without any system install.
159
212
  CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --plat manylinux_2_28_x86_64 --exclude libcuda.so.1 --exclude libnvidia-ml.so.1 -w {dest_dir} {wheel}"
213
+ # On Windows, cibuildwheel has no default repair command, so we run `delvewheel repair` ourselves.
214
+ # We need delvewheel to find the OpenSSL DLLs (libssl-3-x64.dll,
215
+ # libcrypto-3-x64.dll) so it copies them into the wheel. The
216
+ # choco install puts them under C:\Program Files\OpenSSL-Win64\bin\
217
+ # — give that to delvewheel via --add-path. Without this, the
218
+ # rebuilt wheel imports cleanly on a system that already has
219
+ # OpenSSL on PATH but fails on a clean machine.
220
+ # delvewheel vendors the wheel's DLL deps. We must:
221
+ # * --add-path the dirs holding the DLLs to bundle. OpenSSL is in
222
+ # its choco bin; cudart64_*.dll is in CUDA\vX.Y\bin (on PATH, but
223
+ # listed for safety); cupti64_*.dll lives in CUDA's
224
+ # extras\CUPTI\lib64, which is NOT on PATH — without it delvewheel
225
+ # fails with "Unable to find library: cupti64_2025.4.0.dll". We
226
+ # bundle cudart+cupti so the wheel is self-contained, matching the
227
+ # Linux wheel (auditwheel bundles libcudart/libcupti). The CUPTI
228
+ # version-suffixed DLL name also makes excluding it fragile across
229
+ # CUDA point releases, so bundling is the robust choice.
230
+ # * --exclude the driver DLLs that ship with the user's driver, not
231
+ # the toolkit, and are absent on the GPU-less runner: nvcuda.dll
232
+ # (== libcuda.so.1) and nvml.dll (== libnvidia-ml.so.1). Mirrors
233
+ # the Linux auditwheel --exclude flags.
234
+ # Paths use the pinned CUDA 13.1 location (see the Jimver cuda-toolkit
235
+ # step). Both --add-path and --exclude are ';'-delimited.
236
+ CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: >-
237
+ delvewheel repair
238
+ --add-path "C:\\Program Files\\OpenSSL-Win64\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.1\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.1\\extras\\CUPTI\\lib64"
239
+ --exclude "nvcuda.dll;nvml.dll"
240
+ -w {dest_dir} {wheel}
160
241
 
161
242
  - uses: actions/upload-artifact@v4
162
243
  with:
@@ -1,7 +1,7 @@
1
1
  cmake_minimum_required(VERSION 3.31)
2
2
 
3
3
  project(gpufl_client
4
- VERSION 0.1.4
4
+ VERSION 1.0.0
5
5
  LANGUAGES CXX
6
6
  DESCRIPTION "Header-only GPU monitoring client library"
7
7
  )
@@ -5,10 +5,14 @@
5
5
  #
6
6
  # Run:
7
7
  # docker run --gpus all \
8
- # -e GPUFL_HTTP_URL=http://my-backend:8080/api/v1/events/ \
8
+ # -e GPUFL_HTTP_HOST=https://api.gpuflight.com \
9
9
  # -e GPUFL_HTTP_TOKEN=gfl_... \
10
10
  # gpufl/monitor:latest
11
11
  #
12
+ # GPUFL_HTTP_HOST is just the scheme+host. The agent appends the
13
+ # /api/{version}/events/<type> path automatically; override the
14
+ # version with GPUFL_HTTP_API_VERSION when the backend cuts v2 etc.
15
+ #
12
16
  # The Java agent JAR is pulled from the pre-built ghcr.io/gpu-flight/gpufl-agent image.
13
17
  # No local gpufl-agent checkout is required.
14
18
 
@@ -7,10 +7,14 @@
7
7
  # docker run -d \
8
8
  # --device /dev/kfd --device /dev/dri \
9
9
  # --group-add video --group-add render \
10
- # -e GPUFL_HTTP_URL=http://my-backend:8080/api/v1/events/ \
10
+ # -e GPUFL_HTTP_HOST=https://api.gpuflight.com \
11
11
  # -e GPUFL_HTTP_TOKEN=gfl_... \
12
12
  # gpufl/monitor-amd:latest
13
13
  #
14
+ # GPUFL_HTTP_HOST is just the scheme+host. The agent appends the
15
+ # /api/{version}/events/<type> path automatically; override the
16
+ # version with GPUFL_HTTP_API_VERSION when the backend cuts v2 etc.
17
+ #
14
18
  # The Java agent JAR is pulled from the pre-built ghcr.io/gpu-flight/gpufl-agent image.
15
19
  # No local gpufl-agent checkout is required.
16
20
 
@@ -21,7 +21,9 @@ stderr_logfile_maxbytes=0
21
21
  ; GPUFL_SOURCE_FOLDER — must match the log dir used by gpufl-monitor
22
22
  ; GPUFL_SOURCE_PREFIX — must match GPUFL_MONITOR_LOG_DIR base name (default: session)
23
23
  ; GPUFL_PUBLISHER_TYPE — http or kafka
24
- ; GPUFL_HTTP_URL — e.g. http://backend:8080/api/v1/events/
24
+ ; GPUFL_HTTP_HOST — scheme+host, e.g. https://api.gpuflight.com
25
+ ; (agent appends /api/{version}/events/<type> automatically)
26
+ ; GPUFL_HTTP_API_VERSION — optional; defaults to v1
25
27
  ; GPUFL_HTTP_TOKEN — Bearer token
26
28
  ; GPUFL_LOG_TYPES — default: device,scope,system (override to restrict channels)
27
29
  ; GPUFL_CURSOR_FILE — default: ./cursor.json (override for persistence across restarts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: gpufl
3
- Version: 0.1.4
3
+ Version: 1.0.0
4
4
  Summary: GPU Monitoring Client
5
5
  Author-Email: Myoungho Shin <myounghoshin84@gmail.com>
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -22,7 +22,7 @@ Requires-Dist: jax>=0.4; extra == "jax"
22
22
  Provides-Extra: triton
23
23
  Requires-Dist: triton>=2.1; extra == "triton"
24
24
  Provides-Extra: numba
25
- Requires-Dist: numba; extra == "numba"
25
+ Requires-Dist: numba-cuda; extra == "numba"
26
26
  Requires-Dist: numpy; extra == "numba"
27
27
  Provides-Extra: viz
28
28
  Requires-Dist: pandas>=1.5; extra == "viz"
@@ -36,7 +36,7 @@ Requires-Dist: requests>=2.28; extra == "all"
36
36
  Requires-Dist: cupy-cuda12x>=12; extra == "all"
37
37
  Requires-Dist: jax>=0.4; extra == "all"
38
38
  Requires-Dist: triton>=2.1; extra == "all"
39
- Requires-Dist: numba; extra == "all"
39
+ Requires-Dist: numba-cuda; extra == "all"
40
40
  Requires-Dist: numpy; extra == "all"
41
41
  Requires-Dist: pandas>=1.5; extra == "all"
42
42
  Requires-Dist: matplotlib>=3.7; extra == "all"
@@ -63,7 +63,7 @@ To keep the initial design coherent, **we are not currently accepting major feat
63
63
 
64
64
  Try the portal with real session data — no sign-up required:
65
65
 
66
- **[https://gpufl-front.vercel.app/demo/gdemo_4R98GA5MzYdosqvNsUqdp_MaUgEcDHABS2C5PHbCDQE](https://gpufl-front.vercel.app/demo/gdemo_4R98GA5MzYdosqvNsUqdp_MaUgEcDHABS2C5PHbCDQE)**
66
+ **[Demo Link](https://demo.gpuflight.com)**
67
67
 
68
68
  ## Key Features
69
69
 
@@ -128,7 +128,6 @@ import gpufl
128
128
  gpufl.init("my-app",
129
129
  log_path="./my_logs",
130
130
  sampling_auto_start=True,
131
- enable_kernel_details=True,
132
131
  enable_stack_trace=True)
133
132
 
134
133
  a = torch.randn(1024, 1024, device="cuda")
@@ -150,8 +149,7 @@ from gpufl import ProfilingEngine
150
149
 
151
150
  gpufl.init("my-app",
152
151
  log_path="./logs",
153
- profiling_engine=ProfilingEngine.PcSampling,
154
- enable_kernel_details=True)
152
+ profiling_engine=ProfilingEngine.PcSampling)
155
153
  ```
156
154
 
157
155
  | Engine | What it collects | Analyzer method | Best for |
@@ -167,7 +165,6 @@ gpufl.init("my-app",
167
165
  gpufl::InitOptions opts;
168
166
  opts.app_name = "my_app";
169
167
  opts.log_path = "my_logs";
170
- opts.enable_kernel_details = true;
171
168
  opts.enable_stack_trace = true;
172
169
  opts.sampling_auto_start = true;
173
170
  opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
@@ -305,6 +302,60 @@ viz.show()
305
302
 
306
303
  ---
307
304
 
305
+ ## Report Generation
306
+
307
+ For a quick, shareable text summary of a session — session metadata, kernel
308
+ hotspots, duration percentiles, and system metrics — generate a **text report**.
309
+ It's the fastest way to see "what happened" without opening the dashboard, and
310
+ it drops cleanly into CI logs, PR comments, or a plain terminal.
311
+
312
+ ![Text report example](images/Screenshot2.png)
313
+
314
+ The report includes:
315
+ - **Session Summary** — app name, session ID, duration, GPU device + SM count.
316
+ - **Kernel Execution Summary** — total / unique kernels, GPU-busy %, and
317
+ duration statistics (avg / median / P90 / P99 / min / max). When a SASS
318
+ profiling engine was active, kernel durations include instrumentation
319
+ overhead and the report labels them accordingly.
320
+ - **Top kernels by total GPU time** — with per-kernel call counts.
321
+ - **Per-kernel details** — grid/block dimensions, occupancy, registers,
322
+ shared memory (static + dynamic), register spills, and Waves/SM.
323
+
324
+ ### From C++
325
+
326
+ Call `generateReport()` after `shutdown()` — it reads the NDJSON logs written
327
+ during the session:
328
+
329
+ ```cpp
330
+ gpufl::init(opts);
331
+ // ... your CUDA / HIP work ...
332
+ gpufl::shutdown();
333
+
334
+ gpufl::generateReport(); // print to stdout
335
+ gpufl::generateReport("report.txt"); // or save to a file
336
+ ```
337
+
338
+ ### From Python
339
+
340
+ ```python
341
+ from gpufl.report import generate_report
342
+
343
+ # Print the report — wrap in print() so newlines render. In a Jupyter
344
+ # notebook this also keeps the table columns aligned (stdout renders in
345
+ # a monospace font). A bare `generate_report(...)` as a cell's last
346
+ # expression shows an escaped one-line string, so always print() it.
347
+ text = generate_report("./logs", log_prefix="my_app", top_n=10)
348
+ print(text)
349
+
350
+ # Or save it straight to a file
351
+ generate_report("./logs", log_prefix="my_app", top_n=10, output_path="report.txt")
352
+ ```
353
+
354
+ The Python version reads the same NDJSON logs the analyzer uses — no GPU
355
+ required, so you can generate reports from logs copied off another machine.
356
+
357
+ ---
358
+
308
359
  ## Testing
309
360
 
310
361
  ### C++ Tests
@@ -345,5 +396,13 @@ To allow non-root users to profile GPU kernels (using CUPTI/PC Sampling) on Linu
345
396
 
346
397
  ---
347
398
 
348
- *GPU Flight is open source: [github.com/gpu-flight](https://github.com/gpu-flight)*
349
- *Python package: [pypi.org/project/gpufl](https://pypi.org/project/gpufl/)*
399
+ ## Where your logs go
400
+
401
+ By default the client writes NDJSON logs to disk. To stream them to a hosted
402
+ dashboard, set `backend_url` + `api_key` (or the `GPUFL_BACKEND_URL` /
403
+ `GPUFL_API_KEY` env vars) and they're delivered live to
404
+ [app.gpuflight.com](https://app.gpuflight.com). Create a workspace at
405
+ [gpuflight.com](https://gpuflight.com).
406
+
407
+ This client (gpufl-client) is open source. The ingestion service and
408
+ the dashboard UI are proprietary and managed-only today.
@@ -18,7 +18,7 @@ To keep the initial design coherent, **we are not currently accepting major feat
18
18
 
19
19
  Try the portal with real session data — no sign-up required:
20
20
 
21
- **[https://gpufl-front.vercel.app/demo/gdemo_4R98GA5MzYdosqvNsUqdp_MaUgEcDHABS2C5PHbCDQE](https://gpufl-front.vercel.app/demo/gdemo_4R98GA5MzYdosqvNsUqdp_MaUgEcDHABS2C5PHbCDQE)**
21
+ **[Demo Link](https://demo.gpuflight.com)**
22
22
 
23
23
  ## Key Features
24
24
 
@@ -83,7 +83,6 @@ import gpufl
83
83
  gpufl.init("my-app",
84
84
  log_path="./my_logs",
85
85
  sampling_auto_start=True,
86
- enable_kernel_details=True,
87
86
  enable_stack_trace=True)
88
87
 
89
88
  a = torch.randn(1024, 1024, device="cuda")
@@ -105,8 +104,7 @@ from gpufl import ProfilingEngine
105
104
 
106
105
  gpufl.init("my-app",
107
106
  log_path="./logs",
108
- profiling_engine=ProfilingEngine.PcSampling,
109
- enable_kernel_details=True)
107
+ profiling_engine=ProfilingEngine.PcSampling)
110
108
  ```
111
109
 
112
110
  | Engine | What it collects | Analyzer method | Best for |
@@ -122,7 +120,6 @@ gpufl.init("my-app",
122
120
  gpufl::InitOptions opts;
123
121
  opts.app_name = "my_app";
124
122
  opts.log_path = "my_logs";
125
- opts.enable_kernel_details = true;
126
123
  opts.enable_stack_trace = true;
127
124
  opts.sampling_auto_start = true;
128
125
  opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
@@ -260,6 +257,60 @@ viz.show()
260
257
 
261
258
  ---
262
259
 
260
+ ## Report Generation
261
+
262
+ For a quick, shareable text summary of a session — session metadata, kernel
263
+ hotspots, duration percentiles, and system metrics — generate a **text report**.
264
+ It's the fastest way to see "what happened" without opening the dashboard, and
265
+ it drops cleanly into CI logs, PR comments, or a plain terminal.
266
+
267
+ ![Text report example](images/Screenshot2.png)
268
+
269
+ The report includes:
270
+ - **Session Summary** — app name, session ID, duration, GPU device + SM count.
271
+ - **Kernel Execution Summary** — total / unique kernels, GPU-busy %, and
272
+ duration statistics (avg / median / P90 / P99 / min / max). When a SASS
273
+ profiling engine was active, kernel durations include instrumentation
274
+ overhead and the report labels them accordingly.
275
+ - **Top kernels by total GPU time** — with per-kernel call counts.
276
+ - **Per-kernel details** — grid/block dimensions, occupancy, registers,
277
+ shared memory (static + dynamic), register spills, and Waves/SM.
278
+
279
+ ### From C++
280
+
281
+ Call `generateReport()` after `shutdown()` — it reads the NDJSON logs written
282
+ during the session:
283
+
284
+ ```cpp
285
+ gpufl::init(opts);
286
+ // ... your CUDA / HIP work ...
287
+ gpufl::shutdown();
288
+
289
+ gpufl::generateReport(); // print to stdout
290
+ gpufl::generateReport("report.txt"); // or save to a file
291
+ ```
292
+
293
+ ### From Python
294
+
295
+ ```python
296
+ from gpufl.report import generate_report
297
+
298
+ # Print the report — wrap in print() so newlines render. In a Jupyter
299
+ # notebook this also keeps the table columns aligned (stdout renders in
300
+ # a monospace font). A bare `generate_report(...)` as a cell's last
301
+ # expression shows an escaped one-line string, so always print() it.
302
+ text = generate_report("./logs", log_prefix="my_app", top_n=10)
303
+ print(text)
304
+
305
+ # Or save it straight to a file
306
+ generate_report("./logs", log_prefix="my_app", top_n=10, output_path="report.txt")
307
+ ```
308
+
309
+ The Python version reads the same NDJSON logs the analyzer uses — no GPU
310
+ required, so you can generate reports from logs copied off another machine.
311
+
312
+ ---
313
+
263
314
  ## Testing
264
315
 
265
316
  ### C++ Tests
@@ -300,5 +351,13 @@ To allow non-root users to profile GPU kernels (using CUPTI/PC Sampling) on Linu
300
351
 
301
352
  ---
302
353
 
303
- *GPU Flight is open source: [github.com/gpu-flight](https://github.com/gpu-flight)*
304
- *Python package: [pypi.org/project/gpufl](https://pypi.org/project/gpufl/)*
354
+ ## Where your logs go
355
+
356
+ By default the client writes NDJSON logs to disk. To stream them to a hosted
357
+ dashboard, set `backend_url` + `api_key` (or the `GPUFL_BACKEND_URL` /
358
+ `GPUFL_API_KEY` env vars) and they're delivered live to
359
+ [app.gpuflight.com](https://app.gpuflight.com). Create a workspace at
360
+ [gpuflight.com](https://gpuflight.com).
361
+
362
+ This client (gpufl-client) is open source. The ingestion service and
363
+ the dashboard UI are proprietary and managed-only today.
@@ -79,7 +79,7 @@ docker build \
79
79
  Copy `.env.example` to `.env` and set the required variables, then:
80
80
 
81
81
  ```bash
82
- GPUFL_HTTP_URL=https://your-backend/api/v1/events/ \
82
+ GPUFL_HTTP_HOST=https://your-backend \
83
83
  GPUFL_HTTP_TOKEN=gfl_your_token_here \
84
84
  docker compose -f docker-compose.monitor.yml up -d
85
85
  ```
@@ -99,7 +99,7 @@ docker compose -f docker-compose.monitor.yml down
99
99
  ### AMD
100
100
 
101
101
  ```bash
102
- GPUFL_HTTP_URL=https://your-backend/api/v1/events/ \
102
+ GPUFL_HTTP_HOST=https://your-backend \
103
103
  GPUFL_HTTP_TOKEN=gfl_your_token_here \
104
104
  docker compose -f docker-compose.monitor.amd.yml up -d
105
105
  ```
@@ -127,7 +127,7 @@ docker run -d \
127
127
  --name gpufl-monitor \
128
128
  --gpus all \
129
129
  --restart unless-stopped \
130
- -e GPUFL_HTTP_URL=https://your-backend/api/v1/events/ \
130
+ -e GPUFL_HTTP_HOST=https://your-backend \
131
131
  -e GPUFL_HTTP_TOKEN=gfl_your_token_here \
132
132
  -v gpufl-cursor:/var/gpufl/monitor \
133
133
  gpufl/monitor:latest
@@ -141,7 +141,7 @@ docker run -d \
141
141
  --device /dev/kfd --device /dev/dri \
142
142
  --group-add video --group-add render \
143
143
  --restart unless-stopped \
144
- -e GPUFL_HTTP_URL=https://your-backend/api/v1/events/ \
144
+ -e GPUFL_HTTP_HOST=https://your-backend \
145
145
  -e GPUFL_HTTP_TOKEN=gfl_your_token_here \
146
146
  -v gpufl-cursor-amd:/var/gpufl/monitor \
147
147
  gpufl/monitor-amd:latest
@@ -175,7 +175,8 @@ The named volume persists the agent's read cursor so it resumes from where it le
175
175
  | Variable | Default | Description |
176
176
  |---|---|---|
177
177
  | `GPUFL_PUBLISHER_TYPE` | `http` | Publisher backend: `http` or `kafka` |
178
- | `GPUFL_HTTP_URL` | *(required)* | Backend ingest URL, e.g. `https://app.gpuflight.io/api/v1/events/` |
178
+ | `GPUFL_HTTP_HOST` | *(required)* | Backend scheme+host, e.g. `https://api.gpuflight.com`. The agent appends `/api/{version}/events/<type>` automatically. |
179
+ | `GPUFL_HTTP_API_VERSION` | `v1` | Backend API version. Bump when the backend cuts v2 etc. |
179
180
  | `GPUFL_HTTP_TOKEN` | *(empty)* | Bearer token for the backend API |
180
181
  | `GPUFL_HTTP_TIMEOUT_SEC` | `10` | HTTP request timeout in seconds |
181
182
 
@@ -22,9 +22,13 @@ services:
22
22
  GPUFL_LOG_TYPES: ${GPUFL_LOG_TYPES:-device,scope,system}
23
23
  GPUFL_CURSOR_FILE: ${GPUFL_CURSOR_FILE:-/var/gpufl/monitor/cursor.json}
24
24
 
25
- # Java agent — publisher (HTTP)
25
+ # Java agent — publisher (HTTP). Set GPUFL_HTTP_HOST to just the
26
+ # scheme+host (e.g. https://api.gpuflight.com); the agent builds
27
+ # the /api/{version}/events/<type> path itself. Override the
28
+ # version via GPUFL_HTTP_API_VERSION when the backend bumps to v2.
26
29
  GPUFL_PUBLISHER_TYPE: ${GPUFL_PUBLISHER_TYPE:-http}
27
- GPUFL_HTTP_URL: ${GPUFL_HTTP_URL}
30
+ GPUFL_HTTP_HOST: ${GPUFL_HTTP_HOST}
31
+ GPUFL_HTTP_API_VERSION: ${GPUFL_HTTP_API_VERSION:-v1}
28
32
  GPUFL_HTTP_TOKEN: ${GPUFL_HTTP_TOKEN:-}
29
33
  GPUFL_HTTP_TIMEOUT_SEC: ${GPUFL_HTTP_TIMEOUT_SEC:-10}
30
34
 
@@ -39,9 +39,13 @@ services:
39
39
  GPUFL_SOURCE_FOLDERS: ${GPUFL_SOURCE_FOLDERS:-/var/gpufl/monitor,/var/gpufl/demo}
40
40
  GPUFL_CURSOR_FILE: ${GPUFL_CURSOR_FILE:-/var/gpufl/monitor/cursor.json}
41
41
 
42
- # Java agent — publisher (HTTP)
42
+ # Java agent — publisher (HTTP). Set GPUFL_HTTP_HOST to just the
43
+ # scheme+host (e.g. https://api.gpuflight.com); the agent builds
44
+ # the /api/{version}/events/<type> path itself. Override the
45
+ # version via GPUFL_HTTP_API_VERSION when the backend bumps to v2.
43
46
  GPUFL_PUBLISHER_TYPE: ${GPUFL_PUBLISHER_TYPE:-http}
44
- GPUFL_HTTP_URL: ${GPUFL_HTTP_URL}
47
+ GPUFL_HTTP_HOST: ${GPUFL_HTTP_HOST}
48
+ GPUFL_HTTP_API_VERSION: ${GPUFL_HTTP_API_VERSION:-v1}
45
49
  GPUFL_HTTP_TOKEN: ${GPUFL_HTTP_TOKEN:-}
46
50
  GPUFL_HTTP_TIMEOUT_SEC: ${GPUFL_HTTP_TIMEOUT_SEC:-10}
47
51
 
@@ -130,7 +130,6 @@ int main() {
130
130
  opts.system_sample_rate_ms = 50;
131
131
  opts.kernel_sample_rate_ms = 0;
132
132
  opts.sampling_auto_start = true;
133
- opts.enable_kernel_details = true;
134
133
  opts.enable_debug_output = true;
135
134
  opts.enable_stack_trace = false;
136
135
  opts.profiling_engine = gpufl::ProfilingEngine::SassMetrics;
@@ -38,7 +38,6 @@ int main() {
38
38
  opts.log_path = "gfl_block";
39
39
  opts.system_sample_rate_ms = 50;
40
40
  opts.kernel_sample_rate_ms = 50;
41
- opts.enable_kernel_details = true;
42
41
  opts.sampling_auto_start = true;
43
42
  opts.enable_debug_output = true;
44
43
  opts.enable_source_collection = true;
@@ -61,7 +61,6 @@ int main() {
61
61
  opts.log_path = "memory_coalescing_demo";
62
62
  opts.system_sample_rate_ms = 10;
63
63
  opts.kernel_sample_rate_ms = 10;
64
- opts.enable_kernel_details = true;
65
64
  opts.sampling_auto_start = true;
66
65
  opts.enable_debug_output = true;
67
66
  opts.profiling_engine = gpufl::ProfilingEngine::PcSamplingWithSass;
@@ -79,7 +79,6 @@ int main()
79
79
  gpufl::InitOptions opts;
80
80
  opts.app_name = "occupancy_demo";
81
81
  opts.log_path = "occupancy_demo.log";
82
- opts.enable_kernel_details = true; // required for occupancy breakdown fields
83
82
  opts.sampling_auto_start = true;
84
83
  opts.enable_debug_output = false;
85
84
 
@@ -144,15 +144,12 @@ int main() {
144
144
  gpufl::InitOptions opts;
145
145
  opts.app_name = "sass_divergence_demo";
146
146
  opts.log_path = "sass_divergence";
147
- if (const char* k = std::getenv("GPUFL_API_KEY")) opts.api_key = k;
148
- opts.backend_url = "http://localhost:8080";
149
- opts.remote_upload = true;
147
+ opts.remote_upload = false;
150
148
  opts.system_sample_rate_ms = 10;
151
- opts.enable_kernel_details = true;
152
149
  opts.enable_debug_output = true;
153
150
  opts.sampling_auto_start = true;
154
151
  opts.enable_stack_trace = true;
155
- opts.profiling_engine = gpufl::ProfilingEngine::PcSampling;
152
+ opts.profiling_engine = gpufl::ProfilingEngine::PcSamplingWithSass;
156
153
 
157
154
  if (!gpufl::init(opts)) {
158
155
  std::cerr << "Failed to initialize gpufl" << std::endl;
@@ -27,7 +27,6 @@ int main() {
27
27
  opts.log_path = "vector_add_benchmark";
28
28
  opts.system_sample_rate_ms = 50;
29
29
  opts.kernel_sample_rate_ms = 50;
30
- opts.enable_kernel_details = true;
31
30
  opts.sampling_auto_start = true;
32
31
  opts.enable_debug_output = true;
33
32
  opts.enable_source_collection = true;
@@ -1,7 +1,9 @@
1
1
  import gpufl as gfl
2
+ from gpufl.report import generate_report
2
3
  import numpy as np
3
4
  from numba import cuda
4
5
  import math
6
+ import os
5
7
  import time
6
8
 
7
9
  # --- 1. Define a Real CUDA Kernel (Matrix Mul) ---
@@ -20,9 +22,32 @@ def matmul_kernel(A, B, C):
20
22
 
21
23
  def run_benchmark():
22
24
  # --- 2. Initialize GPUFL ---
23
- # We enable the background sampler (16ms) to catch VRAM/Power usage during the heavy compute
25
+ # LOG_PATH is the file prefix the FileLogSink writes to; it produces
26
+ # <LOG_PATH>.device.log / .scope.log / .system.log. We reuse it below
27
+ # to point generate_report() at the same files.
28
+ LOG_PATH = "./gfl_logs"
29
+
30
+ BACKEND_URL = os.environ.get("GPUFL_BACKEND_URL", "api.gpuflight.com")
31
+ API_KEY = os.environ.get("GPUFL_API_KEY", "")
32
+ REMOTE_UPLOAD = bool(API_KEY)
33
+
24
34
  print("[GPUFL] Initializing...")
25
- gfl.init("Numba_App", "./gfl_logs", 100)
35
+ if REMOTE_UPLOAD:
36
+ print(f"[GPUFL] Live upload ON -> {BACKEND_URL}")
37
+ else:
38
+ print("[GPUFL] Live upload OFF (set GPUFL_API_KEY to enable). Local files only.")
39
+
40
+ gfl.init(
41
+ app_name="Numba_App",
42
+ log_path=LOG_PATH,
43
+ sampling_auto_start=True,
44
+ system_sample_rate_ms=100,
45
+ enable_debug_output=True,
46
+ profiling_engine=gfl.ProfilingEngine.PcSamplingWithSass,
47
+ backend_url=BACKEND_URL,
48
+ api_key=API_KEY,
49
+ remote_upload=REMOTE_UPLOAD,
50
+ )
26
51
 
27
52
  try:
28
53
  # --- 3. Setup Data (Heavy Load) ---
@@ -70,6 +95,21 @@ def run_benchmark():
70
95
  print("[GPUFL] Shutting down...")
71
96
  gfl.shutdown()
72
97
 
98
+ # --- 6. Generate a text report from the logs we just wrote ---
99
+ # shutdown() above flushes and closes the NDJSON channels, so the
100
+ # report reflects the full session. generate_report reads the same
101
+ # logs the analyzer uses — no GPU required for this step. We split
102
+ # LOG_PATH into (dir, prefix) the way GpuFlightSession expects:
103
+ # "./gfl_logs" -> dir=".", prefix="gfl_logs"
104
+ # -> reads ./gfl_logs.{device,scope,system}.log
105
+ # Wrap in print() so the report renders with real newlines (and,
106
+ # in a Jupyter notebook, in the monospace stdout area so the
107
+ # kernel tables stay aligned).
108
+ log_dir = os.path.dirname(LOG_PATH) or "."
109
+ log_prefix = os.path.basename(LOG_PATH)
110
+ print("\n[GPUFL] Session report:\n")
111
+ print(generate_report(log_dir, log_prefix=log_prefix, top_n=10))
112
+
73
113
  if __name__ == "__main__":
74
114
  if cuda.is_available():
75
115
  run_benchmark()
@@ -31,9 +31,7 @@ def run_stress_test():
31
31
  sampling_auto_start=True,
32
32
  system_sample_rate_ms=50,
33
33
  kernel_sample_rate_ms=50,
34
- enable_kernel_details=True,
35
- enable_debug_output=True,
36
- enable_profiling=True,
34
+ enable_debug_output=False,
37
35
  enable_stack_trace=True,
38
36
  # opt-in to memory tracking. Default-off in v1
39
37
  # because TF eager and similar workloads can produce
@@ -49,7 +47,7 @@ def run_stress_test():
49
47
  remote_upload=remote_upload,
50
48
  api_key=api_key,
51
49
  backend_url=backend_url,
52
- profiling_engine=gpufl.ProfilingEngine.PcSamplingWithSass)
50
+ profiling_engine=gpufl.ProfilingEngine.PcSampling)
53
51
 
54
52
  try:
55
53
  # 2. Allocate (Uses approx 3GB VRAM)
Binary file