stormlog 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. stormlog-0.2.3/.editorconfig +16 -0
  2. stormlog-0.2.3/.flake8 +17 -0
  3. stormlog-0.2.3/.github/workflows/ci.yml +266 -0
  4. stormlog-0.2.3/.github/workflows/release.yml +58 -0
  5. stormlog-0.2.3/.gitignore +112 -0
  6. stormlog-0.2.3/.pre-commit-config.yaml +25 -0
  7. stormlog-0.2.3/.readthedocs.yaml +14 -0
  8. stormlog-0.2.3/CHANGELOG.md +190 -0
  9. stormlog-0.2.3/CODE_OF_CONDUCT.md +143 -0
  10. stormlog-0.2.3/CONTRIBUTING.md +271 -0
  11. stormlog-0.2.3/LICENSE +21 -0
  12. stormlog-0.2.3/PKG-INFO +368 -0
  13. stormlog-0.2.3/PROJECT_STATUS.md +70 -0
  14. stormlog-0.2.3/README.md +256 -0
  15. stormlog-0.2.3/RELEASE_CHECKLIST.md +90 -0
  16. stormlog-0.2.3/SECURITY.md +113 -0
  17. stormlog-0.2.3/STYLE_GUIDE.md +197 -0
  18. stormlog-0.2.3/docs/api.md +55 -0
  19. stormlog-0.2.3/docs/architecture.md +433 -0
  20. stormlog-0.2.3/docs/article.md +1112 -0
  21. stormlog-0.2.3/docs/benchmark_harness.md +53 -0
  22. stormlog-0.2.3/docs/benchmarks/v0.2_budgets.json +10 -0
  23. stormlog-0.2.3/docs/cli.md +149 -0
  24. stormlog-0.2.3/docs/compatibility_matrix.md +61 -0
  25. stormlog-0.2.3/docs/conf.py +71 -0
  26. stormlog-0.2.3/docs/cpu_compatibility.md +433 -0
  27. stormlog-0.2.3/docs/examples/test_guides/README.md +120 -0
  28. stormlog-0.2.3/docs/examples.md +308 -0
  29. stormlog-0.2.3/docs/gpu-profiler-1.png +0 -0
  30. stormlog-0.2.3/docs/gpu-profiler-2.png +0 -0
  31. stormlog-0.2.3/docs/gpu-profiler-overview.gif +0 -0
  32. stormlog-0.2.3/docs/gpu-profiler-overview.mp4 +0 -0
  33. stormlog-0.2.3/docs/gpu_setup.md +99 -0
  34. stormlog-0.2.3/docs/index.md +42 -0
  35. stormlog-0.2.3/docs/installation.md +194 -0
  36. stormlog-0.2.3/docs/pytorch_testing_guide.md +1337 -0
  37. stormlog-0.2.3/docs/reference/api.rst +16 -0
  38. stormlog-0.2.3/docs/reference/index.md +15 -0
  39. stormlog-0.2.3/docs/requirements-rtd.txt +4 -0
  40. stormlog-0.2.3/docs/schemas/telemetry_event_v2.schema.json +114 -0
  41. stormlog-0.2.3/docs/telemetry_schema.md +122 -0
  42. stormlog-0.2.3/docs/tensorflow_testing_guide.md +896 -0
  43. stormlog-0.2.3/docs/testing.md +628 -0
  44. stormlog-0.2.3/docs/troubleshooting.md +518 -0
  45. stormlog-0.2.3/docs/tui-distributed-diagnostics-workflow.png +0 -0
  46. stormlog-0.2.3/docs/tui-distributed-diagnostics-workflow.svg +78 -0
  47. stormlog-0.2.3/docs/tui.md +161 -0
  48. stormlog-0.2.3/docs/usage.md +216 -0
  49. stormlog-0.2.3/examples/advanced/__init__.py +1 -0
  50. stormlog-0.2.3/examples/advanced/tracking_demo.py +190 -0
  51. stormlog-0.2.3/examples/basic/__init__.py +1 -0
  52. stormlog-0.2.3/examples/basic/pytorch_demo.py +120 -0
  53. stormlog-0.2.3/examples/basic/tensorflow_demo.py +98 -0
  54. stormlog-0.2.3/examples/cli/__init__.py +1 -0
  55. stormlog-0.2.3/examples/cli/benchmark_harness.py +296 -0
  56. stormlog-0.2.3/examples/cli/capability_matrix.py +368 -0
  57. stormlog-0.2.3/examples/cli/quickstart.py +69 -0
  58. stormlog-0.2.3/examples/common/__init__.py +65 -0
  59. stormlog-0.2.3/examples/common/capability_matrix_utils.py +89 -0
  60. stormlog-0.2.3/examples/common/cli.py +48 -0
  61. stormlog-0.2.3/examples/common/device.py +105 -0
  62. stormlog-0.2.3/examples/common/formatting.py +19 -0
  63. stormlog-0.2.3/examples/common/summary.py +53 -0
  64. stormlog-0.2.3/examples/common/tf_workflow.py +62 -0
  65. stormlog-0.2.3/examples/common/torch_workflow.py +81 -0
  66. stormlog-0.2.3/examples/scenarios/__init__.py +1 -0
  67. stormlog-0.2.3/examples/scenarios/cpu_telemetry_scenario.py +136 -0
  68. stormlog-0.2.3/examples/scenarios/mps_telemetry_scenario.py +153 -0
  69. stormlog-0.2.3/examples/scenarios/oom_flight_recorder_scenario.py +240 -0
  70. stormlog-0.2.3/examples/scenarios/tf_end_to_end_scenario.py +204 -0
  71. stormlog-0.2.3/examples/test_guides/README.md +16 -0
  72. stormlog-0.2.3/gpumemprof/__init__.py +150 -0
  73. stormlog-0.2.3/gpumemprof/_version.py +34 -0
  74. stormlog-0.2.3/gpumemprof/analyzer.py +895 -0
  75. stormlog-0.2.3/gpumemprof/cli.py +967 -0
  76. stormlog-0.2.3/gpumemprof/collective_attribution.py +603 -0
  77. stormlog-0.2.3/gpumemprof/context_profiler.py +349 -0
  78. stormlog-0.2.3/gpumemprof/cpu_profiler.py +472 -0
  79. stormlog-0.2.3/gpumemprof/device_collectors.py +244 -0
  80. stormlog-0.2.3/gpumemprof/diagnose.py +310 -0
  81. stormlog-0.2.3/gpumemprof/distributed_analysis.py +481 -0
  82. stormlog-0.2.3/gpumemprof/gap_analysis.py +234 -0
  83. stormlog-0.2.3/gpumemprof/oom_flight_recorder.py +226 -0
  84. stormlog-0.2.3/gpumemprof/profiler.py +493 -0
  85. stormlog-0.2.3/gpumemprof/telemetry.py +814 -0
  86. stormlog-0.2.3/gpumemprof/tracker.py +882 -0
  87. stormlog-0.2.3/gpumemprof/tui/__init__.py +21 -0
  88. stormlog-0.2.3/gpumemprof/tui/app.py +1617 -0
  89. stormlog-0.2.3/gpumemprof/tui/builders.py +256 -0
  90. stormlog-0.2.3/gpumemprof/tui/commands.py +73 -0
  91. stormlog-0.2.3/gpumemprof/tui/distributed_diagnostics.py +896 -0
  92. stormlog-0.2.3/gpumemprof/tui/monitor.py +358 -0
  93. stormlog-0.2.3/gpumemprof/tui/profiles.py +126 -0
  94. stormlog-0.2.3/gpumemprof/tui/styles.py +275 -0
  95. stormlog-0.2.3/gpumemprof/tui/widgets/__init__.py +26 -0
  96. stormlog-0.2.3/gpumemprof/tui/widgets/panels.py +21 -0
  97. stormlog-0.2.3/gpumemprof/tui/widgets/tables.py +218 -0
  98. stormlog-0.2.3/gpumemprof/tui/widgets/timeline.py +168 -0
  99. stormlog-0.2.3/gpumemprof/tui/widgets/welcome.py +73 -0
  100. stormlog-0.2.3/gpumemprof/tui/workloads.py +84 -0
  101. stormlog-0.2.3/gpumemprof/utils.py +545 -0
  102. stormlog-0.2.3/gpumemprof/visualizer.py +824 -0
  103. stormlog-0.2.3/pyproject.toml +252 -0
  104. stormlog-0.2.3/pytest.ini +8 -0
  105. stormlog-0.2.3/requirements-ci-base.txt +21 -0
  106. stormlog-0.2.3/requirements-ci-lint.in +5 -0
  107. stormlog-0.2.3/requirements-ci-lint.txt +48 -0
  108. stormlog-0.2.3/requirements-dev.txt +39 -0
  109. stormlog-0.2.3/requirements-test.txt +28 -0
  110. stormlog-0.2.3/requirements.txt +24 -0
  111. stormlog-0.2.3/setup.cfg +4 -0
  112. stormlog-0.2.3/setup.py +6 -0
  113. stormlog-0.2.3/stormlog.egg-info/PKG-INFO +368 -0
  114. stormlog-0.2.3/stormlog.egg-info/SOURCES.txt +161 -0
  115. stormlog-0.2.3/stormlog.egg-info/dependency_links.txt +1 -0
  116. stormlog-0.2.3/stormlog.egg-info/entry_points.txt +4 -0
  117. stormlog-0.2.3/stormlog.egg-info/requires.txt +70 -0
  118. stormlog-0.2.3/stormlog.egg-info/top_level.txt +2 -0
  119. stormlog-0.2.3/tests/conftest.py +4 -0
  120. stormlog-0.2.3/tests/e2e/test_tui_pty.py +90 -0
  121. stormlog-0.2.3/tests/gap_test_helpers.py +50 -0
  122. stormlog-0.2.3/tests/test_benchmark_harness.py +86 -0
  123. stormlog-0.2.3/tests/test_cli_analyze.py +241 -0
  124. stormlog-0.2.3/tests/test_cli_diagnose.py +441 -0
  125. stormlog-0.2.3/tests/test_cli_info.py +271 -0
  126. stormlog-0.2.3/tests/test_cli_oom_flight_recorder.py +150 -0
  127. stormlog-0.2.3/tests/test_collective_attribution.py +338 -0
  128. stormlog-0.2.3/tests/test_core_profiler.py +416 -0
  129. stormlog-0.2.3/tests/test_cpu_profiler.py +746 -0
  130. stormlog-0.2.3/tests/test_device_collectors.py +79 -0
  131. stormlog-0.2.3/tests/test_distributed_analysis.py +359 -0
  132. stormlog-0.2.3/tests/test_docs_regressions.py +40 -0
  133. stormlog-0.2.3/tests/test_examples_scenarios.py +79 -0
  134. stormlog-0.2.3/tests/test_gap_analysis.py +269 -0
  135. stormlog-0.2.3/tests/test_import_hardening.py +123 -0
  136. stormlog-0.2.3/tests/test_oom_flight_recorder.py +268 -0
  137. stormlog-0.2.3/tests/test_profiler.py +67 -0
  138. stormlog-0.2.3/tests/test_profiler_regressions.py +149 -0
  139. stormlog-0.2.3/tests/test_telemetry_v2.py +453 -0
  140. stormlog-0.2.3/tests/test_tf_env.py +25 -0
  141. stormlog-0.2.3/tests/test_tf_gap_analysis.py +281 -0
  142. stormlog-0.2.3/tests/test_tf_runtime_regressions.py +197 -0
  143. stormlog-0.2.3/tests/test_tf_telemetry_export.py +172 -0
  144. stormlog-0.2.3/tests/test_tfmemprof_diagnose.py +382 -0
  145. stormlog-0.2.3/tests/test_tracker_input_guards.py +30 -0
  146. stormlog-0.2.3/tests/test_utils.py +196 -0
  147. stormlog-0.2.3/tests/test_visualizer_cross_rank.py +57 -0
  148. stormlog-0.2.3/tests/tui/test_app_helpers.py +40 -0
  149. stormlog-0.2.3/tests/tui/test_app_pilot.py +524 -0
  150. stormlog-0.2.3/tests/tui/test_app_snapshots.py +175 -0
  151. stormlog-0.2.3/tests/tui/test_distributed_diagnostics.py +506 -0
  152. stormlog-0.2.3/tests/tui/test_monitor.py +145 -0
  153. stormlog-0.2.3/tests/tui/test_workloads.py +63 -0
  154. stormlog-0.2.3/tfmemprof/__init__.py +28 -0
  155. stormlog-0.2.3/tfmemprof/analyzer.py +452 -0
  156. stormlog-0.2.3/tfmemprof/cli.py +644 -0
  157. stormlog-0.2.3/tfmemprof/context_profiler.py +369 -0
  158. stormlog-0.2.3/tfmemprof/diagnose.py +282 -0
  159. stormlog-0.2.3/tfmemprof/profiler.py +466 -0
  160. stormlog-0.2.3/tfmemprof/tf_env.py +8 -0
  161. stormlog-0.2.3/tfmemprof/tracker.py +505 -0
  162. stormlog-0.2.3/tfmemprof/utils.py +600 -0
  163. stormlog-0.2.3/tfmemprof/visualizer.py +329 -0
@@ -0,0 +1,16 @@
1
+ # EditorConfig helps maintain consistent coding styles
2
+ root = true
3
+
4
+ [*]
5
+ charset = utf-8
6
+ end_of_line = lf
7
+ insert_final_newline = true
8
+ trim_trailing_whitespace = true
9
+ indent_style = space
10
+ indent_size = 4
11
+
12
+ [*.md]
13
+ trim_trailing_whitespace = false
14
+
15
+ [*.py]
16
+ indent_size = 4
stormlog-0.2.3/.flake8 ADDED
@@ -0,0 +1,17 @@
1
+ [flake8]
2
+ max-line-length = 88
3
+ extend-ignore = E203, W503, T499, E501
4
+ exclude = .git,__pycache__,docs,build,dist,venv,.venv,.eggs,*.egg,*.egg-info,.mypy_cache
5
+ per-file-ignores =
6
+ */__init__.py: F401, E402
7
+ examples/*: E402, F401, F811
8
+ tests/*: E402, F841
9
+ gpumemprof/tui/app.py: E402
10
+ gpumemprof/tui/monitor.py: E402
11
+ gpumemprof/tui/profiles.py: E402
12
+ gpumemprof/analyzer.py: F841
13
+ gpumemprof/cli.py: F841
14
+ gpumemprof/context_profiler.py: F841
15
+ gpumemprof/cpu_profiler.py: F841
16
+ gpumemprof/profiler.py: F841
17
+ tfmemprof/*: E402, F841
@@ -0,0 +1,266 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, develop]
6
+ pull_request:
7
+ branches: [main, release/v0.2-readiness]
8
+ schedule:
9
+ - cron: "0 3 * * *"
10
+
11
+ jobs:
12
+ test:
13
+ if: github.event_name != 'schedule'
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ fail-fast: false
17
+ matrix:
18
+ python-version: ["3.10", "3.11", "3.12"]
19
+ framework: [pytorch, tensorflow]
20
+ exclude:
21
+ - python-version: "3.12"
22
+ framework: tensorflow
23
+
24
+ steps:
25
+ - uses: actions/checkout@v4
26
+
27
+ - name: Set up Python ${{ matrix.python-version }}
28
+ uses: actions/setup-python@v4
29
+ with:
30
+ python-version: ${{ matrix.python-version }}
31
+
32
+ - name: Cache pip dependencies
33
+ uses: actions/cache@v3
34
+ with:
35
+ path: ~/.cache/pip
36
+ key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
37
+ restore-keys: |
38
+ ${{ runner.os }}-pip-${{ matrix.python-version }}-
39
+
40
+ - name: Install base dependencies
41
+ run: |
42
+ python3 -m pip install --upgrade pip
43
+ pip install -e . --no-deps
44
+ pip install -r requirements-ci-base.txt
45
+
46
+ - name: Install framework dependencies
47
+ run: |
48
+ if [ "${{ matrix.framework }}" = "pytorch" ]; then
49
+ pip install torch==2.2.2 --index-url https://download.pytorch.org/whl/cpu
50
+ else
51
+ pip install tensorflow-cpu==2.15.0
52
+ fi
53
+
54
+ - name: Run tests
55
+ run: |
56
+ if [ "${{ matrix.framework }}" = "pytorch" ]; then
57
+ python3 -m pytest tests/ --ignore-glob="tests/test_tf*.py" -v -m "not tui_pilot and not tui_snapshot and not tui_pty" --cov=gpumemprof --cov=tfmemprof --cov-report=xml
58
+ else
59
+ python3 -m pytest tests/ -o "python_files=test_tf*.py" -v -m "not tui_pilot and not tui_snapshot and not tui_pty" --cov=gpumemprof --cov=tfmemprof --cov-report=xml
60
+ fi
61
+
62
+ - name: Upload coverage to Codecov
63
+ uses: codecov/codecov-action@v3
64
+ with:
65
+ file: ./coverage.xml
66
+ flags: unittests
67
+ name: codecov-umbrella
68
+
69
+ tui-pr-gate:
70
+ if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/develop')
71
+ runs-on: ubuntu-latest
72
+
73
+ steps:
74
+ - uses: actions/checkout@v4
75
+
76
+ - name: Set up Python
77
+ uses: actions/setup-python@v4
78
+ with:
79
+ python-version: "3.11"
80
+
81
+ - name: Cache pip dependencies
82
+ uses: actions/cache@v3
83
+ with:
84
+ path: ~/.cache/pip
85
+ key: ${{ runner.os }}-pip-tui-pr-${{ hashFiles('pyproject.toml') }}
86
+ restore-keys: |
87
+ ${{ runner.os }}-pip-tui-pr-
88
+
89
+ - name: Install TUI test dependencies
90
+ run: |
91
+ python3 -m pip install --upgrade pip
92
+ pip install -e . --no-deps
93
+ pip install -r requirements-ci-base.txt
94
+ pip install torch==2.2.2 --index-url https://download.pytorch.org/whl/cpu
95
+
96
+ - name: Run TUI pilot and snapshot suites
97
+ run: |
98
+ python3 -m pytest tests/tui/ -m "tui_pilot or tui_snapshot" -v
99
+
100
+ tui-pty-smoke:
101
+ if: github.event_name == 'schedule' || (github.event_name == 'push' && github.ref == 'refs/heads/main')
102
+ runs-on: ubuntu-latest
103
+
104
+ steps:
105
+ - uses: actions/checkout@v4
106
+
107
+ - name: Set up Python
108
+ uses: actions/setup-python@v4
109
+ with:
110
+ python-version: "3.11"
111
+
112
+ - name: Cache pip dependencies
113
+ uses: actions/cache@v3
114
+ with:
115
+ path: ~/.cache/pip
116
+ key: ${{ runner.os }}-pip-tui-pty-${{ hashFiles('pyproject.toml') }}
117
+ restore-keys: |
118
+ ${{ runner.os }}-pip-tui-pty-
119
+
120
+ - name: Install PTY test dependencies
121
+ run: |
122
+ python3 -m pip install --upgrade pip
123
+ pip install -e . --no-deps
124
+ pip install -r requirements-ci-base.txt
125
+ pip install torch==2.2.2 --index-url https://download.pytorch.org/whl/cpu
126
+
127
+ - name: Run TUI PTY smoke suite
128
+ run: |
129
+ python3 -m pytest tests/e2e/test_tui_pty.py -m tui_pty -v
130
+
131
+ lint:
132
+ if: github.event_name != 'schedule'
133
+ runs-on: ubuntu-latest
134
+ strategy:
135
+ matrix:
136
+ python-version: ["3.10"]
137
+
138
+ steps:
139
+ - uses: actions/checkout@v4
140
+
141
+ - name: Set up Python ${{ matrix.python-version }}
142
+ uses: actions/setup-python@v4
143
+ with:
144
+ python-version: ${{ matrix.python-version }}
145
+
146
+ - name: Install dependencies
147
+ run: |
148
+ python3 -m pip install --upgrade pip
149
+ # Lint does not require heavy runtime ML dependencies.
150
+ pip install -e . --no-deps
151
+ pip install -r requirements-ci-lint.txt
152
+
153
+ - name: Run isort
154
+ run: |
155
+ python3 -m isort --check --diff gpumemprof/ tfmemprof/ tests/ examples/
156
+
157
+ - name: Run black
158
+ run: |
159
+ python3 -m black --check gpumemprof/ tfmemprof/ tests/ examples/
160
+
161
+ - name: Run flake8
162
+ run: |
163
+ python3 -m flake8 gpumemprof/ tfmemprof/ tests/ examples/ --show-source --statistics
164
+
165
+ - name: Run mypy
166
+ run: |
167
+ python3 -m mypy gpumemprof/ tfmemprof/
168
+
169
+ docs:
170
+ if: github.event_name != 'schedule'
171
+ runs-on: ubuntu-latest
172
+
173
+ steps:
174
+ - uses: actions/checkout@v4
175
+
176
+ - name: Set up Python
177
+ uses: actions/setup-python@v4
178
+ with:
179
+ python-version: "3.11"
180
+
181
+ - name: Cache pip dependencies
182
+ uses: actions/cache@v3
183
+ with:
184
+ path: ~/.cache/pip
185
+ key: ${{ runner.os }}-pip-docs-${{ hashFiles('pyproject.toml', 'docs/requirements-rtd.txt') }}
186
+ restore-keys: |
187
+ ${{ runner.os }}-pip-docs-
188
+
189
+ - name: Install docs dependencies
190
+ run: |
191
+ python3 -m pip install --upgrade pip
192
+ pip install -e . --no-deps
193
+ pip install -r docs/requirements-rtd.txt
194
+
195
+ - name: Build docs with warnings as errors
196
+ run: |
197
+ python3 -m sphinx -W --keep-going -b html docs docs/_build/html
198
+
199
+ build:
200
+ if: github.event_name != 'schedule'
201
+ runs-on: ubuntu-latest
202
+ needs: [test, lint, docs]
203
+
204
+ steps:
205
+ - uses: actions/checkout@v4
206
+
207
+ - name: Set up Python
208
+ uses: actions/setup-python@v4
209
+ with:
210
+ python-version: "3.10"
211
+
212
+ - name: Install build dependencies
213
+ run: |
214
+ python3 -m pip install --upgrade pip
215
+ pip install build twine
216
+
217
+ - name: Build package
218
+ run: |
219
+ python3 -m build
220
+
221
+ - name: Check package
222
+ run: |
223
+ twine check dist/*
224
+
225
+ - name: Upload build artifacts
226
+ uses: actions/upload-artifact@v4
227
+ with:
228
+ name: dist
229
+ path: dist/
230
+
231
+ cli-test:
232
+ if: github.event_name != 'schedule'
233
+ runs-on: ubuntu-latest
234
+ needs: [test, lint, docs]
235
+
236
+ steps:
237
+ - uses: actions/checkout@v4
238
+
239
+ - name: Set up Python
240
+ uses: actions/setup-python@v4
241
+ with:
242
+ python-version: "3.10"
243
+
244
+ - name: Install package
245
+ run: |
246
+ python3 -m pip install --upgrade pip
247
+ pip install -e .
248
+
249
+ - name: Test CLI tools
250
+ run: |
251
+ gpumemprof --help
252
+ tfmemprof --help
253
+ gpumemprof info
254
+ tfmemprof info
255
+
256
+ - name: Run documented CLI examples smoke test
257
+ env:
258
+ CUDA_VISIBLE_DEVICES: ""
259
+ run: |
260
+ python3 -m examples.cli.quickstart
261
+
262
+ - name: Run capability matrix smoke test
263
+ env:
264
+ CUDA_VISIBLE_DEVICES: ""
265
+ run: |
266
+ python3 -m examples.cli.capability_matrix --mode smoke --target auto --oom-mode simulated --skip-tui
@@ -0,0 +1,58 @@
1
+ name: Release
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ with:
13
+ fetch-depth: 0
14
+ fetch-tags: true
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: "3.10"
20
+
21
+ - name: Resolve package version from release tag
22
+ run: |
23
+ VERSION="${GITHUB_REF_NAME#v}"
24
+ if [ -z "$VERSION" ] || echo "$VERSION" | grep -Eq '(\.dev|[+])'; then
25
+ echo "Invalid release tag for PyPI: ${GITHUB_REF_NAME}" >&2
26
+ exit 1
27
+ fi
28
+ echo "SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}" >> "$GITHUB_ENV"
29
+ echo "Release tag: ${GITHUB_REF_NAME}"
30
+ echo "Package version: ${VERSION}"
31
+
32
+ - name: Install build dependencies
33
+ run: |
34
+ python3 -m pip install --upgrade pip
35
+ pip install build twine
36
+
37
+ - name: Build package
38
+ run: |
39
+ python3 -m build
40
+
41
+ - name: Check package
42
+ run: |
43
+ twine check dist/*
44
+
45
+ - name: Enforce release artifact version policy
46
+ run: |
47
+ if ls dist/* | grep -Eq '(\.dev|[+])'; then
48
+ echo "Refusing upload: dist contains dev/local versions." >&2
49
+ ls -1 dist
50
+ exit 1
51
+ fi
52
+
53
+ - name: Publish to PyPI
54
+ env:
55
+ TWINE_USERNAME: __token__
56
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
57
+ run: |
58
+ twine upload --skip-existing dist/*
@@ -0,0 +1,112 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ # Usually these files are written by a python script from a template
29
+ # before PyInstaller builds the exe, so as to inject date/other info into it.
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ debug.log
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ snapshot_report*.html
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ artifacts/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+
61
+ # Flask stuff:
62
+ instance/
63
+ .webassets-cache
64
+
65
+ # Scrapy stuff:
66
+ .scrapy
67
+
68
+ # Sphinx documentation
69
+ /docs/_build/
70
+
71
+ # PyBuilder
72
+ .target/
73
+
74
+ # Jupyter Notebook
75
+ .ipynb_checkpoints
76
+
77
+ # IPython
78
+ profile_default/
79
+ ipython_config.py
80
+
81
+ # pyenv
82
+ .python-version
83
+
84
+ # pipenv
85
+ Pipfile.lock
86
+
87
+ # poetry
88
+ poetry.lock
89
+
90
+ # mypy
91
+ .mypy_cache/
92
+ .dmypy.json
93
+
94
+ # Pyre type checker
95
+ .pyre/
96
+
97
+ # VS Code
98
+ .vscode/
99
+
100
+ # macOS
101
+ .DS_Store
102
+
103
+ # Virtual environments
104
+ venv/
105
+ ENV/
106
+ env.bak/
107
+ venv.bak/
108
+
109
+ # Version files
110
+ gpumemprof/_version.py
111
+ tfmemprof/_version.py
112
+ .venv/
@@ -0,0 +1,25 @@
1
+ repos:
2
+ - repo: https://github.com/pycqa/isort
3
+ rev: 5.13.2
4
+ hooks:
5
+ - id: isort
6
+ args: ["--profile", "black"]
7
+ - repo: https://github.com/psf/black
8
+ rev: 24.3.0
9
+ hooks:
10
+ - id: black
11
+ - repo: https://github.com/PyCQA/flake8
12
+ rev: 7.0.0
13
+ hooks:
14
+ - id: flake8
15
+ - repo: https://github.com/pre-commit/pre-commit-hooks
16
+ rev: v4.5.0
17
+ hooks:
18
+ - id: trailing-whitespace
19
+ - id: end-of-file-fixer
20
+ - id: check-yaml
21
+ - id: check-added-large-files
22
+ - repo: https://github.com/pre-commit/mirrors-mypy
23
+ rev: v1.8.0
24
+ hooks:
25
+ - id: mypy
@@ -0,0 +1,14 @@
1
+ version: 2
2
+
3
+ build:
4
+ os: ubuntu-22.04
5
+ tools:
6
+ python: "3.11"
7
+
8
+ sphinx:
9
+ configuration: docs/conf.py
10
+ fail_on_warning: true
11
+
12
+ python:
13
+ install:
14
+ - requirements: docs/requirements-rtd.txt
@@ -0,0 +1,190 @@
1
+ # Changelog
2
+
3
+ All notable changes to GPU Memory Profiler will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.2.0] - Unreleased
9
+
10
+ ### Added
11
+
12
+ - Launch-QA scenario modules under `examples/scenarios/` for CPU telemetry, MPS telemetry, OOM flight recorder coverage, and TensorFlow end-to-end telemetry/diagnose checks.
13
+ - Capability matrix orchestrator (`python -m examples.cli.capability_matrix`) with smoke/full modes, target selection (`auto|cpu|mps|both`), OOM mode controls, and machine-readable reports.
14
+ - Scenario smoke tests (`tests/test_examples_scenarios.py`) and updated TUI pilot coverage for launch quick actions.
15
+ - Updated TUI snapshot baselines for intentional CLI & Actions tab changes.
16
+
17
+ ### Changed
18
+
19
+ - Drop support for Python 3.8 and 3.9; minimum supported runtime is now Python 3.10.
20
+ - Migration note: users on Python 3.8/3.9 should upgrade to Python 3.10+ or pin `gpu-memory-profiler<0.2.0`.
21
+ - Breaking change: the Textual TUI launcher command is now `stormlog` (old: `gpu-profiler`).
22
+ - Migration note: use `stormlog` instead of `gpu-profiler` when launching the TUI.
23
+ - Refresh docs/API examples to match current CLI and profiler behavior.
24
+ - Publish a versioned compatibility matrix for v0.2 and link it from top-level docs.
25
+ - Stabilize benchmark harness defaults (`--iterations 200`) and align benchmark/testing documentation to this baseline.
26
+ - Expand TUI CLI/Playbook guidance and quick actions to highlight diagnose, OOM scenario, and capability matrix workflows.
27
+ - Refresh release-facing docs (`README`, examples guides, `RELEASE_CHECKLIST`, `PROJECT_STATUS`) for v0.2 launch readiness.
28
+
29
+ ### Deprecated
30
+
31
+ - [Future deprecations will be listed here]
32
+
33
+ ### Removed
34
+
35
+ - [Future removals will be listed here]
36
+
37
+ ### Fixed
38
+
39
+ - Remove stale docs references to unsupported CLI options and non-existent profiler APIs.
40
+ - Fix `examples.basic.tensorflow_demo` constructor/API mismatch so the demo runs against the current TensorFlow profiler implementation.
41
+
42
+ ### Security
43
+
44
+ - [Future security fixes will be listed here]
45
+
46
+ ## [0.1.0] - 2024-12-19
47
+
48
+ ### Added
49
+
50
+ - **Core PyTorch Profiler (`gpumemprof`)**
51
+
52
+ - Real-time GPU memory monitoring with configurable sampling intervals
53
+ - Memory leak detection using statistical analysis and pattern recognition
54
+ - Interactive visualizations with matplotlib and plotly support
55
+ - Context-aware profiling with function decorators and context managers
56
+ - Command-line interface for standalone usage
57
+ - Advanced analytics including pattern detection and fragmentation analysis
58
+ - Alert system with configurable thresholds
59
+ - Data export capabilities (CSV, JSON)
60
+ - Automatic memory management with watchdog system
61
+
62
+ - **Core TensorFlow Profiler (`tfmemprof`)**
63
+
64
+ - Real-time TensorFlow GPU memory monitoring
65
+ - TensorFlow-specific memory leak detection
66
+ - Integration with TensorFlow's memory management system
67
+ - Support for TensorFlow sessions and graph execution
68
+ - Keras model profiling capabilities
69
+ - Mixed precision profiling support
70
+ - Multi-GPU strategy profiling
71
+ - Command-line interface for TensorFlow workflows
72
+
73
+ - **Visualization & Analysis**
74
+
75
+ - Memory timeline plots with interactive features
76
+ - Function comparison charts
77
+ - Memory usage heatmaps
78
+ - Interactive dashboards with Plotly
79
+ - Memory fragmentation analysis
80
+ - Performance correlation analysis
81
+ - Optimization scoring and recommendations
82
+
83
+ - **Command Line Tools**
84
+
85
+ - `gpumemprof` CLI for PyTorch profiling
86
+ - `tfmemprof` CLI for TensorFlow profiling
87
+ - System information display
88
+ - Real-time monitoring capabilities
89
+ - Background tracking with alerts
90
+ - Results analysis and visualization
91
+
92
+ - **CPU Compatibility**
93
+
94
+ - CPU memory profiling for systems without GPU
95
+ - Cross-platform compatibility
96
+ - CPU-based model training profiling
97
+ - System RAM monitoring capabilities
98
+
99
+ - **Testing & Documentation**
100
+ - Comprehensive test suite for both GPU and CPU environments
101
+ - PyTorch testing guide with examples
102
+ - TensorFlow testing guide with examples
103
+ - CPU compatibility guide
104
+ - Complete API documentation
105
+ - Usage examples and tutorials
106
+ - Troubleshooting guides
107
+
108
+ ### Technical Features
109
+
110
+ - Modular architecture with 7 core components
111
+ - Thread-safe profiling with background monitoring
112
+ - Configurable sampling intervals and alert thresholds
113
+ - Support for multiple GPU devices
114
+ - Memory snapshot capture and analysis
115
+ - Tensor lifecycle tracking (PyTorch)
116
+ - Graph execution monitoring (TensorFlow)
117
+ - Export capabilities for further analysis
118
+
119
+ ### Documentation
120
+
121
+ - Comprehensive documentation in `/docs/` directory
122
+ - Quick start guides for both PyTorch and TensorFlow
123
+ - API reference with examples
124
+ - CLI usage guide
125
+ - CPU compatibility guide
126
+ - Testing guides for both frameworks
127
+ - In-depth technical article
128
+ - Contributing guidelines and code of conduct
129
+
130
+ ### Infrastructure
131
+
132
+ - Open source project structure
133
+ - MIT License
134
+ - Contributing guidelines (CONTRIBUTING.md)
135
+ - Code of Conduct (CODE_OF_CONDUCT.md)
136
+ - Security policy (SECURITY.md)
137
+ - Changelog tracking
138
+ - Development setup instructions
139
+
140
+ ---
141
+
142
+ ## Version History
143
+
144
+ - **0.1.0** (2024-12-19): Initial release with full PyTorch and TensorFlow support
145
+
146
+ ## Release Notes
147
+
148
+ ### Version 0.1.0
149
+
150
+ This is the initial release of GPU Memory Profiler, providing comprehensive memory profiling capabilities for both PyTorch and TensorFlow deep learning frameworks. The release includes:
151
+
152
+ - Complete PyTorch profiler with real-time monitoring, leak detection, and visualization
153
+ - Complete TensorFlow profiler with TensorFlow-specific optimizations
154
+ - Command-line interfaces for both frameworks
155
+ - CPU compatibility for systems without GPU support
156
+ - Comprehensive documentation and testing guides
157
+ - Open source project structure ready for community contributions
158
+
159
+ ### Breaking Changes
160
+
161
+ None - this is the initial release.
162
+
163
+ ### Known Issues
164
+
165
+ - Some visualization features may require additional dependencies (PyQt5, tkinter)
166
+ - TensorFlow CLI may have dependency conflicts with certain typing-extensions versions
167
+ - CPU profiling is limited compared to GPU profiling capabilities
168
+
169
+ ### Migration Guide
170
+
171
+ N/A - this is the initial release.
172
+
173
+ ---
174
+
175
+ ## Contributing to the Changelog
176
+
177
+ When contributing to the project, please update this changelog by adding entries under the appropriate version section. Follow the format:
178
+
179
+ - **Added** for new features
180
+ - **Changed** for changes in existing functionality
181
+ - **Deprecated** for soon-to-be removed features
182
+ - **Removed** for now removed features
183
+ - **Fixed** for any bug fixes
184
+ - **Security** for security vulnerability fixes
185
+
186
+ Write each entry in the present tense ("Add" not "Added") and the imperative mood ("Move cursor to..." not "Moves cursor to...").
187
+
188
+ ---
189
+
190
+ **For more information about this project, see the [README](README.md) and [Documentation](docs/index.md).**