metaflow-observability 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. metaflow_observability-0.1.0/.github/workflows/ci.yml +36 -0
  2. metaflow_observability-0.1.0/.github/workflows/publish.yml +46 -0
  3. metaflow_observability-0.1.0/.gitignore +41 -0
  4. metaflow_observability-0.1.0/.pre-commit-config.yaml +17 -0
  5. metaflow_observability-0.1.0/LICENSE +175 -0
  6. metaflow_observability-0.1.0/PKG-INFO +162 -0
  7. metaflow_observability-0.1.0/README.md +140 -0
  8. metaflow_observability-0.1.0/pyproject.toml +81 -0
  9. metaflow_observability-0.1.0/src/metaflow_extensions/observability/__init__.py +3 -0
  10. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/__init__.py +0 -0
  11. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/decorator.py +86 -0
  12. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/export/__init__.py +0 -0
  13. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/export/otel.py +71 -0
  14. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/export/protocol.py +19 -0
  15. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/observer.py +102 -0
  16. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/samplers/__init__.py +0 -0
  17. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/samplers/base.py +48 -0
  18. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/samplers/cpu.py +74 -0
  19. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/samplers/disk.py +111 -0
  20. metaflow_observability-0.1.0/src/metaflow_extensions/observability/plugins/samplers/gpu.py +129 -0
  21. metaflow_observability-0.1.0/tests/__init__.py +0 -0
  22. metaflow_observability-0.1.0/tests/test_cpu_sampler.py +105 -0
  23. metaflow_observability-0.1.0/tests/test_disk_sampler.py +128 -0
  24. metaflow_observability-0.1.0/tests/test_gpu_sampler.py +125 -0
  25. metaflow_observability-0.1.0/tests/test_observer.py +216 -0
  26. metaflow_observability-0.1.0/tests/test_otel_exporter.py +84 -0
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, master]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python ${{ matrix.python-version }}
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+
23
+ - name: Install dependencies
24
+ run: pip install -e ".[dev]"
25
+
26
+ - name: Lint
27
+ run: ruff check src tests
28
+
29
+ - name: Format check
30
+ run: ruff format --check src tests
31
+
32
+ - name: Type check
33
+ run: mypy
34
+
35
+ - name: Test
36
+ run: pytest
@@ -0,0 +1,46 @@
1
+ name: Publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Install build tools
20
+ run: pip install hatch
21
+
22
+ - name: Build
23
+ run: hatch build
24
+
25
+ - name: Upload dist
26
+ uses: actions/upload-artifact@v4
27
+ with:
28
+ name: dist
29
+ path: dist/
30
+
31
+ publish:
32
+ needs: build
33
+ runs-on: ubuntu-latest
34
+ environment: pypi
35
+ permissions:
36
+ id-token: write # required for OIDC trusted publisher
37
+
38
+ steps:
39
+ - name: Download dist
40
+ uses: actions/download-artifact@v4
41
+ with:
42
+ name: dist
43
+ path: dist/
44
+
45
+ - name: Publish to PyPI
46
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,41 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .eggs/
11
+ .venv/
12
+ venv/
13
+ env/
14
+ .env
15
+
16
+ # Testing / coverage
17
+ .pytest_cache/
18
+ .coverage
19
+ .coverage.*
20
+ htmlcov/
21
+ coverage.xml
22
+
23
+ # Mypy
24
+ .mypy_cache/
25
+
26
+ # Ruff
27
+ .ruff_cache/
28
+
29
+ # Distribution
30
+ *.whl
31
+ *.tar.gz
32
+
33
+ # IDE
34
+ .idea/
35
+ .vscode/
36
+ *.swp
37
+ *.swo
38
+
39
+ # OS
40
+ .DS_Store
41
+ Thumbs.db
@@ -0,0 +1,17 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.4.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v4.6.0
11
+ hooks:
12
+ - id: trailing-whitespace
13
+ - id: end-of-file-fixer
14
+ - id: check-yaml
15
+ - id: check-toml
16
+ - id: check-merge-conflict
17
+ - id: debug-statements
@@ -0,0 +1,175 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship made available under
36
+ the License, as indicated by a copyright notice that is included in
37
+ or attached to the work (an example is provided in the Appendix below).
38
+
39
+ "Derivative Works" shall mean any work, whether in Source or Object
40
+ form, that is based on (or derived from) the Work and for which the
41
+ editorial revisions, annotations, elaborations, or other
42
+ transformations represent, as a whole, an original work of authorship.
43
+ For the purposes of this License, Derivative Works shall not include
44
+ works that remain separable from, or merely link (or bind by name)
45
+ to the interfaces of, the Work and the Derivative Works thereof.
46
+
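47
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion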
48
+ in the Work by the copyright owner or by an individual or Legal Entity
49
+ authorized to submit on behalf of the copyright owner. For the purposes
50
+ of this definition, "submitted" means any form of electronic, verbal,
51
+ or written communication sent to the Licensor or its representatives,
52
+ including but not limited to communication on electronic mailing lists,
53
+ source code control systems, and issue tracking systems that are managed
54
+ by, or on behalf of, the Licensor for the purpose of tracking modifications
55
+ to the Work.
56
+
57
+ "Contributor" shall mean Licensor and any Legal Entity on behalf of
58
+ whom a Contribution has been received by the Licensor and included
59
+ within the Work.
60
+
61
+ 2. Grant of Copyright License. Subject to the terms and conditions of
62
+ this License, each Contributor hereby grants to You a perpetual,
63
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
64
+ copyright license to reproduce, prepare Derivative Works of,
65
+ publicly display, publicly perform, sublicense, and distribute the
66
+ Work and such Derivative Works in Source or Object form.
67
+
68
+ 3. Grant of Patent License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ (except as stated in this section) patent license to make, have made,
72
+ use, offer to sell, sell, import, and otherwise transfer the Work,
73
+ where such license applies only to those patent claims licensable
74
+ by such Contributor that are necessarily infringed by their
75
+ Contribution(s) alone or by the combination of their Contribution(s)
76
+ with the Work to which such Contribution(s) was submitted. If You
77
+ institute patent litigation against any entity (including a cross-claim
78
+ or counterclaim in a lawsuit) alleging that the Work or any such
79
+ Contribution constitutes direct or indirect patent infringement, then
80
+ any and all patent licenses granted to You under this License for that
81
+ Work shall terminate as of the date such litigation is filed.
82
+
83
+ 4. Redistribution. You may reproduce and distribute copies of the
84
+ Work or Derivative Works thereof in any medium, with or without
85
+ modifications, and in Source or Object form, provided that You
86
+ meet the following conditions:
87
+
88
+ (a) You must give any other recipients of the Work or Derivative
89
+ Works a copy of this License; and
90
+
91
+ (b) You must cause any modified files to carry prominent notices
92
+ stating that You changed the files; and
93
+
94
+ (c) You must retain, in the Source form of any Derivative Works
95
+ that You distribute, all copyright, patent, trademark, and
96
+ attribution notices from the Source form of the Work,
97
+ excluding those notices that do not pertain to any part of
98
+ the Derivative Works; and
99
+
100
+ (d) If the Work includes a "NOTICE" text file as part of its
101
+ distribution, You must include a readable copy of the
102
+ attribution notices contained within such NOTICE file, in
103
+ at least one of the following places: within a NOTICE text
104
+ file distributed as part of the Derivative Works; within
105
+ the Source form or documentation, if provided along with
106
+ the Derivative Works; or, within a display generated by
107
+ the Derivative Works, if and wherever such third-party notices
108
+ normally appear. The contents of the NOTICE file are for
109
+ informational purposes only and do not modify the License.
110
+ You may add Your own attribution notices within Derivative Works
111
+ that You distribute, alongside or in addition to the NOTICE text
112
+ from the Work, provided that such additional attribution notices
113
+ cannot be construed as modifying the License.
114
+
115
+ You may add Your own license statement for Your modifications and
116
+ may provide additional grant of rights to use, copy, modify, merge,
117
+ publish, distribute, sublicense, and/or sell copies of the Work.
118
+
119
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
120
+ any Contribution intentionally submitted for inclusion in the Work
121
+ by You to the Licensor shall be under the terms and conditions of
122
+ this License, without any additional terms or conditions.
123
+ Notwithstanding the above, nothing herein shall supersede or modify
124
+ the terms of any separate license agreement you may have executed
125
+ with Licensor regarding such Contributions.
126
+
127
+ 6. Trademarks. This License does not grant permission to use the trade
128
+ names, trademarks, service marks, or product names of the Licensor,
129
+ except as required for reasonable and customary use in describing the
130
+ origin of the Work and reproducing the content of the NOTICE file.
131
+
132
+ 7. Disclaimer of Warranty. Unless required by applicable law or
133
+ agreed to in writing, Licensor provides the Work (and each
134
+ Contributor provides its Contributions) on an "AS IS" BASIS,
135
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
136
+ implied, including, without limitation, any warranties or conditions of TITLE,
137
+ NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely
138
+ responsible for determining the appropriateness of using or
139
+ redistributing the Work and assume any risks associated with Your
140
+ exercise of permissions under this License.
141
+
142
+ 8. Limitation of Liability. In no event and under no legal theory,
143
+ whether in tort (including negligence), contract, or otherwise,
144
+ unless required by applicable law (such as deliberate and grossly
145
+ negligent acts) or agreed to in writing, shall any Contributor be
146
+ liable to You for damages, including any direct, indirect, special,
147
+ incidental, or exemplary damages of whatever character arising as a
148
+ result of this License or out of the use or inability to use the
149
+ Work (even if such Contributor has been advised of the possibility
150
+ of such damages).
151
+
152
+ 9. Accepting Warranty or Additional Liability. While redistributing
153
+ the Work or Derivative Works thereof, You may wish to offer, and
154
+ charge a fee for, acceptance of support, warranty, indemnity, or
155
+ other liability obligations and/or rights consistent with this
156
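+ License. However, in accepting such obligations, You may act only
157
+ on Your own behalf and on Your sole responsibility, not on behalf of any
158
+ other Contributor, and only if You agree to indemnify, defend, and hold each
159
+ Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.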
160
+
161
+ END OF TERMS AND CONDITIONS
162
+
163
+ Copyright 2026 Nissan Pow
164
+
165
+ Licensed under the Apache License, Version 2.0 (the "License");
166
+ you may not use this file except in compliance with the License.
167
+ You may obtain a copy of the License at
168
+
169
+ http://www.apache.org/licenses/LICENSE-2.0
170
+
171
+ Unless required by applicable law or agreed to in writing, software
172
+ distributed under the License is distributed on an "AS IS" BASIS,
173
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
174
+ See the License for the specific language governing permissions and
175
+ limitations under the License.
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: metaflow-observability
3
+ Version: 0.1.0
4
+ Summary: Automatic observability for Metaflow — step duration, CPU, memory, disk, and GPU metrics via OpenTelemetry
5
+ License: Apache-2.0
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: metaflow>=2.9
9
+ Requires-Dist: opentelemetry-exporter-prometheus>=0.45b0
10
+ Requires-Dist: opentelemetry-sdk>=1.24
11
+ Requires-Dist: psutil>=5.9
12
+ Provides-Extra: dev
13
+ Requires-Dist: mypy>=1.10; extra == 'dev'
14
+ Requires-Dist: opentelemetry-sdk>=1.24; extra == 'dev'
15
+ Requires-Dist: pytest-cov; extra == 'dev'
16
+ Requires-Dist: pytest>=7; extra == 'dev'
17
+ Requires-Dist: ruff>=0.4; extra == 'dev'
18
+ Requires-Dist: types-psutil; extra == 'dev'
19
+ Provides-Extra: gpu
20
+ Requires-Dist: pynvml>=11.0.0; extra == 'gpu'
21
+ Description-Content-Type: text/markdown
22
+
23
+ # metaflow-observability
24
+
25
+ [![CI](https://github.com/npow/metaflow-observability/actions/workflows/ci.yml/badge.svg)](https://github.com/npow/metaflow-observability/actions/workflows/ci.yml)
26
+ [![PyPI](https://img.shields.io/pypi/v/metaflow-observability)](https://pypi.org/project/metaflow-observability/)
27
+ [![License: Apache-2.0](https://img.shields.io/badge/License-Apache--2.0-blue.svg)](LICENSE)
28
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
29
+ [![Docs](https://img.shields.io/badge/docs-mintlify-18a34a?style=flat-square)](https://mintlify.com/npow/metaflow-observability)
30
+
31
+ Get production metrics for every Metaflow step — without changing your flow code.
32
+
33
+ ## The problem
34
+
35
+ When a Metaflow pipeline slows down or crashes in production, you have no time-series data to tell you whether it was CPU saturation, a memory spike, a disk bottleneck, or a GPU stall. You're left digging through logs after the fact. Metaflow's built-in tooling gives you per-run artifacts and cards, but nothing you can alert on or trend over time.
36
+
37
+ ## Quick start
38
+
39
+ ```bash
40
+ pip install metaflow-observability
41
+ ```
42
+
43
+ ```python
44
+ from metaflow import FlowSpec, step
45
+ from metaflow import observability
46
+
47
+ class MyFlow(FlowSpec):
48
+
49
+ @observability
50
+ @step
51
+ def train(self):
52
+ ... # your code — metrics collected automatically
53
+ self.next(self.end)
54
+
55
+ @step
56
+ def end(self):
57
+ pass
58
+
59
+ if __name__ == "__main__":
60
+ MyFlow()
61
+ ```
62
+
63
+ Metrics are exported via OpenTelemetry. Point them at Prometheus + Grafana with:
64
+
65
+ ```bash
66
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
67
+ python flow.py run
68
+ ```
69
+
70
+ ## Install
71
+
72
+ ```bash
73
+ # Core (CPU, memory, disk, duration)
74
+ pip install metaflow-observability
75
+
76
+ # With GPU support (NVIDIA only, requires CUDA drivers)
77
+ pip install "metaflow-observability[gpu]"
78
+ ```
79
+
80
+ ## Usage
81
+
82
+ **Zero-config with Prometheus**
83
+
84
+ Add `@observability` to any step. By default, metrics are scraped via a Prometheus endpoint on port 8000.
85
+
86
+ ```python
87
+ @observability
88
+ @step
89
+ def preprocess(self):
90
+ ...
91
+ ```
92
+
93
+ **Custom OTel backend**
94
+
95
+ Use any OpenTelemetry-compatible backend (Grafana Cloud, Datadog, Honeycomb, etc.) via standard OTel environment variables:
96
+
97
+ ```bash
98
+ export OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp.example.com
99
+ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer <token>"
100
+ ```
101
+
102
+ **GPU metrics**
103
+
104
+ Install the GPU extra and run on a CUDA-enabled machine — GPU utilization and memory are collected automatically per device, tagged with `gpu_index`.
105
+
106
+ ```bash
107
+ pip install "metaflow-observability[gpu]"
108
+ ```
109
+
110
+ ## How it works
111
+
112
+ `@observability` wraps `task_pre_step` / `task_post_step` / `task_exception` hooks in Metaflow's decorator API. Before your step code runs, it starts background threads that sample CPU%, RSS memory, disk I/O throughput, and (optionally) GPU utilization at 1-second intervals. When the step finishes, samples are aggregated and exported as OpenTelemetry instruments:
113
+
114
+ | Metric | Instrument | Tags |
115
+ |---|---|---|
116
+ | `step.duration` | Histogram (seconds) | `step`, `flow`, `run_id`, `retry` |
117
+ | `step.cpu.pct` | Gauge (avg / max / p95) | same |
118
+ | `step.memory.mb` | Gauge (avg / max RSS) | same |
119
+ | `step.disk.read_bytes` | Counter | same |
120
+ | `step.disk.write_bytes` | Counter | same |
121
+ | `step.disk.read_throughput` | Gauge (MB/s) | same |
122
+ | `step.disk.write_throughput` | Gauge (MB/s) | same |
123
+ | `step.gpu.utilization` | Gauge | + `gpu_index` |
124
+ | `step.gpu.memory.used_mb` | Gauge | + `gpu_index` |
125
+ | `step.retries` | Gauge | same |
126
+ | `step.failures` | Counter | same |
127
+
128
+ ## Configuration
129
+
130
+ All configuration is via standard OpenTelemetry environment variables. No extension-specific config needed.
131
+
132
+ | Variable | Purpose |
133
+ |---|---|
134
+ | `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP endpoint for traces and metrics |
135
+ | `OTEL_EXPORTER_OTLP_HEADERS` | Auth headers (e.g., `Authorization=Bearer ...`) |
136
+ | `OTEL_SERVICE_NAME` | Service name tag on all metrics |
137
+
138
+ If neither variable is set, metrics are printed to stdout via the OTel console exporter (useful for local debugging).
139
+
140
+ ## Development
141
+
142
+ ```bash
143
+ git clone https://github.com/npow/metaflow-observability
144
+ cd metaflow-observability
145
+ pip install -e ".[dev]"
146
+
147
+ # Run tests
148
+ pytest
149
+
150
+ # Lint + format
151
+ ruff check src tests
152
+ ruff format src tests
153
+
154
+ # Type check
155
+ mypy
156
+ ```
157
+
158
+ CI runs the full suite across Python 3.9, 3.10, 3.11, and 3.12 on every push to `main`/`master` and on every pull request.
159
+
160
+ ## License
161
+
162
+ [Apache-2.0](LICENSE)
@@ -0,0 +1,140 @@
1
+ # metaflow-observability
2
+
3
+ [![CI](https://github.com/npow/metaflow-observability/actions/workflows/ci.yml/badge.svg)](https://github.com/npow/metaflow-observability/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/metaflow-observability)](https://pypi.org/project/metaflow-observability/)
5
+ [![License: Apache-2.0](https://img.shields.io/badge/License-Apache--2.0-blue.svg)](LICENSE)
6
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
7
+ [![Docs](https://img.shields.io/badge/docs-mintlify-18a34a?style=flat-square)](https://mintlify.com/npow/metaflow-observability)
8
+
9
+ Get production metrics for every Metaflow step — without changing your flow code.
10
+
11
+ ## The problem
12
+
13
+ When a Metaflow pipeline slows down or crashes in production, you have no time-series data to tell you whether it was CPU saturation, a memory spike, a disk bottleneck, or a GPU stall. You're left digging through logs after the fact. Metaflow's built-in tooling gives you per-run artifacts and cards, but nothing you can alert on or trend over time.
14
+
15
+ ## Quick start
16
+
17
+ ```bash
18
+ pip install metaflow-observability
19
+ ```
20
+
21
+ ```python
22
+ from metaflow import FlowSpec, step
23
+ from metaflow import observability
24
+
25
+ class MyFlow(FlowSpec):
26
+
27
+ @observability
28
+ @step
29
+ def train(self):
30
+ ... # your code — metrics collected automatically
31
+ self.next(self.end)
32
+
33
+ @step
34
+ def end(self):
35
+ pass
36
+
37
+ if __name__ == "__main__":
38
+ MyFlow()
39
+ ```
40
+
41
+ Metrics are exported via OpenTelemetry. Point them at Prometheus + Grafana with:
42
+
43
+ ```bash
44
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
45
+ python flow.py run
46
+ ```
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ # Core (CPU, memory, disk, duration)
52
+ pip install metaflow-observability
53
+
54
+ # With GPU support (NVIDIA only, requires CUDA drivers)
55
+ pip install "metaflow-observability[gpu]"
56
+ ```
57
+
58
+ ## Usage
59
+
60
+ **Zero-config with Prometheus**
61
+
62
+ Add `@observability` to any step. By default, metrics are scraped via a Prometheus endpoint on port 8000.
63
+
64
+ ```python
65
+ @observability
66
+ @step
67
+ def preprocess(self):
68
+ ...
69
+ ```
70
+
71
+ **Custom OTel backend**
72
+
73
+ Use any OpenTelemetry-compatible backend (Grafana Cloud, Datadog, Honeycomb, etc.) via standard OTel environment variables:
74
+
75
+ ```bash
76
+ export OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp.example.com
77
+ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer <token>"
78
+ ```
79
+
80
+ **GPU metrics**
81
+
82
+ Install the GPU extra and run on a CUDA-enabled machine — GPU utilization and memory are collected automatically per device, tagged with `gpu_index`.
83
+
84
+ ```bash
85
+ pip install "metaflow-observability[gpu]"
86
+ ```
87
+
88
+ ## How it works
89
+
90
+ `@observability` wraps `task_pre_step` / `task_post_step` / `task_exception` hooks in Metaflow's decorator API. Before your step code runs, it starts background threads that sample CPU%, RSS memory, disk I/O throughput, and (optionally) GPU utilization at 1-second intervals. When the step finishes, samples are aggregated and exported as OpenTelemetry instruments:
91
+
92
+ | Metric | Instrument | Tags |
93
+ |---|---|---|
94
+ | `step.duration` | Histogram (seconds) | `step`, `flow`, `run_id`, `retry` |
95
+ | `step.cpu.pct` | Gauge (avg / max / p95) | same |
96
+ | `step.memory.mb` | Gauge (avg / max RSS) | same |
97
+ | `step.disk.read_bytes` | Counter | same |
98
+ | `step.disk.write_bytes` | Counter | same |
99
+ | `step.disk.read_throughput` | Gauge (MB/s) | same |
100
+ | `step.disk.write_throughput` | Gauge (MB/s) | same |
101
+ | `step.gpu.utilization` | Gauge | + `gpu_index` |
102
+ | `step.gpu.memory.used_mb` | Gauge | + `gpu_index` |
103
+ | `step.retries` | Gauge | same |
104
+ | `step.failures` | Counter | same |
105
+
106
+ ## Configuration
107
+
108
+ All configuration is via standard OpenTelemetry environment variables. No extension-specific config needed.
109
+
110
+ | Variable | Purpose |
111
+ |---|---|
112
+ | `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP endpoint for traces and metrics |
113
+ | `OTEL_EXPORTER_OTLP_HEADERS` | Auth headers (e.g., `Authorization=Bearer ...`) |
114
+ | `OTEL_SERVICE_NAME` | Service name tag on all metrics |
115
+
116
+ If neither variable is set, metrics are printed to stdout via the OTel console exporter (useful for local debugging).
117
+
118
+ ## Development
119
+
120
+ ```bash
121
+ git clone https://github.com/npow/metaflow-observability
122
+ cd metaflow-observability
123
+ pip install -e ".[dev]"
124
+
125
+ # Run tests
126
+ pytest
127
+
128
+ # Lint + format
129
+ ruff check src tests
130
+ ruff format src tests
131
+
132
+ # Type check
133
+ mypy
134
+ ```
135
+
136
+ CI runs the full suite across Python 3.9, 3.10, 3.11, and 3.12 on every push to `main`/`master` and on every pull request.
137
+
138
+ ## License
139
+
140
+ [Apache-2.0](LICENSE)