reactor-runtime 2.7.4__tar.gz → 2.7.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/PKG-INFO +2 -1
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/pyproject.toml +10 -1
- reactor_runtime-2.7.6/src/reactor_runtime/experiment/__init__.py +30 -0
- reactor_runtime-2.7.6/src/reactor_runtime/experiment/session.py +228 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/output_buffer.py +82 -9
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/__init__.py +2 -0
- reactor_runtime-2.7.6/src/reactor_runtime/profiling/nvml_sampler.py +200 -0
- reactor_runtime-2.7.6/src/reactor_runtime/profiling/torch_chunk_profiler.py +399 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/chunk_encoder.py +88 -11
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/config.py +18 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/markers.py +10 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/http/http_runtime.py +83 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/PKG-INFO +2 -1
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/SOURCES.txt +4 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/requires.txt +1 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/README.md +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/setup.cfg +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/api/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/config.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/defaults.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/driver/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/driver/pipeline_executor.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/driver/step_result.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/connected.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/event.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/messages.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/upload.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/input_buffer.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/reactor_core.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/decorators.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/handlers.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/reactor_model.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/idle.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/input_state.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/reactor_pipeline.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/descriptors.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/input.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/output.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/upload.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/model_state.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/base.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/file.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/otlp.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/helpers.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/plotting/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/plotting/plot_profiling.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/profiler.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/singleton.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/chunk_uploader.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/session_recorder.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/sinks.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/track_resolver.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtime_api.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/headless/config.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/headless/headless_runtime.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/headless/input_feeder.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/http/config.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/http/types.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/schema.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/schema_validator.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/__main__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/commands/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/commands/run.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/commands/schema.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/main.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/utils/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/utils/config.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/utils/runtime.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/audio_track.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/client.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/frame_conversion.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/ice_connection.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/video_track.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/config.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/events.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/client.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/av1.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/base.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/factory.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/h264.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/h265.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/opus.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/vp8.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/vp9.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/av1.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/base.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/factory.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/h264.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/h265.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/opus.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/vp8.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/vp9.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/gst.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/gst_helpers.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/probes/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/probes/fps_probe.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/audio.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/base.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/video.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/bundle.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/codec.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/extmap.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/ice.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/__init__.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/audio.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/base.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/video.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/settings.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/signals.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/ice_uris.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/interface.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/media.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/types.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/launch.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/loader.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/log.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/messages.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/paths.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/ports.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/typing.py +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/dependency_links.txt +0 -0
- {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: reactor_runtime
|
|
3
|
-
Version: 2.7.4
|
|
3
|
+
Version: 2.7.6
|
|
4
4
|
Summary: Reactor runtime with public model API
|
|
5
5
|
Author-email: Reactor <team@reactor.inc>
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -23,6 +23,7 @@ Requires-Dist: opentelemetry-exporter-prometheus~=0.63b0
|
|
|
23
23
|
Requires-Dist: grpcio>=1.80.0
|
|
24
24
|
Requires-Dist: grpcio-health-checking>=1.80.0
|
|
25
25
|
Requires-Dist: opentelemetry-instrumentation-grpc~=0.63b0
|
|
26
|
+
Requires-Dist: pynvml>=11.5
|
|
26
27
|
Provides-Extra: gst
|
|
27
28
|
Requires-Dist: PyGObject>=3.56.0; extra == "gst"
|
|
28
29
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "reactor_runtime"
|
|
7
|
-
version = "2.7.4"
|
|
7
|
+
version = "2.7.6"
|
|
8
8
|
description = "Reactor runtime with public model API"
|
|
9
9
|
authors = [
|
|
10
10
|
{ name = "Reactor", email = "team@reactor.inc" }
|
|
@@ -34,6 +34,15 @@ dependencies = [
|
|
|
34
34
|
"grpcio>=1.80.0",
|
|
35
35
|
"grpcio-health-checking>=1.80.0",
|
|
36
36
|
"opentelemetry-instrumentation-grpc~=0.63b0",
|
|
37
|
+
# pynvml is the experiment-tracking NVML sampler's runtime dep. Kept as a
|
|
38
|
+
# default rather than behind an `experiment` extra because production model
|
|
39
|
+
# images install reactor-runtime via the `[gst]` extra path — putting pynvml
|
|
40
|
+
# behind a separate extra meant tracked runs in those images had no VRAM /
|
|
41
|
+
# GPU-util metrics at all (the sampler hit the ImportError path silently).
|
|
42
|
+
# pynvml is pure Python and has no system-level CUDA dep at install time,
|
|
43
|
+
# so it's safe to include in the base install for non-GPU envs too — the
|
|
44
|
+
# sampler still degrades gracefully when nvmlInit fails at runtime.
|
|
45
|
+
"pynvml>=11.5",
|
|
37
46
|
]
|
|
38
47
|
|
|
39
48
|
[project.optional-dependencies]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
"""Experiment-tracking integration for the Reactor runtime.
|
|
4
|
+
|
|
5
|
+
Activates only when ``EXPERIMENT_ID`` is set in the environment. When unset,
|
|
6
|
+
``maybe_build_session()`` returns ``None`` and the runtime takes the same
|
|
7
|
+
code paths it always has — no overhead, no NVML sampling, no profiler.
|
|
8
|
+
|
|
9
|
+
The runtime's only role is to produce artifacts (recording, metrics
|
|
10
|
+
summary, profile trace, derived config, session times) under a single
|
|
11
|
+
directory. The skill side (``iterate-model.sh``) reads that directory
|
|
12
|
+
post-run, uploads to S3 via presigned URLs, and POSTs ``/update_experiment``.
|
|
13
|
+
This keeps the runtime free of HTTP, boto3, and presigned-URL env vars.
|
|
14
|
+
|
|
15
|
+
See ``internal/experiment_tracking/`` for the backing API.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from reactor_runtime.experiment.session import (
|
|
19
|
+
EXPERIMENT_ENV_ARTIFACTS_DIR,
|
|
20
|
+
EXPERIMENT_ENV_ID,
|
|
21
|
+
ExperimentSession,
|
|
22
|
+
maybe_build_session,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"EXPERIMENT_ENV_ARTIFACTS_DIR",
|
|
27
|
+
"EXPERIMENT_ENV_ID",
|
|
28
|
+
"ExperimentSession",
|
|
29
|
+
"maybe_build_session",
|
|
30
|
+
]
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
"""Container-side experiment session — passive artifact producer.
|
|
4
|
+
|
|
5
|
+
When ``EXPERIMENT_ID`` is set in the env on rank0, the runtime drops the
|
|
6
|
+
following files into ``/tmp/experiment-<id>/`` at session stop:
|
|
7
|
+
|
|
8
|
+
- ``session_times.json`` start + end ISO timestamps (UTC)
|
|
9
|
+
- ``metrics_summary.json`` NVML aggregate (avg/peak VRAM, GPU util %)
|
|
10
|
+
- ``derived_config.json`` model._config snapshot (or runtime merged
|
|
11
|
+
config dict if the model didn't expose one)
|
|
12
|
+
- ``profile.pt.trace.json.gz`` torch.profiler chrome trace, only if
|
|
13
|
+
EXPERIMENT_PROFILE_RANGE is set (see
|
|
14
|
+
reactor_runtime.profiling.ChunkRangeProfiler)
|
|
15
|
+
- ``recording.mp4`` fMP4 init.mp4 + chunk_*.m4s byte-concatenated
|
|
16
|
+
into a single playable file, if the runtime
|
|
17
|
+
had recording enabled and the session
|
|
18
|
+
produced chunks
|
|
19
|
+
|
|
20
|
+
That's it — no HTTP, no boto3, no S3. The script (``iterate-model.sh``
|
|
21
|
+
on the developer's machine) reads the artifacts dir over SSH, tars +
|
|
22
|
+
gzips it, POSTs it to ``/experiments/<id>/finalize``. The service
|
|
23
|
+
extracts the JSONs into RDS and routes the binary blobs (profile +
|
|
24
|
+
recording) into S3 using its own task role — the developer never needs
|
|
25
|
+
S3 PUT credentials.
|
|
26
|
+
|
|
27
|
+
Non-rank0 workers return ``None`` from :func:`maybe_build_session` and
|
|
28
|
+
take the runtime's normal no-op path — no duplicate finalize calls.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import json
|
|
34
|
+
import os
|
|
35
|
+
import shutil
|
|
36
|
+
from datetime import datetime, timezone
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
from reactor_runtime.profiling.nvml_sampler import NVMLSampler
|
|
41
|
+
from reactor_runtime.utils.log import get_logger
|
|
42
|
+
|
|
43
|
+
# datetime.UTC is a 3.11+ alias for timezone.utc. CI lint runs mypy with
|
|
44
|
+
# --python-version 3.10, so import timezone.utc and alias it ourselves.
|
|
45
|
+
UTC = timezone.utc
|
|
46
|
+
|
|
47
|
+
logger = get_logger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
EXPERIMENT_ENV_ID = "EXPERIMENT_ID"
|
|
51
|
+
# Override the default artifacts dir. Defaults to /tmp/experiment-<id>/.
|
|
52
|
+
EXPERIMENT_ENV_ARTIFACTS_DIR = "EXPERIMENT_ARTIFACTS_DIR"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ExperimentSession:
    """One in-flight experiment session.

    Owns the artifacts directory and the session start/end timestamps, and
    writes every artifact into that directory. Nothing in this class performs
    network I/O; shipping the directory is the caller's responsibility
    (``iterate-model.sh``).
    """

    def __init__(
        self,
        experiment_id: str,
        artifacts_dir: Path,
    ) -> None:
        """Remember the run identity and create ``artifacts_dir``.

        Args:
            experiment_id: Identifier attached to the NVML sampler attributes
                and to the log lines emitted by this session.
            artifacts_dir: Directory every artifact is written into. Created
                (with parents) if it does not exist yet.

        Raises:
            OSError: If the directory cannot be created.
        """
        self.experiment_id = experiment_id
        self.artifacts_dir = artifacts_dir
        self.started_at: datetime | None = None
        self.ended_at: datetime | None = None
        self._nvml_sampler: NVMLSampler | None = None
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

    def mark_start(self, device_index: int = 0) -> None:
        """Stamp the session start (UTC) and start NVML sampling.

        Sampling is best-effort: any failure to build or start the sampler is
        logged and the session continues without GPU metrics.

        Args:
            device_index: Device index handed to :class:`NVMLSampler`.
        """
        self.started_at = datetime.now(UTC)
        try:
            self._nvml_sampler = NVMLSampler(
                device_index=device_index,
                attrs={"experiment_id": self.experiment_id},
            )
            self._nvml_sampler.start()
        except Exception:
            logger.exception("Failed to start NVML sampler; continuing without")
            self._nvml_sampler = None

    def mark_end(self) -> None:
        """Stamp the session end (UTC)."""
        self.ended_at = datetime.now(UTC)

    def finalize(
        self,
        *,
        derived_config: dict[str, Any] | None,
        recording_session_dir: Path | None = None,
    ) -> None:
        """Write every artifact the skill will ship into ``self.artifacts_dir``.

        Each artifact is independent: a failure on one is logged and does not
        prevent the others from being written. The skill treats a missing
        file as "this artifact wasn't produced".

        Note: the torch.profiler chrome trace and key_averages summary
        (``profile.pt.trace.json.gz`` / ``profile.summary.txt``) are not
        handled here. They are expected to be written directly into
        ``self.artifacts_dir`` by the model's worker process via
        :class:`reactor_runtime.profiling.ChunkRangeProfiler`.

        Args:
            derived_config: Config snapshot to dump as ``derived_config.json``;
                ``None`` skips that artifact.
            recording_session_dir: Directory holding ``init.mp4`` and
                ``chunk_*.m4s``; ``None`` skips ``recording.mp4``.
        """
        # session_times.json only makes sense when both stamps were taken.
        if self.started_at is not None and self.ended_at is not None:
            self._write_json(
                "session_times.json",
                {
                    "start": self.started_at.isoformat(),
                    "end": self.ended_at.isoformat(),
                },
            )

        metrics_summary = self._stop_sampling_and_summarize()
        if metrics_summary is not None:
            self._write_json("metrics_summary.json", metrics_summary)

        if derived_config is not None:
            self._write_json("derived_config.json", derived_config)

        if recording_session_dir is not None:
            self._assemble_recording(recording_session_dir)

        logger.info(
            "Experiment artifacts finalized",
            experiment_id=self.experiment_id,
            artifacts_dir=str(self.artifacts_dir),
        )

    @staticmethod
    def _chunk_sort_key(chunk: Path) -> tuple[int, int, str]:
        """Order ``chunk_<N>.m4s`` files by the integer ``N``.

        A plain lexicographic sort puts ``chunk_10`` before ``chunk_2`` when
        the index is not zero-padded. Zero-padded names sort the same either
        way. Names without a decimal suffix go last, ordered by name.
        """
        suffix = chunk.stem.rpartition("_")[2]
        if suffix.isdecimal():
            return (0, int(suffix), chunk.name)
        return (1, 0, chunk.name)

    def _assemble_recording(self, session_dir: Path) -> None:
        """Concatenate ``init.mp4 + chunk_*.m4s`` into ``recording.mp4``.

        Chunks are streamed through ``copyfileobj`` rather than buffered in
        RAM. The output is built under a temporary ``.partial`` name and
        renamed into place only once every part was copied, so a failed copy
        never leaves a truncated ``recording.mp4`` behind for the consumer to
        mistake for a complete artifact.

        Does nothing when ``init.mp4`` is absent (recording disabled or the
        session was empty). ``OSError`` is logged, not raised.

        Args:
            session_dir: Directory containing ``init.mp4`` and the chunks.
        """
        init = session_dir / "init.mp4"
        if not init.exists():
            return  # Recording disabled or session was empty.

        chunks = sorted(session_dir.glob("chunk_*.m4s"), key=self._chunk_sort_key)
        dest = self.artifacts_dir / "recording.mp4"
        partial = dest.with_name(dest.name + ".partial")
        try:
            with open(partial, "wb") as out:
                for part in (init, *chunks):
                    with open(part, "rb") as src:
                        shutil.copyfileobj(src, out)
            # Same-directory rename: the final name appears only when complete.
            os.replace(partial, dest)
        except OSError as err:
            logger.warning(
                "Failed to assemble recording.mp4",
                session_dir=str(session_dir),
                error=str(err),
            )
            try:
                partial.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; the failure itself is already logged.
                pass

    def _stop_sampling_and_summarize(self) -> dict[str, Any] | None:
        """Stop the NVML sampler and reduce its state to a summary dict.

        Returns:
            ``None`` when no sampler is running or stopping it failed;
            otherwise a dict with ``samples``, ``avg_vram_gb``,
            ``peak_vram_gb`` and ``avg_gpu_util_pct``.
        """
        sampler = self._nvml_sampler
        if sampler is None:
            return None
        # Detach first so a second finalize() cannot stop the sampler twice.
        self._nvml_sampler = None
        try:
            sampler.stop()
        except Exception:
            logger.exception("NVML sampler stop failed")
            return None
        state = sampler.state
        samples = int(state.get("samples", 0))
        # Clamp the divisor so a run with zero samples reports 0.0 averages.
        n = max(samples, 1)
        return {
            "samples": samples,
            "avg_vram_gb": round(float(state.get("vram_sum_gb", 0.0)) / n, 3),
            "peak_vram_gb": round(float(state.get("vram_peak_gb", 0.0)), 3),
            "avg_gpu_util_pct": round(float(state.get("gpu_util_sum_pct", 0.0)) / n, 2),
        }

    def _write_json(self, name: str, payload: Any) -> None:
        """Serialise ``payload`` to ``artifacts_dir / name`` as indented JSON.

        Values JSON cannot encode are stringified via ``default=str``. Every
        failure is logged and swallowed so one bad artifact cannot abort
        :meth:`finalize`. That includes serialisation errors: ``json.dumps``
        raises ``TypeError`` for unsupported dict keys and ``ValueError`` for
        circular references, neither of which ``default=str`` covers.

        Args:
            name: File name inside ``artifacts_dir``.
            payload: JSON-serialisable object.
        """
        path = self.artifacts_dir / name
        try:
            # dumps() runs before write_text(), so a serialisation error
            # never creates or truncates the target file.
            path.write_text(json.dumps(payload, default=str, indent=2))
        except (OSError, TypeError, ValueError) as err:
            logger.warning(
                "Failed to write experiment artifact",
                name=name,
                error=str(err),
            )
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def maybe_build_session() -> ExperimentSession | None:
    """Return an ``ExperimentSession`` iff this is a tracked run on rank0.

    A run is tracked when ``EXPERIMENT_ID`` is set to a non-empty value. The
    artifacts directory comes from ``EXPERIMENT_ARTIFACTS_DIR`` when set,
    otherwise it defaults to ``/tmp/experiment-<id>``.

    Returns:
        A new session for tracked rank0 runs; ``None`` for non-rank0 workers,
        untracked runs, and workers whose ``RANK`` cannot be parsed. Callers
        must treat ``None`` as "no tracking, behave as before".
    """
    # Only rank0 produces artifacts. Multiple ranks writing into the same
    # artifacts dir would race on the JSON files, so the whole subsystem
    # stays inert on every other worker.
    raw_rank = os.environ.get("RANK", "").strip()
    try:
        # Unset or blank RANK means a single-process launch, i.e. rank 0.
        rank = int(raw_rank) if raw_rank else 0
    except ValueError:
        # A malformed RANK must not crash the runtime from an optional
        # subsystem. This worker cannot be proven to be rank0, so stay inert.
        logger.warning(
            "RANK is not an integer; experiment tracking disabled on this worker",
            rank=raw_rank,
        )
        return None
    if rank != 0:
        return None

    experiment_id = os.environ.get(EXPERIMENT_ENV_ID)
    if not experiment_id:
        return None

    artifacts_dir = Path(
        os.environ.get(EXPERIMENT_ENV_ARTIFACTS_DIR)
        or f"/tmp/experiment-{experiment_id}"
    )

    logger.info(
        "Experiment tracking enabled",
        experiment_id=experiment_id,
        artifacts_dir=str(artifacts_dir),
    )
    return ExperimentSession(
        experiment_id=experiment_id,
        artifacts_dir=artifacts_dir,
    )
|
|
@@ -102,6 +102,34 @@ def split_batch(bundle: MediaBundle) -> List[MediaBundle]:
|
|
|
102
102
|
return result
|
|
103
103
|
|
|
104
104
|
|
|
105
|
+
class _FlushMarker:
|
|
106
|
+
"""Sentinel placed in :class:`OutputBuffer`'s queue by :meth:`flush`.
|
|
107
|
+
|
|
108
|
+
Carries no payload. When the emission loop dequeues an instance, it
|
|
109
|
+
resets ``_last_emitted`` *in-thread* and falls through to this
|
|
110
|
+
tick's empty-queue fallback (which dispatches a fresh
|
|
111
|
+
``_create_black_bundle()`` with ``duplicate=True``).
|
|
112
|
+
|
|
113
|
+
The point of routing the session-boundary reset through the queue
|
|
114
|
+
rather than writing ``_last_emitted = None`` directly from
|
|
115
|
+
:meth:`flush` is to make the reset atomic with respect to the
|
|
116
|
+
emission thread: only the emission thread reads or writes
|
|
117
|
+
``_last_emitted``, so the loop can never observe a half-applied
|
|
118
|
+
"queue drained but cached frame not yet cleared" state. Down-stream
|
|
119
|
+
callbacks already drop ``duplicate=True`` bundles (the wire
|
|
120
|
+
callback in ``_send_out_app_bundle_sync`` short-circuits, and the
|
|
121
|
+
:class:`~reactor_runtime.recording.session_recorder.SessionRecorder`
|
|
122
|
+
skips them when ``skip_leading_black`` is on and
|
|
123
|
+
``recording_started`` is still False), so no stale frame would
|
|
124
|
+
reach the client or recording even without this guarantee — but
|
|
125
|
+
closing the race architecturally is cheaper than re-deriving that
|
|
126
|
+
safety argument for every callback added in the future.
|
|
127
|
+
"""
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
_FLUSH_MARKER: _FlushMarker = _FlushMarker()
|
|
131
|
+
|
|
132
|
+
|
|
105
133
|
class OutputBuffer:
|
|
106
134
|
"""Rate-controlled emission buffer.
|
|
107
135
|
|
|
@@ -127,7 +155,11 @@ class OutputBuffer:
|
|
|
127
155
|
self._callbacks: List[Callable[[MediaBundle, bool], None]] = []
|
|
128
156
|
self._callbacks_lock: threading.Lock = threading.Lock()
|
|
129
157
|
|
|
130
|
-
|
|
158
|
+
# Queue items are normally MediaBundle, but flush() also puts a
|
|
159
|
+
# _FlushMarker sentinel here. We type the queue as ``object`` so
|
|
160
|
+
# both shapes are valid; the emission loop discriminates with
|
|
161
|
+
# ``isinstance``.
|
|
162
|
+
self._q: queue.Queue[object] = queue.Queue(maxsize=queue_depth)
|
|
131
163
|
|
|
132
164
|
# FPS control — store both rate and period to avoid 1/fps on every tick
|
|
133
165
|
self._fixed_fps: float = 0.0
|
|
@@ -322,11 +354,24 @@ class OutputBuffer:
|
|
|
322
354
|
while not self._emission_stop.is_set():
|
|
323
355
|
interval = self._interval
|
|
324
356
|
|
|
325
|
-
|
|
357
|
+
item: object
|
|
326
358
|
try:
|
|
327
|
-
|
|
359
|
+
item = self._q.get_nowait()
|
|
328
360
|
except queue.Empty:
|
|
329
|
-
|
|
361
|
+
item = None
|
|
362
|
+
|
|
363
|
+
bundle: Optional[MediaBundle] = None
|
|
364
|
+
if isinstance(item, _FlushMarker):
|
|
365
|
+
# Session-boundary reset, processed in the emission
|
|
366
|
+
# thread so there is no cross-thread race with
|
|
367
|
+
# flush(): clear the cached frame and fall through
|
|
368
|
+
# to the empty-queue fallback below, which
|
|
369
|
+
# dispatches duplicate=True black. mark_first_real_frame()
|
|
370
|
+
# is gated on duplicate=False, so the recorder's
|
|
371
|
+
# latch (REA-2323 / #2325) stays unset.
|
|
372
|
+
self._last_emitted = None
|
|
373
|
+
elif isinstance(item, MediaBundle):
|
|
374
|
+
bundle = item
|
|
330
375
|
|
|
331
376
|
if bundle is not None:
|
|
332
377
|
vtracks = bundle.get_tracks_by_kind(TrackKind.VIDEO)
|
|
@@ -464,11 +509,39 @@ class OutputBuffer:
|
|
|
464
509
|
self._drain_queue()
|
|
465
510
|
|
|
466
511
|
def flush(self) -> None:
|
|
467
|
-
"""
|
|
512
|
+
"""Drop pending bundles and request a session-boundary reset.
|
|
513
|
+
|
|
514
|
+
The reset of ``_last_emitted`` is performed by the emission
|
|
515
|
+
thread when it dequeues the :class:`_FlushMarker` sentinel,
|
|
516
|
+
making the operation race-free with the per-tick
|
|
517
|
+
"what do I emit?" decision in :meth:`_emission_loop`. After the
|
|
518
|
+
sentinel is consumed, the next emission tick synthesises
|
|
519
|
+
``_create_black_bundle()`` with ``duplicate=True`` — the
|
|
520
|
+
correct pre-roll behaviour for a session boundary:
|
|
521
|
+
|
|
522
|
+
* the wire callback (``_send_out_app_bundle_sync``) drops
|
|
523
|
+
``duplicate=True`` outright;
|
|
524
|
+
* the recorder's ``mark_first_real_frame()`` latch is not
|
|
525
|
+
tripped (REA-2323 / #2325), so the recording timeline only
|
|
526
|
+
starts at the next real model frame.
|
|
527
|
+
|
|
528
|
+
Safe to call from any thread; in practice ``flush()`` is
|
|
529
|
+
called from the model thread (``model-run``), the same thread
|
|
530
|
+
that calls :meth:`submit`, so there is no submit/flush race
|
|
531
|
+
on the producer side either.
|
|
532
|
+
"""
|
|
468
533
|
self._drain_queue()
|
|
469
|
-
self._last_emitted = None
|
|
470
|
-
black = self._create_black_bundle()
|
|
471
534
|
try:
|
|
472
|
-
self._q.put_nowait(
|
|
535
|
+
self._q.put_nowait(_FLUSH_MARKER)
|
|
473
536
|
except queue.Full:
|
|
474
|
-
|
|
537
|
+
# Unreachable in practice: the queue is bounded and the
|
|
538
|
+
# only producer (submit) runs on the same thread as
|
|
539
|
+
# flush, so nothing can refill the queue between drain
|
|
540
|
+
# and put. Log loudly if it ever happens and continue —
|
|
541
|
+
# the next real frame from the new session will set
|
|
542
|
+
# _last_emitted correctly even if the sentinel never
|
|
543
|
+
# lands.
|
|
544
|
+
logger.warning(
|
|
545
|
+
"OutputBuffer.flush: queue full immediately after drain; "
|
|
546
|
+
"reset sentinel dropped"
|
|
547
|
+
)
|
|
@@ -76,9 +76,11 @@ from reactor_runtime.profiling.profiler import (
|
|
|
76
76
|
from reactor_runtime.profiling.singleton import get_profiler, set_profiler
|
|
77
77
|
from reactor_runtime.profiling.backends.base import ProfilerBackend
|
|
78
78
|
from reactor_runtime.profiling.helpers import profile_fn
|
|
79
|
+
from reactor_runtime.profiling.torch_chunk_profiler import ChunkRangeProfiler
|
|
79
80
|
|
|
80
81
|
__all__ = [
|
|
81
82
|
"BucketPreset",
|
|
83
|
+
"ChunkRangeProfiler",
|
|
82
84
|
"CudaTimingMode",
|
|
83
85
|
"Profiler",
|
|
84
86
|
"ProfilerSection",
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
"""NVML resource sampler — general-purpose GPU observability.
|
|
4
|
+
|
|
5
|
+
Polls VRAM use + GPU utilization at 1Hz on a daemon thread and emits them
|
|
6
|
+
as OTLP gauges through the meter pipeline (set up by
|
|
7
|
+
``reactor_machine_metrics``). The exporter ships them to Grafana via the
|
|
8
|
+
same path the rest of the runtime uses.
|
|
9
|
+
|
|
10
|
+
Activation is the caller's responsibility — typical pattern is
|
|
11
|
+
"start it when there's a session you care about, stop it when the
|
|
12
|
+
session ends". The sampler doesn't know about experiments or sessions;
|
|
13
|
+
it just samples and emits. Callers that want a snapshot-style summary
|
|
14
|
+
(e.g. for inclusion in a row update) can read ``.state`` at stop time
|
|
15
|
+
and aggregate however they like.
|
|
16
|
+
|
|
17
|
+
Cost: ~2 NVML calls per second on a side thread. The main thread and
|
|
18
|
+
CUDA streams are untouched.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import threading
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from opentelemetry import metrics
|
|
27
|
+
|
|
28
|
+
from reactor_runtime.utils.log import get_logger
|
|
29
|
+
|
|
30
|
+
logger = get_logger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class NVMLSampler:
    """Background NVML poller emitting OTLP gauges + maintaining state.

    A daemon thread polls NVML for the device's used memory and GPU
    utilization every ``interval_s`` seconds. The most recent values are
    exposed through two observable gauges registered at construction time,
    and running sums / peak are kept for :attr:`state`.

    The sampler may be restarted: every :meth:`start` gets a fresh stop
    event, so ``start()`` after ``stop()`` samples again. Counters are never
    reset, so they accumulate over the lifetime of the instance, across
    ``start()``/``stop()`` cycles.

    Parameters
    ----------
    device_index:
        CUDA device to poll. Defaults to 0.
    interval_s:
        Poll interval in seconds. 1Hz default. Must be positive.
    attrs:
        Label dict applied to every emitted observation. Use this to
        attach context like ``{"experiment_id": "exp_..."}`` or
        ``{"model_name": "delta-forcing"}``. ``None`` → empty attrs
        (gauges still emit, just unlabeled beyond the global meter labels).
    meter_name:
        OTel meter name. Defaults to ``reactor.machine`` — keep this
        consistent with what your dashboards expect.
    metric_namespace:
        Prefix for emitted metrics. ``reactor.machine`` → emits
        ``reactor.machine.vram_used_gb`` + ``reactor.machine.gpu_util_pct``.

    Raises
    ------
    ValueError
        If ``interval_s`` is not a positive number.
    """

    def __init__(
        self,
        device_index: int = 0,
        interval_s: float = 1.0,
        attrs: dict[str, str] | None = None,
        meter_name: str = "reactor.machine",
        metric_namespace: str = "reactor.machine",
    ) -> None:
        # A zero/negative (or NaN) interval would make Event.wait() return
        # immediately and turn the sampling loop into a busy loop on NVML.
        if not interval_s > 0:
            raise ValueError(f"interval_s must be positive, got {interval_s!r}")

        self._device_index = device_index
        self._interval_s = interval_s
        self._stop_evt = threading.Event()
        self._thread: threading.Thread | None = None

        meter = metrics.get_meter(meter_name)
        # Copy so later mutation of the caller's dict does not change labels.
        self._attrs: dict[str, str] = dict(attrs or {})

        # In-memory state read by .state — used for snapshot-style
        # aggregation at session-stop (avg/peak summaries etc.).
        self._last_vram_gb: float = 0.0
        self._last_gpu_util_pct: float = 0.0
        self._sample_count: int = 0
        self._vram_sum_gb: float = 0.0
        self._gpu_util_sum_pct: float = 0.0
        self._vram_peak_gb: float = 0.0

        meter.create_observable_gauge(
            name=f"{metric_namespace}.vram_used_gb",
            callbacks=[self._observe_vram],
            description="VRAM used (GB), sampled at 1Hz",
            unit="GB",
        )
        meter.create_observable_gauge(
            name=f"{metric_namespace}.gpu_util_pct",
            callbacks=[self._observe_gpu_util],
            description="GPU utilization (%), sampled at 1Hz",
            unit="%",
        )

    # ------------------------------------------------------------------
    # OTel callbacks
    # ------------------------------------------------------------------

    def _observe_vram(self, _options: Any) -> list[Any]:
        """Gauge callback: report the last sampled VRAM use in GB.

        Reports the initial 0.0 until the first sample succeeds.
        """
        return [metrics.Observation(self._last_vram_gb, self._attrs)]

    def _observe_gpu_util(self, _options: Any) -> list[Any]:
        """Gauge callback: report the last sampled GPU utilization in percent.

        Reports the initial 0.0 until the first sample succeeds.
        """
        return [metrics.Observation(self._last_gpu_util_pct, self._attrs)]

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start(self) -> None:
        """Start the sampling thread; no-op if it was already started."""
        if self._thread is not None:
            return
        # Fresh event per run: the previous one is left set by stop(), and
        # reusing it would make the new thread exit immediately. A new object
        # (rather than clear()) also guarantees that a worker which outlived
        # stop()'s join timeout still sees its own event set and terminates.
        stop_evt = threading.Event()
        self._stop_evt = stop_evt
        self._thread = threading.Thread(
            target=self._run,
            args=(stop_evt,),
            name=f"nvml-sampler-{self._device_index}",
            daemon=True,
        )
        self._thread.start()

    def stop(self) -> None:
        """Signal the sampling thread to exit and wait up to 2 s for it."""
        self._stop_evt.set()
        thread = self._thread
        if thread is not None and thread.is_alive():
            thread.join(timeout=2.0)
        self._thread = None

    # ------------------------------------------------------------------
    # Sampling loop
    # ------------------------------------------------------------------

    def _run(self, stop_evt: threading.Event | None = None) -> None:
        """Thread body: initialise NVML, sample until stopped, shut down.

        ``stop_evt`` is the event owned by this run; it falls back to the
        instance's current event when not supplied.
        """
        if stop_evt is None:
            stop_evt = self._stop_evt

        try:
            import pynvml  # type: ignore
        except ImportError:
            logger.warning("pynvml not installed; NVML sampler disabled")
            return

        try:
            pynvml.nvmlInit()
        except Exception as err:
            logger.warning("nvmlInit failed", error=str(err))
            return

        # Once nvmlInit() succeeds we MUST pair it with nvmlShutdown(),
        # even if handle lookup below fails — otherwise NVML's library
        # state leaks for the lifetime of the process.
        try:
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(self._device_index)
            except Exception as err:
                logger.warning(
                    "nvmlDeviceGetHandleByIndex failed",
                    device_index=self._device_index,
                    error=str(err),
                )
                return

            while not stop_evt.is_set():
                try:
                    self._sample_once(pynvml, handle)
                except Exception as err:
                    # Best effort: a failed poll is skipped, not fatal.
                    logger.debug("NVML sample failed", error=str(err))
                # Sleep on the stop-event so stop() unblocks immediately.
                stop_evt.wait(self._interval_s)
        finally:
            try:
                pynvml.nvmlShutdown()
            except Exception as err:
                logger.debug("nvmlShutdown failed", error=str(err))

    def _sample_once(self, pynvml: Any, handle: Any) -> None:
        """Take one NVML reading and fold it into the in-memory counters."""
        # Query both values before touching any state so a failing call
        # leaves the counters consistent with each other.
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        vram_gb = mem.used / 1e9
        gpu_pct = float(util.gpu)

        self._last_vram_gb = vram_gb
        self._last_gpu_util_pct = gpu_pct
        self._vram_sum_gb += vram_gb
        self._gpu_util_sum_pct += gpu_pct
        self._vram_peak_gb = max(self._vram_peak_gb, vram_gb)
        self._sample_count += 1

    # ------------------------------------------------------------------
    # State accessor for snapshot aggregation
    # ------------------------------------------------------------------

    @property
    def state(self) -> dict[str, Any]:
        """Raw counter state — caller aggregates as they wish.

        Snapshot semantics: returns the current counters at call time. No
        lock is taken, so when read from another thread the worst case is a
        sample number that doesn't quite match the sums (off by one). For
        the avg/peak rollups used by experiment-tracking that's fine.
        """
        return {
            "samples": self._sample_count,
            "vram_sum_gb": self._vram_sum_gb,
            "vram_peak_gb": self._vram_peak_gb,
            "gpu_util_sum_pct": self._gpu_util_sum_pct,
        }
|