reactor-runtime 2.7.4__tar.gz → 2.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/PKG-INFO +2 -1
  2. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/pyproject.toml +10 -1
  3. reactor_runtime-2.7.5/src/reactor_runtime/experiment/__init__.py +30 -0
  4. reactor_runtime-2.7.5/src/reactor_runtime/experiment/session.py +228 -0
  5. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/__init__.py +2 -0
  6. reactor_runtime-2.7.5/src/reactor_runtime/profiling/nvml_sampler.py +200 -0
  7. reactor_runtime-2.7.5/src/reactor_runtime/profiling/torch_chunk_profiler.py +399 -0
  8. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtimes/http/http_runtime.py +83 -0
  9. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime.egg-info/PKG-INFO +2 -1
  10. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime.egg-info/SOURCES.txt +4 -0
  11. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime.egg-info/requires.txt +1 -0
  12. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/README.md +0 -0
  13. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/setup.cfg +0 -0
  14. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/api/__init__.py +0 -0
  15. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/__init__.py +0 -0
  16. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/config.py +0 -0
  17. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/__init__.py +0 -0
  18. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/defaults.py +0 -0
  19. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/driver/__init__.py +0 -0
  20. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/driver/pipeline_executor.py +0 -0
  21. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/driver/step_result.py +0 -0
  22. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/events/__init__.py +0 -0
  23. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/events/connected.py +0 -0
  24. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/events/event.py +0 -0
  25. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/events/messages.py +0 -0
  26. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/events/upload.py +0 -0
  27. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/internal/__init__.py +0 -0
  28. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/internal/input_buffer.py +0 -0
  29. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/internal/output_buffer.py +0 -0
  30. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/internal/reactor_core.py +0 -0
  31. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/model/__init__.py +0 -0
  32. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/model/decorators.py +0 -0
  33. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/model/handlers.py +0 -0
  34. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/model/reactor_model.py +0 -0
  35. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/pipeline/__init__.py +0 -0
  36. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/pipeline/idle.py +0 -0
  37. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/pipeline/input_state.py +0 -0
  38. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/pipeline/reactor_pipeline.py +0 -0
  39. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/tracks/__init__.py +0 -0
  40. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/tracks/descriptors.py +0 -0
  41. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/tracks/input.py +0 -0
  42. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/tracks/output.py +0 -0
  43. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/interface/upload.py +0 -0
  44. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/model_state.py +0 -0
  45. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/backends/__init__.py +0 -0
  46. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/backends/base.py +0 -0
  47. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/backends/file.py +0 -0
  48. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/backends/otlp.py +0 -0
  49. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/helpers.py +0 -0
  50. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/plotting/__init__.py +0 -0
  51. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/plotting/plot_profiling.py +0 -0
  52. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/profiler.py +0 -0
  53. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/singleton.py +0 -0
  54. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/__init__.py +0 -0
  55. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/chunk_encoder.py +0 -0
  56. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/chunk_uploader.py +0 -0
  57. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/config.py +0 -0
  58. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/markers.py +0 -0
  59. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/session_recorder.py +0 -0
  60. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/sinks.py +0 -0
  61. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/recording/track_resolver.py +0 -0
  62. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtime_api.py +0 -0
  63. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtimes/headless/config.py +0 -0
  64. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtimes/headless/headless_runtime.py +0 -0
  65. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtimes/headless/input_feeder.py +0 -0
  66. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtimes/http/config.py +0 -0
  67. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/runtimes/http/types.py +0 -0
  68. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/schema.py +0 -0
  69. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/schema_validator.py +0 -0
  70. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/__init__.py +0 -0
  71. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/__main__.py +0 -0
  72. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/commands/__init__.py +0 -0
  73. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/commands/run.py +0 -0
  74. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/commands/schema.py +0 -0
  75. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/main.py +0 -0
  76. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/utils/__init__.py +0 -0
  77. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/utils/config.py +0 -0
  78. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/serve/utils/runtime.py +0 -0
  79. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/__init__.py +0 -0
  80. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/aiortc/__init__.py +0 -0
  81. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/aiortc/audio_track.py +0 -0
  82. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/aiortc/client.py +0 -0
  83. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/aiortc/frame_conversion.py +0 -0
  84. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/aiortc/ice_connection.py +0 -0
  85. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/aiortc/video_track.py +0 -0
  86. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/config.py +0 -0
  87. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/events.py +0 -0
  88. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/__init__.py +0 -0
  89. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/client.py +0 -0
  90. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/__init__.py +0 -0
  91. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/av1.py +0 -0
  92. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/base.py +0 -0
  93. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/factory.py +0 -0
  94. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/h264.py +0 -0
  95. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/h265.py +0 -0
  96. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/opus.py +0 -0
  97. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/vp8.py +0 -0
  98. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/decoders/vp9.py +0 -0
  99. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/__init__.py +0 -0
  100. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/av1.py +0 -0
  101. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/base.py +0 -0
  102. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/factory.py +0 -0
  103. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/h264.py +0 -0
  104. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/h265.py +0 -0
  105. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/opus.py +0 -0
  106. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/vp8.py +0 -0
  107. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/encoders/vp9.py +0 -0
  108. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/gst.py +0 -0
  109. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/gst_helpers.py +0 -0
  110. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/probes/__init__.py +0 -0
  111. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/probes/fps_probe.py +0 -0
  112. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/receiver/__init__.py +0 -0
  113. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/receiver/audio.py +0 -0
  114. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/receiver/base.py +0 -0
  115. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/receiver/video.py +0 -0
  116. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sdp/__init__.py +0 -0
  117. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sdp/bundle.py +0 -0
  118. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sdp/codec.py +0 -0
  119. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sdp/extmap.py +0 -0
  120. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sdp/ice.py +0 -0
  121. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sender/__init__.py +0 -0
  122. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sender/audio.py +0 -0
  123. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sender/base.py +0 -0
  124. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/sender/video.py +0 -0
  125. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/settings.py +0 -0
  126. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/gstreamer/signals.py +0 -0
  127. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/ice_uris.py +0 -0
  128. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/interface.py +0 -0
  129. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/media.py +0 -0
  130. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/transports/types.py +0 -0
  131. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/launch.py +0 -0
  132. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/loader.py +0 -0
  133. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/log.py +0 -0
  134. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/messages.py +0 -0
  135. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/paths.py +0 -0
  136. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/ports.py +0 -0
  137. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/utils/typing.py +0 -0
  138. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime.egg-info/dependency_links.txt +0 -0
  139. {reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: reactor_runtime
3
- Version: 2.7.4
3
+ Version: 2.7.5
4
4
  Summary: Reactor runtime with public model API
5
5
  Author-email: Reactor <team@reactor.inc>
6
6
  Requires-Python: >=3.9
@@ -23,6 +23,7 @@ Requires-Dist: opentelemetry-exporter-prometheus~=0.63b0
23
23
  Requires-Dist: grpcio>=1.80.0
24
24
  Requires-Dist: grpcio-health-checking>=1.80.0
25
25
  Requires-Dist: opentelemetry-instrumentation-grpc~=0.63b0
26
+ Requires-Dist: pynvml>=11.5
26
27
  Provides-Extra: gst
27
28
  Requires-Dist: PyGObject>=3.56.0; extra == "gst"
28
29
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "reactor_runtime"
7
- version = "2.7.4"
7
+ version = "2.7.5"
8
8
  description = "Reactor runtime with public model API"
9
9
  authors = [
10
10
  { name = "Reactor", email = "team@reactor.inc" }
@@ -34,6 +34,15 @@ dependencies = [
34
34
  "grpcio>=1.80.0",
35
35
  "grpcio-health-checking>=1.80.0",
36
36
  "opentelemetry-instrumentation-grpc~=0.63b0",
37
+ # pynvml is the experiment-tracking NVML sampler's runtime dep. Kept as a
38
+ # default rather than behind an `experiment` extra because production model
39
+ # images install reactor-runtime via the `[gst]` extra path — putting pynvml
40
+ # behind a separate extra meant tracked runs in those images had no VRAM /
41
+ # GPU-util metrics at all (the sampler hit the ImportError path silently).
42
+ # pynvml is pure Python and has no system-level CUDA dep at install time,
43
+ # so it's safe to include in the base install for non-GPU envs too — the
44
+ # sampler still degrades gracefully when nvmlInit fails at runtime.
45
+ "pynvml>=11.5",
37
46
  ]
38
47
 
39
48
  [project.optional-dependencies]
@@ -0,0 +1,30 @@
1
+ # Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
2
+
3
+ """Experiment-tracking integration for the Reactor runtime.
4
+
5
+ Activates only when ``EXPERIMENT_ID`` is set in the environment. When unset,
6
+ ``maybe_build_session()`` returns ``None`` and the runtime takes the same
7
+ code paths it always has — no overhead, no NVML sampling, no profiler.
8
+
9
+ The runtime's only role is to produce artifacts (recording, metrics
10
+ summary, profile trace, derived config, session times) under a single
11
+ directory. The skill side (``iterate-model.sh``) reads that directory
12
+ post-run, uploads to S3 via presigned URLs, and POSTs ``/update_experiment``.
13
+ This keeps the runtime free of HTTP, boto3, and presigned-URL env vars.
14
+
15
+ See ``internal/experiment_tracking/`` for the backing API.
16
+ """
17
+
18
+ from reactor_runtime.experiment.session import (
19
+ EXPERIMENT_ENV_ARTIFACTS_DIR,
20
+ EXPERIMENT_ENV_ID,
21
+ ExperimentSession,
22
+ maybe_build_session,
23
+ )
24
+
25
+ __all__ = [
26
+ "EXPERIMENT_ENV_ARTIFACTS_DIR",
27
+ "EXPERIMENT_ENV_ID",
28
+ "ExperimentSession",
29
+ "maybe_build_session",
30
+ ]
@@ -0,0 +1,228 @@
1
+ # Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
2
+
3
+ """Container-side experiment session — passive artifact producer.
4
+
5
+ When ``EXPERIMENT_ID`` is set in the env on rank0, the runtime drops the
6
+ following files into ``/tmp/experiment-<id>/`` at session stop:
7
+
8
+ - ``session_times.json`` start + end ISO timestamps (UTC)
9
+ - ``metrics_summary.json`` NVML aggregate (avg/peak VRAM, GPU util %)
10
+ - ``derived_config.json`` model._config snapshot (or runtime merged
11
+ config dict if the model didn't expose one)
12
+ - ``profile.pt.trace.json.gz`` torch.profiler chrome trace, only if
13
+ EXPERIMENT_PROFILE_RANGE is set (see
14
+ reactor_runtime.profiling.ChunkRangeProfiler)
15
+ - ``recording.mp4`` fMP4 init.mp4 + chunk_*.m4s byte-concatenated
16
+ into a single playable file, if the runtime
17
+ had recording enabled and the session
18
+ produced chunks
19
+
20
+ That's it — no HTTP, no boto3, no S3. The script (``iterate-model.sh``
21
+ on the developer's machine) reads the artifacts dir over SSH, tars +
22
+ gzips it, POSTs it to ``/experiments/<id>/finalize``. The service
23
+ extracts the JSONs into RDS and routes the binary blobs (profile +
24
+ recording) into S3 using its own task role — the developer never needs
25
+ S3 PUT credentials.
26
+
27
+ Non-rank0 workers return ``None`` from :func:`maybe_build_session` and
28
+ take the runtime's normal no-op path — no duplicate finalize calls.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import json
34
+ import os
35
+ import shutil
36
+ from datetime import datetime, timezone
37
+ from pathlib import Path
38
+ from typing import Any
39
+
40
+ from reactor_runtime.profiling.nvml_sampler import NVMLSampler
41
+ from reactor_runtime.utils.log import get_logger
42
+
43
+ # datetime.UTC is a 3.11+ alias for timezone.utc. CI lint runs mypy with
44
+ # --python-version 3.10, so import timezone.utc and alias it ourselves.
45
+ UTC = timezone.utc
46
+
47
+ logger = get_logger(__name__)
48
+
49
+
50
+ EXPERIMENT_ENV_ID = "EXPERIMENT_ID"
51
+ # Override the default artifacts dir. Defaults to /tmp/experiment-<id>/.
52
+ EXPERIMENT_ENV_ARTIFACTS_DIR = "EXPERIMENT_ARTIFACTS_DIR"
53
+
54
+
55
class ExperimentSession:
    """One in-flight experiment session.

    Owns the artifacts directory and the session-start/end timestamps. All
    network egress is the caller's responsibility (``iterate-model.sh``).

    Expected call order is ``mark_start()`` -> ``mark_end()`` ->
    ``finalize()``. Every artifact write is best-effort: failures are logged
    and never raised to the caller.
    """

    def __init__(
        self,
        experiment_id: str,
        artifacts_dir: Path,
    ) -> None:
        """Store the session identity and create ``artifacts_dir``.

        Parameters
        ----------
        experiment_id:
            Identifier attached to log lines and to the NVML sampler attrs.
        artifacts_dir:
            Directory that receives every artifact. Created (with parents)
            if it does not exist yet.

        Raises
        ------
        OSError
            If the artifacts directory cannot be created.
        """
        self.experiment_id = experiment_id
        self.artifacts_dir = artifacts_dir
        self.started_at: datetime | None = None
        self.ended_at: datetime | None = None
        self._nvml_sampler: NVMLSampler | None = None
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

    def mark_start(self, device_index: int = 0) -> None:
        """Record the start timestamp (UTC) and start NVML sampling.

        A sampler that fails to construct or start is logged and dropped;
        the session continues without GPU metrics.
        """
        self.started_at = datetime.now(UTC)
        try:
            self._nvml_sampler = NVMLSampler(
                device_index=device_index,
                attrs={"experiment_id": self.experiment_id},
            )
            self._nvml_sampler.start()
        except Exception:
            logger.exception("Failed to start NVML sampler; continuing without")
            self._nvml_sampler = None

    def mark_end(self) -> None:
        """Record the end timestamp (UTC)."""
        self.ended_at = datetime.now(UTC)

    def finalize(
        self,
        *,
        derived_config: dict[str, Any] | None,
        recording_session_dir: Path | None = None,
    ) -> None:
        """Write every artifact the skill will ship to the tracker into
        ``self.artifacts_dir``.

        Each artifact is independent — a failure on one doesn't prevent
        the others from being written. The skill treats a missing file as
        "this artifact wasn't produced".

        Parameters
        ----------
        derived_config:
            Config snapshot written to ``derived_config.json``; skipped
            when ``None``.
        recording_session_dir:
            Directory holding ``init.mp4`` + ``chunk_*.m4s``; when given,
            they are concatenated into ``recording.mp4``.

        Note: the torch.profiler chrome trace + key_averages summary
        (``profile.pt.trace.json.gz`` / ``profile.summary.txt``) are
        written DIRECTLY into ``self.artifacts_dir`` by the model's
        worker process via
        :class:`reactor_runtime.profiling.ChunkRangeProfiler`. This
        method doesn't move them around; it just trusts that they're
        already in place by the time it runs.
        """
        # session_times.json is only meaningful when both marks were taken.
        if self.started_at and self.ended_at:
            self._write_json(
                "session_times.json",
                {
                    "start": self.started_at.isoformat(),
                    "end": self.ended_at.isoformat(),
                },
            )

        metrics_summary = self._stop_sampling_and_summarize()
        if metrics_summary is not None:
            self._write_json("metrics_summary.json", metrics_summary)

        if derived_config is not None:
            self._write_json("derived_config.json", derived_config)

        if recording_session_dir is not None:
            self._assemble_recording(recording_session_dir)

        logger.info(
            "Experiment artifacts finalized",
            experiment_id=self.experiment_id,
            artifacts_dir=str(self.artifacts_dir),
        )

    @staticmethod
    def _chunk_sort_key(path: Path) -> tuple[list[Any], str]:
        """Natural-order sort key for a chunk file.

        Digit runs in the file name compare as integers, so ``chunk_2``
        sorts before ``chunk_10``. Zero-padded names keep the order a plain
        lexicographic sort would give them. The raw name is the final
        tie-breaker so the order is fully deterministic.
        """
        import re  # local import: not part of the module's import block

        # re.split with a capture group alternates text / digits / text ...,
        # so odd positions are always digit runs and same-index elements of
        # two keys always have the same type.
        tokens = re.split(r"([0-9]+)", path.name)
        natural = [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]
        return natural, path.name

    def _assemble_recording(self, session_dir: Path) -> None:
        """Concatenate ``init.mp4 + chunk_*.m4s`` into ``recording.mp4``.

        fMP4 byte-concat is valid for players that accept fragmented MP4
        (Safari, Chrome, ffmpeg). Streams chunks through ``copyfileobj``
        rather than buffering in RAM since recordings can be 100 MB+.

        Chunks are appended in natural (numeric-aware) file-name order so
        unpadded indices do not interleave.

        Leading-black trimming used to live here as a post-hoc ffmpeg
        pass. Removed once the recorder learned to drop pre-roll
        duplicates at the source (REA-2323 / #2325) — the bytes never
        reach init.mp4 / chunk_*.m4s anymore.
        """
        init = session_dir / "init.mp4"
        if not init.exists():
            return  # Recording disabled or session was empty.

        dest = self.artifacts_dir / "recording.mp4"
        try:
            chunks = sorted(
                session_dir.glob("chunk_*.m4s"),
                key=self._chunk_sort_key,
            )
            with open(dest, "wb") as out:
                for part in (init, *chunks):
                    with open(part, "rb") as src:
                        shutil.copyfileobj(src, out)
        except OSError as err:
            logger.warning(
                "Failed to assemble recording.mp4",
                session_dir=str(session_dir),
                error=str(err),
            )

    def _stop_sampling_and_summarize(self) -> dict[str, Any] | None:
        """Stop the NVML sampler and roll its counters into a summary.

        Returns ``None`` when no sampler is active or when stopping it
        fails. The sampler reference is cleared first, so a second call
        returns ``None``.
        """
        sampler = self._nvml_sampler
        if sampler is None:
            return None
        self._nvml_sampler = None
        try:
            sampler.stop()
        except Exception:
            logger.exception("NVML sampler stop failed")
            return None
        state = sampler.state
        samples = int(state.get("samples", 0))
        # Guard the divisions: with zero samples the averages report 0.
        divisor = max(samples, 1)
        return {
            "samples": samples,
            "avg_vram_gb": round(float(state.get("vram_sum_gb", 0.0)) / divisor, 3),
            "peak_vram_gb": round(float(state.get("vram_peak_gb", 0.0)), 3),
            "avg_gpu_util_pct": round(
                float(state.get("gpu_util_sum_pct", 0.0)) / divisor, 2
            ),
        }

    def _write_json(self, name: str, payload: Any) -> None:
        """Serialize ``payload`` to ``<artifacts_dir>/<name>`` as JSON.

        Values ``json`` cannot encode natively are stringified through
        ``default=str``. Serialization errors (unsupported dict keys,
        circular references) and I/O errors are logged and swallowed so one
        bad artifact cannot block the artifacts written after it.
        """
        path = self.artifacts_dir / name
        try:
            path.write_text(
                json.dumps(payload, default=str, indent=2),
                encoding="utf-8",
            )
        except (OSError, TypeError, ValueError) as err:
            logger.warning(
                "Failed to write experiment artifact",
                name=name,
                error=str(err),
            )
196
+
197
+
198
def maybe_build_session() -> ExperimentSession | None:
    """Return an ``ExperimentSession`` iff the runtime was launched as a
    tracked experiment (``EXPERIMENT_ID`` set) AND this worker is rank0.

    Non-rank0 workers and untracked runs both get ``None`` — callers
    must treat that as "no tracking, behave as before".

    An unset or empty ``RANK`` counts as rank 0. A ``RANK`` that is not an
    integer disables tracking for this worker (with a warning) instead of
    raising, so an optional subsystem cannot crash runtime startup.
    """
    # Check the opt-in first: untracked runs never look at RANK at all.
    experiment_id = os.environ.get(EXPERIMENT_ENV_ID)
    if not experiment_id:
        return None

    # Only rank0 produces artifacts. Multiple ranks writing into the same
    # /tmp/experiment-<id>/ would race on the JSON files; the cheaper fix
    # is to keep this whole subsystem inert on non-rank0 workers.
    raw_rank = os.environ.get("RANK") or "0"
    try:
        rank = int(raw_rank)
    except ValueError:
        logger.warning(
            "RANK is not an integer; experiment tracking disabled",
            rank=raw_rank,
        )
        return None
    if rank != 0:
        return None

    artifacts_dir = Path(
        os.environ.get(EXPERIMENT_ENV_ARTIFACTS_DIR)
        or f"/tmp/experiment-{experiment_id}"
    )

    logger.info(
        "Experiment tracking enabled",
        experiment_id=experiment_id,
        artifacts_dir=str(artifacts_dir),
    )
    return ExperimentSession(
        experiment_id=experiment_id,
        artifacts_dir=artifacts_dir,
    )
@@ -76,9 +76,11 @@ from reactor_runtime.profiling.profiler import (
76
76
  from reactor_runtime.profiling.singleton import get_profiler, set_profiler
77
77
  from reactor_runtime.profiling.backends.base import ProfilerBackend
78
78
  from reactor_runtime.profiling.helpers import profile_fn
79
+ from reactor_runtime.profiling.torch_chunk_profiler import ChunkRangeProfiler
79
80
 
80
81
  __all__ = [
81
82
  "BucketPreset",
83
+ "ChunkRangeProfiler",
82
84
  "CudaTimingMode",
83
85
  "Profiler",
84
86
  "ProfilerSection",
@@ -0,0 +1,200 @@
1
+ # Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
2
+
3
+ """NVML resource sampler — general-purpose GPU observability.
4
+
5
+ Polls VRAM use + GPU utilization at 1Hz on a daemon thread and emits them
6
+ as OTLP gauges through the meter pipeline (set up by
7
+ ``reactor_machine_metrics``). The exporter ships them to Grafana via the
8
+ same path the rest of the runtime uses.
9
+
10
+ Activation is the caller's responsibility — typical pattern is
11
+ "start it when there's a session you care about, stop it when the
12
+ session ends". The sampler doesn't know about experiments or sessions;
13
+ it just samples and emits. Callers that want a snapshot-style summary
14
+ (e.g. for inclusion in a row update) can read ``.state`` at stop time
15
+ and aggregate however they like.
16
+
17
+ Cost: ~2 NVML calls per second on a side thread. The main thread and
18
+ CUDA streams are untouched.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import threading
24
+ from typing import Any
25
+
26
+ from opentelemetry import metrics
27
+
28
+ from reactor_runtime.utils.log import get_logger
29
+
30
+ logger = get_logger(__name__)
31
+
32
+
33
class NVMLSampler:
    """Background NVML poller emitting OTLP gauges + maintaining state.

    The sampler can be started again after ``stop()``; the counters exposed
    through ``.state`` are cumulative over the object's lifetime.

    Parameters
    ----------
    device_index:
        CUDA device to poll. Defaults to 0.
    interval_s:
        Poll interval in seconds. 1Hz default.
    attrs:
        Label dict applied to every emitted observation. Use this to
        attach context like ``{"experiment_id": "exp_..."}`` or
        ``{"model_name": "delta-forcing"}``. ``None`` → empty attrs
        (gauges still emit, just unlabeled beyond the global meter labels).
    meter_name:
        OTel meter name. Defaults to ``reactor.machine`` — keep this
        consistent with what your dashboards expect.
    metric_namespace:
        Prefix for emitted metrics. ``reactor.machine`` → emits
        ``reactor.machine.vram_used_gb`` + ``reactor.machine.gpu_util_pct``.
    """

    def __init__(
        self,
        device_index: int = 0,
        interval_s: float = 1.0,
        attrs: dict[str, str] | None = None,
        meter_name: str = "reactor.machine",
        metric_namespace: str = "reactor.machine",
    ) -> None:
        self._device_index = device_index
        self._interval_s = interval_s
        self._stop_evt = threading.Event()
        self._thread: threading.Thread | None = None

        meter = metrics.get_meter(meter_name)
        # Copy so later mutation of the caller's dict cannot change labels.
        self._attrs: dict[str, str] = dict(attrs or {})

        # In-memory state read by .state — used for snapshot-style
        # aggregation at session-stop (avg/peak summaries etc.).
        self._last_vram_gb: float = 0.0
        self._last_gpu_util_pct: float = 0.0
        self._sample_count: int = 0
        self._vram_sum_gb: float = 0.0
        self._gpu_util_sum_pct: float = 0.0
        self._vram_peak_gb: float = 0.0

        meter.create_observable_gauge(
            name=f"{metric_namespace}.vram_used_gb",
            callbacks=[self._observe_vram],
            description="VRAM used (GB), sampled at 1Hz",
            unit="GB",
        )
        meter.create_observable_gauge(
            name=f"{metric_namespace}.gpu_util_pct",
            callbacks=[self._observe_gpu_util],
            description="GPU utilization (%), sampled at 1Hz",
            unit="%",
        )

    # ------------------------------------------------------------------
    # OTel callbacks
    # ------------------------------------------------------------------

    def _observe_vram(self, _options):  # type: ignore[no-untyped-def]
        """Gauge callback: report the most recent VRAM reading."""
        return [metrics.Observation(self._last_vram_gb, self._attrs)]

    def _observe_gpu_util(self, _options):  # type: ignore[no-untyped-def]
        """Gauge callback: report the most recent GPU utilization reading."""
        return [metrics.Observation(self._last_gpu_util_pct, self._attrs)]

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start(self) -> None:
        """Start the sampling thread; no-op if one is already registered.

        Each run gets a fresh stop event. Reusing the event that ``stop()``
        set would make a restarted thread exit before taking any sample.
        """
        if self._thread is not None:
            return
        self._stop_evt = threading.Event()
        self._thread = threading.Thread(
            target=self._run,
            name=f"nvml-sampler-{self._device_index}",
            daemon=True,
        )
        self._thread.start()

    def stop(self) -> None:
        """Signal the sampling thread to exit and wait up to 2s for it."""
        self._stop_evt.set()
        thread = self._thread
        if thread is not None and thread.is_alive():
            thread.join(timeout=2.0)
        self._thread = None

    # ------------------------------------------------------------------
    # Sampling loop
    # ------------------------------------------------------------------

    def _run(self) -> None:
        """Thread body: init NVML, poll until stopped, then shut NVML down.

        Missing ``pynvml``, a failing ``nvmlInit`` or a failing handle
        lookup each log a warning and end the thread without sampling.
        """
        # Bind this run's event locally so a later start() — which installs
        # a new event — can never revive a thread that was told to stop.
        stop_evt = self._stop_evt

        try:
            import pynvml  # type: ignore
        except ImportError:
            logger.warning("pynvml not installed; NVML sampler disabled")
            return

        try:
            pynvml.nvmlInit()
        except Exception as err:
            logger.warning("nvmlInit failed", error=str(err))
            return

        # Once nvmlInit() succeeds we MUST pair it with nvmlShutdown(),
        # even if handle lookup below fails — otherwise NVML's library
        # state leaks for the lifetime of the process.
        try:
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(self._device_index)
            except Exception as err:
                logger.warning(
                    "nvmlDeviceGetHandleByIndex failed",
                    device_index=self._device_index,
                    error=str(err),
                )
                return

            while not stop_evt.is_set():
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    vram_gb = mem.used / 1e9
                    gpu_pct = float(util.gpu)
                    self._last_vram_gb = vram_gb
                    self._last_gpu_util_pct = gpu_pct
                    self._vram_sum_gb += vram_gb
                    self._gpu_util_sum_pct += gpu_pct
                    self._vram_peak_gb = max(self._vram_peak_gb, vram_gb)
                    # Count last so a concurrent reader never sees a count
                    # that is ahead of the sums.
                    self._sample_count += 1
                except Exception as err:
                    # A single failed poll is not fatal; try again next tick.
                    logger.debug("NVML sample failed", error=str(err))
                # Sleep on the stop-event so stop() unblocks immediately.
                stop_evt.wait(self._interval_s)
        finally:
            try:
                pynvml.nvmlShutdown()
            except Exception as err:
                logger.debug("nvmlShutdown failed", error=str(err))

    # ------------------------------------------------------------------
    # State accessor for snapshot aggregation
    # ------------------------------------------------------------------

    @property
    def state(self) -> dict[str, Any]:
        """Raw counter state — caller aggregates as they wish.

        Snapshot semantics: returns the current counters at call time. Safe
        to call from another thread; reads are atomic enough that the
        worst case is a sample number that doesn't quite match the sums
        (off by one). For the avg/peak rollups used by experiment-tracking
        that's fine.
        """
        return {
            "samples": self._sample_count,
            "vram_sum_gb": self._vram_sum_gb,
            "vram_peak_gb": self._vram_peak_gb,
            "gpu_util_sum_pct": self._gpu_util_sum_pct,
        }