reactor-runtime 2.7.4__tar.gz → 2.7.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/PKG-INFO +2 -1
  2. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/pyproject.toml +10 -1
  3. reactor_runtime-2.7.6/src/reactor_runtime/experiment/__init__.py +30 -0
  4. reactor_runtime-2.7.6/src/reactor_runtime/experiment/session.py +228 -0
  5. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/output_buffer.py +82 -9
  6. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/__init__.py +2 -0
  7. reactor_runtime-2.7.6/src/reactor_runtime/profiling/nvml_sampler.py +200 -0
  8. reactor_runtime-2.7.6/src/reactor_runtime/profiling/torch_chunk_profiler.py +399 -0
  9. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/chunk_encoder.py +88 -11
  10. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/config.py +18 -0
  11. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/markers.py +10 -0
  12. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/http/http_runtime.py +83 -0
  13. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/PKG-INFO +2 -1
  14. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/SOURCES.txt +4 -0
  15. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/requires.txt +1 -0
  16. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/README.md +0 -0
  17. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/setup.cfg +0 -0
  18. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/api/__init__.py +0 -0
  19. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/__init__.py +0 -0
  20. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/config.py +0 -0
  21. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/__init__.py +0 -0
  22. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/defaults.py +0 -0
  23. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/driver/__init__.py +0 -0
  24. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/driver/pipeline_executor.py +0 -0
  25. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/driver/step_result.py +0 -0
  26. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/__init__.py +0 -0
  27. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/connected.py +0 -0
  28. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/event.py +0 -0
  29. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/messages.py +0 -0
  30. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/events/upload.py +0 -0
  31. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/__init__.py +0 -0
  32. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/input_buffer.py +0 -0
  33. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/internal/reactor_core.py +0 -0
  34. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/__init__.py +0 -0
  35. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/decorators.py +0 -0
  36. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/handlers.py +0 -0
  37. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/model/reactor_model.py +0 -0
  38. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/__init__.py +0 -0
  39. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/idle.py +0 -0
  40. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/input_state.py +0 -0
  41. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/pipeline/reactor_pipeline.py +0 -0
  42. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/__init__.py +0 -0
  43. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/descriptors.py +0 -0
  44. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/input.py +0 -0
  45. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/tracks/output.py +0 -0
  46. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/interface/upload.py +0 -0
  47. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/model_state.py +0 -0
  48. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/__init__.py +0 -0
  49. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/base.py +0 -0
  50. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/file.py +0 -0
  51. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/backends/otlp.py +0 -0
  52. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/helpers.py +0 -0
  53. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/plotting/__init__.py +0 -0
  54. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/plotting/plot_profiling.py +0 -0
  55. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/profiler.py +0 -0
  56. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/profiling/singleton.py +0 -0
  57. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/__init__.py +0 -0
  58. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/chunk_uploader.py +0 -0
  59. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/session_recorder.py +0 -0
  60. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/sinks.py +0 -0
  61. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/recording/track_resolver.py +0 -0
  62. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtime_api.py +0 -0
  63. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/headless/config.py +0 -0
  64. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/headless/headless_runtime.py +0 -0
  65. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/headless/input_feeder.py +0 -0
  66. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/http/config.py +0 -0
  67. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/runtimes/http/types.py +0 -0
  68. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/schema.py +0 -0
  69. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/schema_validator.py +0 -0
  70. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/__init__.py +0 -0
  71. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/__main__.py +0 -0
  72. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/commands/__init__.py +0 -0
  73. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/commands/run.py +0 -0
  74. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/commands/schema.py +0 -0
  75. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/main.py +0 -0
  76. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/utils/__init__.py +0 -0
  77. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/utils/config.py +0 -0
  78. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/serve/utils/runtime.py +0 -0
  79. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/__init__.py +0 -0
  80. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/__init__.py +0 -0
  81. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/audio_track.py +0 -0
  82. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/client.py +0 -0
  83. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/frame_conversion.py +0 -0
  84. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/ice_connection.py +0 -0
  85. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/aiortc/video_track.py +0 -0
  86. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/config.py +0 -0
  87. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/events.py +0 -0
  88. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/__init__.py +0 -0
  89. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/client.py +0 -0
  90. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/__init__.py +0 -0
  91. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/av1.py +0 -0
  92. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/base.py +0 -0
  93. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/factory.py +0 -0
  94. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/h264.py +0 -0
  95. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/h265.py +0 -0
  96. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/opus.py +0 -0
  97. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/vp8.py +0 -0
  98. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/decoders/vp9.py +0 -0
  99. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/__init__.py +0 -0
  100. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/av1.py +0 -0
  101. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/base.py +0 -0
  102. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/factory.py +0 -0
  103. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/h264.py +0 -0
  104. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/h265.py +0 -0
  105. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/opus.py +0 -0
  106. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/vp8.py +0 -0
  107. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/encoders/vp9.py +0 -0
  108. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/gst.py +0 -0
  109. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/gst_helpers.py +0 -0
  110. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/probes/__init__.py +0 -0
  111. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/probes/fps_probe.py +0 -0
  112. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/__init__.py +0 -0
  113. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/audio.py +0 -0
  114. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/base.py +0 -0
  115. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/receiver/video.py +0 -0
  116. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/__init__.py +0 -0
  117. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/bundle.py +0 -0
  118. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/codec.py +0 -0
  119. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/extmap.py +0 -0
  120. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sdp/ice.py +0 -0
  121. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/__init__.py +0 -0
  122. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/audio.py +0 -0
  123. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/base.py +0 -0
  124. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/sender/video.py +0 -0
  125. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/settings.py +0 -0
  126. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/gstreamer/signals.py +0 -0
  127. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/ice_uris.py +0 -0
  128. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/interface.py +0 -0
  129. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/media.py +0 -0
  130. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/transports/types.py +0 -0
  131. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/launch.py +0 -0
  132. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/loader.py +0 -0
  133. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/log.py +0 -0
  134. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/messages.py +0 -0
  135. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/paths.py +0 -0
  136. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/ports.py +0 -0
  137. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime/utils/typing.py +0 -0
  138. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/dependency_links.txt +0 -0
  139. {reactor_runtime-2.7.4 → reactor_runtime-2.7.6}/src/reactor_runtime.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: reactor_runtime
3
- Version: 2.7.4
3
+ Version: 2.7.6
4
4
  Summary: Reactor runtime with public model API
5
5
  Author-email: Reactor <team@reactor.inc>
6
6
  Requires-Python: >=3.9
@@ -23,6 +23,7 @@ Requires-Dist: opentelemetry-exporter-prometheus~=0.63b0
23
23
  Requires-Dist: grpcio>=1.80.0
24
24
  Requires-Dist: grpcio-health-checking>=1.80.0
25
25
  Requires-Dist: opentelemetry-instrumentation-grpc~=0.63b0
26
+ Requires-Dist: pynvml>=11.5
26
27
  Provides-Extra: gst
27
28
  Requires-Dist: PyGObject>=3.56.0; extra == "gst"
28
29
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "reactor_runtime"
7
- version = "2.7.4"
7
+ version = "2.7.6"
8
8
  description = "Reactor runtime with public model API"
9
9
  authors = [
10
10
  { name = "Reactor", email = "team@reactor.inc" }
@@ -34,6 +34,15 @@ dependencies = [
34
34
  "grpcio>=1.80.0",
35
35
  "grpcio-health-checking>=1.80.0",
36
36
  "opentelemetry-instrumentation-grpc~=0.63b0",
37
+ # pynvml is the experiment-tracking NVML sampler's runtime dep. Kept as a
38
+ # default rather than behind an `experiment` extra because production model
39
+ # images install reactor-runtime via the `[gst]` extra path — putting pynvml
40
+ # behind a separate extra meant tracked runs in those images had no VRAM /
41
+ # GPU-util metrics at all (the sampler hit the ImportError path silently).
42
+ # pynvml is pure Python and has no system-level CUDA dep at install time,
43
+ # so it's safe to include in the base install for non-GPU envs too — the
44
+ # sampler still degrades gracefully when nvmlInit fails at runtime.
45
+ "pynvml>=11.5",
37
46
  ]
38
47
 
39
48
  [project.optional-dependencies]
@@ -0,0 +1,30 @@
1
+ # Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
2
+
3
+ """Experiment-tracking integration for the Reactor runtime.
4
+
5
+ Activates only when ``EXPERIMENT_ID`` is set in the environment. When unset,
6
+ ``maybe_build_session()`` returns ``None`` and the runtime takes the same
7
+ code paths it always has — no overhead, no NVML sampling, no profiler.
8
+
9
+ The runtime's only role is to produce artifacts (recording, metrics
10
+ summary, profile trace, derived config, session times) under a single
11
+ directory. The skill side (``iterate-model.sh``) reads that directory
12
+ post-run, uploads to S3 via presigned URLs, and POSTs ``/update_experiment``.
13
+ This keeps the runtime free of HTTP, boto3, and presigned-URL env vars.
14
+
15
+ See ``internal/experiment_tracking/`` for the backing API.
16
+ """
17
+
18
+ from reactor_runtime.experiment.session import (
19
+ EXPERIMENT_ENV_ARTIFACTS_DIR,
20
+ EXPERIMENT_ENV_ID,
21
+ ExperimentSession,
22
+ maybe_build_session,
23
+ )
24
+
25
+ __all__ = [
26
+ "EXPERIMENT_ENV_ARTIFACTS_DIR",
27
+ "EXPERIMENT_ENV_ID",
28
+ "ExperimentSession",
29
+ "maybe_build_session",
30
+ ]
@@ -0,0 +1,228 @@
1
+ # Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
2
+
3
+ """Container-side experiment session — passive artifact producer.
4
+
5
+ When ``EXPERIMENT_ID`` is set in the env on rank0, the runtime drops the
6
+ following files into ``/tmp/experiment-<id>/`` at session stop:
7
+
8
+ - ``session_times.json`` start + end ISO timestamps (UTC)
9
+ - ``metrics_summary.json`` NVML aggregate (avg/peak VRAM, GPU util %)
10
+ - ``derived_config.json`` model._config snapshot (or runtime merged
11
+ config dict if the model didn't expose one)
12
+ - ``profile.pt.trace.json.gz`` torch.profiler chrome trace, only if
13
+ EXPERIMENT_PROFILE_RANGE is set (see
14
+ reactor_runtime.profiling.ChunkRangeProfiler)
15
+ - ``recording.mp4`` fMP4 init.mp4 + chunk_*.m4s byte-concatenated
16
+ into a single playable file, if the runtime
17
+ had recording enabled and the session
18
+ produced chunks
19
+
20
+ That's it — no HTTP, no boto3, no S3. The script (``iterate-model.sh``
21
+ on the developer's machine) reads the artifacts dir over SSH, tars +
22
+ gzips it, POSTs it to ``/experiments/<id>/finalize``. The service
23
+ extracts the JSONs into RDS and routes the binary blobs (profile +
24
+ recording) into S3 using its own task role — the developer never needs
25
+ S3 PUT credentials.
26
+
27
+ Non-rank0 workers return ``None`` from :func:`maybe_build_session` and
28
+ take the runtime's normal no-op path — no duplicate finalize calls.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import json
34
+ import os
35
+ import shutil
36
+ from datetime import datetime, timezone
37
+ from pathlib import Path
38
+ from typing import Any
39
+
40
+ from reactor_runtime.profiling.nvml_sampler import NVMLSampler
41
+ from reactor_runtime.utils.log import get_logger
42
+
43
+ # datetime.UTC is a 3.11+ alias for timezone.utc. CI lint runs mypy with
44
+ # --python-version 3.10, so import timezone.utc and alias it ourselves.
45
+ UTC = timezone.utc
46
+
47
+ logger = get_logger(__name__)
48
+
49
+
50
+ EXPERIMENT_ENV_ID = "EXPERIMENT_ID"
51
+ # Override the default artifacts dir. Defaults to /tmp/experiment-<id>/.
52
+ EXPERIMENT_ENV_ARTIFACTS_DIR = "EXPERIMENT_ARTIFACTS_DIR"
53
+
54
+
55
+ class ExperimentSession:
56
+ """One in-flight experiment session.
57
+
58
+ Owns the artifacts directory and the session-start/end timestamps. All
59
+ network egress is the caller's responsibility (``iterate-model.sh``).
60
+ """
61
+
62
+ def __init__(
63
+ self,
64
+ experiment_id: str,
65
+ artifacts_dir: Path,
66
+ ) -> None:
67
+ self.experiment_id = experiment_id
68
+ self.artifacts_dir = artifacts_dir
69
+ self.started_at: datetime | None = None
70
+ self.ended_at: datetime | None = None
71
+ self._nvml_sampler: NVMLSampler | None = None
72
+ self.artifacts_dir.mkdir(parents=True, exist_ok=True)
73
+
74
+ def mark_start(self, device_index: int = 0) -> None:
75
+ self.started_at = datetime.now(UTC)
76
+ try:
77
+ self._nvml_sampler = NVMLSampler(
78
+ device_index=device_index,
79
+ attrs={"experiment_id": self.experiment_id},
80
+ )
81
+ self._nvml_sampler.start()
82
+ except Exception:
83
+ logger.exception("Failed to start NVML sampler; continuing without")
84
+ self._nvml_sampler = None
85
+
86
+ def mark_end(self) -> None:
87
+ self.ended_at = datetime.now(UTC)
88
+
89
+ def finalize(
90
+ self,
91
+ *,
92
+ derived_config: dict[str, Any] | None,
93
+ recording_session_dir: Path | None = None,
94
+ ) -> None:
95
+ """Write every artifact the skill will ship to the tracker into
96
+ ``self.artifacts_dir``.
97
+
98
+ Each artifact is independent — a failure on one doesn't prevent
99
+ the others from being written. The skill treats a missing file as
100
+ "this artifact wasn't produced".
101
+
102
+ Note: the torch.profiler chrome trace + key_averages summary
103
+ (``profile.pt.trace.json.gz`` / ``profile.summary.txt``) are
104
+ written DIRECTLY into ``self.artifacts_dir`` by the model's
105
+ worker process via
106
+ :class:`reactor_runtime.profiling.ChunkRangeProfiler`. This
107
+ method doesn't move them around; it just trusts that they're
108
+ already in place by the time it runs.
109
+ """
110
+ if self.started_at and self.ended_at:
111
+ self._write_json(
112
+ "session_times.json",
113
+ {
114
+ "start": self.started_at.isoformat(),
115
+ "end": self.ended_at.isoformat(),
116
+ },
117
+ )
118
+
119
+ metrics_summary = self._stop_sampling_and_summarize()
120
+ if metrics_summary is not None:
121
+ self._write_json("metrics_summary.json", metrics_summary)
122
+
123
+ if derived_config is not None:
124
+ self._write_json("derived_config.json", derived_config)
125
+
126
+ if recording_session_dir is not None:
127
+ self._assemble_recording(recording_session_dir)
128
+
129
+ logger.info(
130
+ "Experiment artifacts finalized",
131
+ experiment_id=self.experiment_id,
132
+ artifacts_dir=str(self.artifacts_dir),
133
+ )
134
+
135
+ def _assemble_recording(self, session_dir: Path) -> None:
136
+ """Concatenate ``init.mp4 + chunk_*.m4s`` into ``recording.mp4``.
137
+
138
+ fMP4 byte-concat is valid for players that accept fragmented MP4
139
+ (Safari, Chrome, ffmpeg). Streams chunks through ``copyfileobj``
140
+ rather than buffering in RAM since recordings can be 100 MB+.
141
+
142
+ Leading-black trimming used to live here as a post-hoc ffmpeg
143
+ pass. Removed once the recorder learned to drop pre-roll
144
+ duplicates at the source (REA-2323 / #2325) — the bytes never
145
+ reach init.mp4 / chunk_*.m4s anymore.
146
+ """
147
+ init = session_dir / "init.mp4"
148
+ if not init.exists():
149
+ return # Recording disabled or session was empty.
150
+
151
+ chunks = sorted(session_dir.glob("chunk_*.m4s"))
152
+ dest = self.artifacts_dir / "recording.mp4"
153
+ try:
154
+ with open(dest, "wb") as out:
155
+ with open(init, "rb") as src:
156
+ shutil.copyfileobj(src, out)
157
+ for chunk in chunks:
158
+ with open(chunk, "rb") as src:
159
+ shutil.copyfileobj(src, out)
160
+ except OSError as err:
161
+ logger.warning(
162
+ "Failed to assemble recording.mp4",
163
+ session_dir=str(session_dir),
164
+ error=str(err),
165
+ )
166
+
167
+ def _stop_sampling_and_summarize(self) -> dict[str, Any] | None:
168
+ sampler = self._nvml_sampler
169
+ if sampler is None:
170
+ return None
171
+ self._nvml_sampler = None
172
+ try:
173
+ sampler.stop()
174
+ except Exception:
175
+ logger.exception("NVML sampler stop failed")
176
+ return None
177
+ state = sampler.state
178
+ n = max(int(state.get("samples", 0)), 1)
179
+ return {
180
+ "samples": int(state.get("samples", 0)),
181
+ "avg_vram_gb": round(float(state.get("vram_sum_gb", 0.0)) / n, 3),
182
+ "peak_vram_gb": round(float(state.get("vram_peak_gb", 0.0)), 3),
183
+ "avg_gpu_util_pct": round(float(state.get("gpu_util_sum_pct", 0.0)) / n, 2),
184
+ }
185
+
186
+ def _write_json(self, name: str, payload: Any) -> None:
187
+ path = self.artifacts_dir / name
188
+ try:
189
+ path.write_text(json.dumps(payload, default=str, indent=2))
190
+ except OSError as err:
191
+ logger.warning(
192
+ "Failed to write experiment artifact",
193
+ name=name,
194
+ error=str(err),
195
+ )
196
+
197
+
198
+ def maybe_build_session() -> ExperimentSession | None:
199
+ """Return an ``ExperimentSession`` iff the runtime was launched as a
200
+ tracked experiment (``EXPERIMENT_ID`` set) AND this worker is rank0.
201
+
202
+ Non-rank0 workers and untracked runs both get ``None`` — callers
203
+ must treat that as "no tracking, behave as before".
204
+ """
205
+ # Only rank0 produces artifacts. Multiple ranks writing into the same
206
+ # /tmp/experiment-<id>/ would race on the JSON files; the cheaper fix
207
+ # is to keep this whole subsystem inert on non-rank0 workers.
208
+ if int(os.environ.get("RANK", "0")) != 0:
209
+ return None
210
+
211
+ experiment_id = os.environ.get(EXPERIMENT_ENV_ID)
212
+ if not experiment_id:
213
+ return None
214
+
215
+ artifacts_dir = Path(
216
+ os.environ.get(EXPERIMENT_ENV_ARTIFACTS_DIR)
217
+ or f"/tmp/experiment-{experiment_id}"
218
+ )
219
+
220
+ logger.info(
221
+ "Experiment tracking enabled",
222
+ experiment_id=experiment_id,
223
+ artifacts_dir=str(artifacts_dir),
224
+ )
225
+ return ExperimentSession(
226
+ experiment_id=experiment_id,
227
+ artifacts_dir=artifacts_dir,
228
+ )
@@ -102,6 +102,34 @@ def split_batch(bundle: MediaBundle) -> List[MediaBundle]:
102
102
  return result
103
103
 
104
104
 
105
+ class _FlushMarker:
106
+ """Sentinel placed in :class:`OutputBuffer`'s queue by :meth:`flush`.
107
+
108
+ Carries no payload. When the emission loop dequeues an instance, it
109
+ resets ``_last_emitted`` *in-thread* and falls through to this
110
+ tick's empty-queue fallback (which dispatches a fresh
111
+ ``_create_black_bundle()`` with ``duplicate=True``).
112
+
113
+ The point of routing the session-boundary reset through the queue
114
+ rather than writing ``_last_emitted = None`` directly from
115
+ :meth:`flush` is to make the reset atomic with respect to the
116
+ emission thread: only the emission thread reads or writes
117
+ ``_last_emitted``, so the loop can never observe a half-applied
118
+ "queue drained but cached frame not yet cleared" state. Down-stream
119
+ callbacks already drop ``duplicate=True`` bundles (the wire
120
+ callback in ``_send_out_app_bundle_sync`` short-circuits, and the
121
+ :class:`~reactor_runtime.recording.session_recorder.SessionRecorder`
122
+ skips them when ``skip_leading_black`` is on and
123
+ ``recording_started`` is still False), so no stale frame would
124
+ reach the client or recording even without this guarantee — but
125
+ closing the race architecturally is cheaper than re-deriving that
126
+ safety argument for every callback added in the future.
127
+ """
128
+
129
+
130
+ _FLUSH_MARKER: _FlushMarker = _FlushMarker()
131
+
132
+
105
133
  class OutputBuffer:
106
134
  """Rate-controlled emission buffer.
107
135
 
@@ -127,7 +155,11 @@ class OutputBuffer:
127
155
  self._callbacks: List[Callable[[MediaBundle, bool], None]] = []
128
156
  self._callbacks_lock: threading.Lock = threading.Lock()
129
157
 
130
- self._q: queue.Queue[MediaBundle] = queue.Queue(maxsize=queue_depth)
158
+ # Queue items are normally MediaBundle, but flush() also puts a
159
+ # _FlushMarker sentinel here. We type the queue as ``object`` so
160
+ # both shapes are valid; the emission loop discriminates with
161
+ # ``isinstance``.
162
+ self._q: queue.Queue[object] = queue.Queue(maxsize=queue_depth)
131
163
 
132
164
  # FPS control — store both rate and period to avoid 1/fps on every tick
133
165
  self._fixed_fps: float = 0.0
@@ -322,11 +354,24 @@ class OutputBuffer:
322
354
  while not self._emission_stop.is_set():
323
355
  interval = self._interval
324
356
 
325
- bundle: Optional[MediaBundle] = None
357
+ item: object
326
358
  try:
327
- bundle = self._q.get_nowait()
359
+ item = self._q.get_nowait()
328
360
  except queue.Empty:
329
- pass
361
+ item = None
362
+
363
+ bundle: Optional[MediaBundle] = None
364
+ if isinstance(item, _FlushMarker):
365
+ # Session-boundary reset, processed in the emission
366
+ # thread so there is no cross-thread race with
367
+ # flush(): clear the cached frame and fall through
368
+ # to the empty-queue fallback below, which
369
+ # dispatches duplicate=True black. mark_first_real_frame()
370
+ # is gated on duplicate=False, so the recorder's
371
+ # latch (REA-2323 / #2325) stays unset.
372
+ self._last_emitted = None
373
+ elif isinstance(item, MediaBundle):
374
+ bundle = item
330
375
 
331
376
  if bundle is not None:
332
377
  vtracks = bundle.get_tracks_by_kind(TrackKind.VIDEO)
@@ -464,11 +509,39 @@ class OutputBuffer:
464
509
  self._drain_queue()
465
510
 
466
511
  def flush(self) -> None:
467
- """Empty the queue and insert a black frame."""
512
+ """Drop pending bundles and request a session-boundary reset.
513
+
514
+ The reset of ``_last_emitted`` is performed by the emission
515
+ thread when it dequeues the :class:`_FlushMarker` sentinel,
516
+ making the operation race-free with the per-tick
517
+ "what do I emit?" decision in :meth:`_emission_loop`. After the
518
+ sentinel is consumed, the next emission tick synthesises
519
+ ``_create_black_bundle()`` with ``duplicate=True`` — the
520
+ correct pre-roll behaviour for a session boundary:
521
+
522
+ * the wire callback (``_send_out_app_bundle_sync``) drops
523
+ ``duplicate=True`` outright;
524
+ * the recorder's ``mark_first_real_frame()`` latch is not
525
+ tripped (REA-2323 / #2325), so the recording timeline only
526
+ starts at the next real model frame.
527
+
528
+ Safe to call from any thread; in practice ``flush()`` is
529
+ called from the model thread (``model-run``), the same thread
530
+ that calls :meth:`submit`, so there is no submit/flush race
531
+ on the producer side either.
532
+ """
468
533
  self._drain_queue()
469
- self._last_emitted = None
470
- black = self._create_black_bundle()
471
534
  try:
472
- self._q.put_nowait(black)
535
+ self._q.put_nowait(_FLUSH_MARKER)
473
536
  except queue.Full:
474
- pass
537
+ # Unreachable in practice: the queue is bounded and the
538
+ # only producer (submit) runs on the same thread as
539
+ # flush, so nothing can refill the queue between drain
540
+ # and put. Log loudly if it ever happens and continue —
541
+ # the next real frame from the new session will set
542
+ # _last_emitted correctly even if the sentinel never
543
+ # lands.
544
+ logger.warning(
545
+ "OutputBuffer.flush: queue full immediately after drain; "
546
+ "reset sentinel dropped"
547
+ )
@@ -76,9 +76,11 @@ from reactor_runtime.profiling.profiler import (
76
76
  from reactor_runtime.profiling.singleton import get_profiler, set_profiler
77
77
  from reactor_runtime.profiling.backends.base import ProfilerBackend
78
78
  from reactor_runtime.profiling.helpers import profile_fn
79
+ from reactor_runtime.profiling.torch_chunk_profiler import ChunkRangeProfiler
79
80
 
80
81
  __all__ = [
81
82
  "BucketPreset",
83
+ "ChunkRangeProfiler",
82
84
  "CudaTimingMode",
83
85
  "Profiler",
84
86
  "ProfilerSection",
@@ -0,0 +1,200 @@
1
+ # Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
2
+
3
+ """NVML resource sampler — general-purpose GPU observability.
4
+
5
+ Polls VRAM use + GPU utilization at 1Hz on a daemon thread and emits them
6
+ as OTLP gauges through the meter pipeline (set up by
7
+ ``reactor_machine_metrics``). The exporter ships them to Grafana via the
8
+ same path the rest of the runtime uses.
9
+
10
+ Activation is the caller's responsibility — typical pattern is
11
+ "start it when there's a session you care about, stop it when the
12
+ session ends". The sampler doesn't know about experiments or sessions;
13
+ it just samples and emits. Callers that want a snapshot-style summary
14
+ (e.g. for inclusion in a row update) can read ``.state`` at stop time
15
+ and aggregate however they like.
16
+
17
+ Cost: ~2 NVML calls per second on a side thread. The main thread and
18
+ CUDA streams are untouched.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import threading
24
+ from typing import Any
25
+
26
+ from opentelemetry import metrics
27
+
28
+ from reactor_runtime.utils.log import get_logger
29
+
30
+ logger = get_logger(__name__)
31
+
32
+
33
class NVMLSampler:
    """Background NVML poller emitting OTLP gauges + maintaining state.

    Parameters
    ----------
    device_index:
        CUDA device to poll. Defaults to 0.
    interval_s:
        Poll interval in seconds. 1Hz default.
    attrs:
        Label dict applied to every emitted observation. Use this to
        attach context like ``{"experiment_id": "exp_..."}`` or
        ``{"model_name": "delta-forcing"}``. ``None`` → empty attrs
        (gauges still emit, just unlabeled beyond the global meter labels).
    meter_name:
        OTel meter name. Defaults to ``reactor.machine`` — keep this
        consistent with what your dashboards expect.
    metric_namespace:
        Prefix for emitted metrics. ``reactor.machine`` → emits
        ``reactor.machine.vram_used_gb`` + ``reactor.machine.gpu_util_pct``.

    Notes
    -----
    The sampler is restartable: ``start()`` after ``stop()`` clears the
    stop flag and spawns a fresh polling thread. The counters exposed by
    ``state`` are not reset by a restart; they keep accumulating.
    """

    def __init__(
        self,
        device_index: int = 0,
        interval_s: float = 1.0,
        attrs: dict[str, str] | None = None,
        meter_name: str = "reactor.machine",
        metric_namespace: str = "reactor.machine",
    ) -> None:
        self._device_index = device_index
        self._interval_s = interval_s
        self._stop_evt = threading.Event()
        self._thread: threading.Thread | None = None

        meter = metrics.get_meter(meter_name)
        # Copy so later mutation of the caller's dict cannot change labels.
        self._attrs: dict[str, str] = dict(attrs or {})

        # In-memory state read by .state — used for snapshot-style
        # aggregation at session-stop (avg/peak summaries etc.).
        self._last_vram_gb: float = 0.0
        self._last_gpu_util_pct: float = 0.0
        self._sample_count: int = 0
        self._vram_sum_gb: float = 0.0
        self._gpu_util_sum_pct: float = 0.0
        self._vram_peak_gb: float = 0.0

        # Observable gauges: the callbacks report the most recent sample
        # stored by the polling thread rather than querying NVML directly.
        meter.create_observable_gauge(
            name=f"{metric_namespace}.vram_used_gb",
            callbacks=[self._observe_vram],
            description="VRAM used (GB), sampled at 1Hz",
            unit="GB",
        )
        meter.create_observable_gauge(
            name=f"{metric_namespace}.gpu_util_pct",
            callbacks=[self._observe_gpu_util],
            description="GPU utilization (%), sampled at 1Hz",
            unit="%",
        )

    # ------------------------------------------------------------------
    # OTel callbacks
    # ------------------------------------------------------------------

    def _observe_vram(self, _options):  # type: ignore[no-untyped-def]
        """Gauge callback: report the last sampled VRAM use with the labels."""
        from opentelemetry.metrics import Observation

        return [Observation(self._last_vram_gb, self._attrs)]

    def _observe_gpu_util(self, _options):  # type: ignore[no-untyped-def]
        """Gauge callback: report the last sampled GPU utilization."""
        from opentelemetry.metrics import Observation

        return [Observation(self._last_gpu_util_pct, self._attrs)]

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start(self) -> None:
        """Spawn the daemon polling thread; no-op if one is registered."""
        if self._thread is not None:
            return
        # stop() leaves the event set. Clear it here so that a start()
        # following a stop() actually samples instead of spawning a
        # thread whose loop exits on its very first check.
        self._stop_evt.clear()
        self._thread = threading.Thread(
            target=self._run,
            name=f"nvml-sampler-{self._device_index}",
            daemon=True,
        )
        self._thread.start()

    def stop(self) -> None:
        """Signal the polling thread to exit and wait up to 2s for it."""
        self._stop_evt.set()
        thread = self._thread
        if thread is not None and thread.is_alive():
            thread.join(timeout=2.0)
        self._thread = None

    # ------------------------------------------------------------------
    # Sampling loop
    # ------------------------------------------------------------------

    def _run(self) -> None:
        """Thread body: initialise NVML, poll until stopped, shut down."""
        try:
            import pynvml  # type: ignore
        except ImportError:
            logger.warning("pynvml not installed; NVML sampler disabled")
            return

        try:
            pynvml.nvmlInit()
        except Exception as err:
            logger.warning("nvmlInit failed", error=str(err))
            return

        # Once nvmlInit() succeeds we MUST pair it with nvmlShutdown(),
        # even if handle lookup below fails — otherwise NVML's library
        # state leaks for the lifetime of the process.
        try:
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(self._device_index)
            except Exception as err:
                logger.warning(
                    "nvmlDeviceGetHandleByIndex failed",
                    device_index=self._device_index,
                    error=str(err),
                )
                return

            while not self._stop_evt.is_set():
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    vram_gb = mem.used / 1e9
                    gpu_pct = float(util.gpu)
                    self._last_vram_gb = vram_gb
                    self._last_gpu_util_pct = gpu_pct
                    self._vram_sum_gb += vram_gb
                    self._gpu_util_sum_pct += gpu_pct
                    self._vram_peak_gb = max(self._vram_peak_gb, vram_gb)
                    self._sample_count += 1
                except Exception as err:
                    # A failed sample is skipped; the loop keeps polling.
                    logger.debug("NVML sample failed", error=str(err))
                # Sleep on the stop-event so stop() unblocks immediately.
                self._stop_evt.wait(self._interval_s)
        finally:
            try:
                pynvml.nvmlShutdown()
            except Exception as err:
                logger.debug("nvmlShutdown failed", error=str(err))

    # ------------------------------------------------------------------
    # State accessor for snapshot aggregation
    # ------------------------------------------------------------------

    @property
    def state(self) -> dict[str, Any]:
        """Raw counter state — caller aggregates as they wish.

        Snapshot semantics: returns the current counters at call time. No
        lock is taken, so a read from another thread may see a sample
        count that doesn't quite match the sums (off by one). For the
        avg/peak rollups used by experiment-tracking that's fine.

        Returns
        -------
        dict
            ``samples`` (number of successful polls), ``vram_sum_gb``,
            ``vram_peak_gb`` and ``gpu_util_sum_pct``.
        """
        return {
            "samples": self._sample_count,
            "vram_sum_gb": self._vram_sum_gb,
            "vram_peak_gb": self._vram_peak_gb,
            "gpu_util_sum_pct": self._gpu_util_sum_pct,
        }