PyPI - reactor-runtime - Versions diffs - 2.7.4__tar.gz → 2.7.5__tar.gz - Mend

reactor-runtime 2.7.4__tar.gz → 2.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139) hide show

{reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: reactor_runtime
-Version: 2.7.4
+Version: 2.7.5
 Summary: Reactor runtime with public model API
 Author-email: Reactor <team@reactor.inc>
 Requires-Python: >=3.9
@@ -23,6 +23,7 @@ Requires-Dist: opentelemetry-exporter-prometheus~=0.63b0
 Requires-Dist: grpcio>=1.80.0
 Requires-Dist: grpcio-health-checking>=1.80.0
 Requires-Dist: opentelemetry-instrumentation-grpc~=0.63b0
+Requires-Dist: pynvml>=11.5
 Provides-Extra: gst
 Requires-Dist: PyGObject>=3.56.0; extra == "gst"

{reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "reactor_runtime"
-version = "2.7.4"
+version = "2.7.5"
 description = "Reactor runtime with public model API"
 authors = [
 	{ name = "Reactor", email = "team@reactor.inc" }
@@ -34,6 +34,15 @@ dependencies = [
     "grpcio>=1.80.0",
     "grpcio-health-checking>=1.80.0",
     "opentelemetry-instrumentation-grpc~=0.63b0",
+    # pynvml is the experiment-tracking NVML sampler's runtime dep. Kept as a
+    # default rather than behind an `experiment` extra because production model
+    # images install reactor-runtime via the `[gst]` extra path — putting pynvml
+    # behind a separate extra meant tracked runs in those images had no VRAM /
+    # GPU-util metrics at all (the sampler hit the ImportError path silently).
+    # pynvml is pure Python and has no system-level CUDA dep at install time,
+    # so it's safe to include in the base install for non-GPU envs too — the
+    # sampler still degrades gracefully when nvmlInit fails at runtime.
+    "pynvml>=11.5",
 ]
 [project.optional-dependencies]

reactor_runtime-2.7.5/src/reactor_runtime/experiment/__init__.py ADDED Viewed

@@ -0,0 +1,30 @@
+# Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
+"""Experiment-tracking integration for the Reactor runtime.
+Activates only when ``EXPERIMENT_ID`` is set in the environment. When unset,
+``maybe_build_session()`` returns ``None`` and the runtime takes the same
+code paths it always has — no overhead, no NVML sampling, no profiler.
+The runtime's only role is to produce artifacts (recording, metrics
+summary, profile trace, derived config, session times) under a single
+directory. The skill side (``iterate-model.sh``) reads that directory
+post-run, uploads to S3 via presigned URLs, and POSTs ``/update_experiment``.
+This keeps the runtime free of HTTP, boto3, and presigned-URL env vars.
+See ``internal/experiment_tracking/`` for the backing API.
+"""
+from reactor_runtime.experiment.session import (
+    EXPERIMENT_ENV_ARTIFACTS_DIR,
+    EXPERIMENT_ENV_ID,
+    ExperimentSession,
+    maybe_build_session,
+)
+# Public surface of the ``experiment`` package: exactly the names imported
+# from ``reactor_runtime.experiment.session`` above.
+__all__ = [
+    "EXPERIMENT_ENV_ARTIFACTS_DIR",
+    "EXPERIMENT_ENV_ID",
+    "ExperimentSession",
+    "maybe_build_session",
+]

reactor_runtime-2.7.5/src/reactor_runtime/experiment/session.py ADDED Viewed

@@ -0,0 +1,228 @@
+# Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
+"""Container-side experiment session — passive artifact producer.
+When ``EXPERIMENT_ID`` is set in the env on rank0, the runtime drops the
+following files into ``/tmp/experiment-<id>/`` at session stop:
+- ``session_times.json``     start + end ISO timestamps (UTC)
+- ``metrics_summary.json``   NVML aggregate (avg/peak VRAM, GPU util %)
+- ``derived_config.json``    model._config snapshot (or runtime merged
+                             config dict if the model didn't expose one)
+- ``profile.pt.trace.json.gz``  torch.profiler chrome trace, only if
+                                EXPERIMENT_PROFILE_RANGE is set (see
+                                reactor_runtime.profiling.ChunkRangeProfiler)
+- ``recording.mp4``          fMP4 init.mp4 + chunk_*.m4s byte-concatenated
+                             into a single playable file, if the runtime
+                             had recording enabled and the session
+                             produced chunks
+That's it — no HTTP, no boto3, no S3. The script (``iterate-model.sh``
+on the developer's machine) reads the artifacts dir over SSH, tars +
+gzips it, POSTs it to ``/experiments/<id>/finalize``. The service
+extracts the JSONs into RDS and routes the binary blobs (profile +
+recording) into S3 using its own task role — the developer never needs
+S3 PUT credentials.
+Non-rank0 workers return ``None`` from :func:`maybe_build_session` and
+take the runtime's normal no-op path — no duplicate finalize calls.
+"""
+from __future__ import annotations
+import json
+import os
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+from reactor_runtime.profiling.nvml_sampler import NVMLSampler
+from reactor_runtime.utils.log import get_logger
+# datetime.UTC is a 3.11+ alias for timezone.utc. CI lint runs mypy with
+# --python-version 3.10, so import timezone.utc and alias it ourselves.
+UTC = timezone.utc
+logger = get_logger(__name__)
+# Name of the env var that marks a run as a tracked experiment. Its value is
+# used as the experiment id; unset or empty means tracking is off.
+EXPERIMENT_ENV_ID = "EXPERIMENT_ID"
+# Override the default artifacts dir. Defaults to /tmp/experiment-<id>/.
+EXPERIMENT_ENV_ARTIFACTS_DIR = "EXPERIMENT_ARTIFACTS_DIR"
+class ExperimentSession:
+    """One in-flight experiment session.
+    Owns the artifacts directory and the session-start/end timestamps. All
+    network egress is the caller's responsibility (``iterate-model.sh``).
+    """
+    def __init__(
+        self,
+        experiment_id: str,
+        artifacts_dir: Path,
+    ) -> None:
+        self.experiment_id = experiment_id
+        self.artifacts_dir = artifacts_dir
+        self.started_at: datetime | None = None
+        self.ended_at: datetime | None = None
+        self._nvml_sampler: NVMLSampler | None = None
+        self.artifacts_dir.mkdir(parents=True, exist_ok=True)
+    def mark_start(self, device_index: int = 0) -> None:
+        self.started_at = datetime.now(UTC)
+        try:
+            self._nvml_sampler = NVMLSampler(
+                device_index=device_index,
+                attrs={"experiment_id": self.experiment_id},
+            )
+            self._nvml_sampler.start()
+        except Exception:
+            logger.exception("Failed to start NVML sampler; continuing without")
+            self._nvml_sampler = None
+    def mark_end(self) -> None:
+        self.ended_at = datetime.now(UTC)
+    def finalize(
+        self,
+        *,
+        derived_config: dict[str, Any] | None,
+        recording_session_dir: Path | None = None,
+    ) -> None:
+        """Write every artifact the skill will ship to the tracker into
+        ``self.artifacts_dir``.
+        Each artifact is independent — a failure on one doesn't prevent
+        the others from being written. The skill treats a missing file as
+        "this artifact wasn't produced".
+        Note: the torch.profiler chrome trace + key_averages summary
+        (``profile.pt.trace.json.gz`` / ``profile.summary.txt``) are
+        written DIRECTLY into ``self.artifacts_dir`` by the model's
+        worker process via
+        :class:`reactor_runtime.profiling.ChunkRangeProfiler`. This
+        method doesn't move them around; it just trusts that they're
+        already in place by the time it runs.
+        """
+        if self.started_at and self.ended_at:
+            self._write_json(
+                "session_times.json",
+                {
+                    "start": self.started_at.isoformat(),
+                    "end": self.ended_at.isoformat(),
+                },
+            )
+        metrics_summary = self._stop_sampling_and_summarize()
+        if metrics_summary is not None:
+            self._write_json("metrics_summary.json", metrics_summary)
+        if derived_config is not None:
+            self._write_json("derived_config.json", derived_config)
+        if recording_session_dir is not None:
+            self._assemble_recording(recording_session_dir)
+        logger.info(
+            "Experiment artifacts finalized",
+            experiment_id=self.experiment_id,
+            artifacts_dir=str(self.artifacts_dir),
+        )
+    def _assemble_recording(self, session_dir: Path) -> None:
+        """Concatenate ``init.mp4 + chunk_*.m4s`` into ``recording.mp4``.
+        fMP4 byte-concat is valid for players that accept fragmented MP4
+        (Safari, Chrome, ffmpeg). Streams chunks through ``copyfileobj``
+        rather than buffering in RAM since recordings can be 100 MB+.
+        Leading-black trimming used to live here as a post-hoc ffmpeg
+        pass. Removed once the recorder learned to drop pre-roll
+        duplicates at the source (REA-2323 / #2325) — the bytes never
+        reach init.mp4 / chunk_*.m4s anymore.
+        """
+        init = session_dir / "init.mp4"
+        if not init.exists():
+            return  # Recording disabled or session was empty.
+        chunks = sorted(session_dir.glob("chunk_*.m4s"))
+        dest = self.artifacts_dir / "recording.mp4"
+        try:
+            with open(dest, "wb") as out:
+                with open(init, "rb") as src:
+                    shutil.copyfileobj(src, out)
+                for chunk in chunks:
+                    with open(chunk, "rb") as src:
+                        shutil.copyfileobj(src, out)
+        except OSError as err:
+            logger.warning(
+                "Failed to assemble recording.mp4",
+                session_dir=str(session_dir),
+                error=str(err),
+            )
+    def _stop_sampling_and_summarize(self) -> dict[str, Any] | None:
+        sampler = self._nvml_sampler
+        if sampler is None:
+            return None
+        self._nvml_sampler = None
+        try:
+            sampler.stop()
+        except Exception:
+            logger.exception("NVML sampler stop failed")
+            return None
+        state = sampler.state
+        n = max(int(state.get("samples", 0)), 1)
+        return {
+            "samples": int(state.get("samples", 0)),
+            "avg_vram_gb": round(float(state.get("vram_sum_gb", 0.0)) / n, 3),
+            "peak_vram_gb": round(float(state.get("vram_peak_gb", 0.0)), 3),
+            "avg_gpu_util_pct": round(float(state.get("gpu_util_sum_pct", 0.0)) / n, 2),
+        }
+    def _write_json(self, name: str, payload: Any) -> None:
+        path = self.artifacts_dir / name
+        try:
+            path.write_text(json.dumps(payload, default=str, indent=2))
+        except OSError as err:
+            logger.warning(
+                "Failed to write experiment artifact",
+                name=name,
+                error=str(err),
+            )
+def maybe_build_session() -> ExperimentSession | None:
+    """Return an ``ExperimentSession`` iff the runtime was launched as a
+    tracked experiment (``EXPERIMENT_ID`` set) AND this worker is rank0.
+    Non-rank0 workers and untracked runs both get ``None`` — callers
+    must treat that as "no tracking, behave as before".
+    """
+    # Only rank0 produces artifacts. Multiple ranks writing into the same
+    # /tmp/experiment-<id>/ would race on the JSON files; the cheaper fix
+    # is to keep this whole subsystem inert on non-rank0 workers.
+    if int(os.environ.get("RANK", "0")) != 0:
+        return None
+    experiment_id = os.environ.get(EXPERIMENT_ENV_ID)
+    if not experiment_id:
+        return None
+    artifacts_dir = Path(
+        os.environ.get(EXPERIMENT_ENV_ARTIFACTS_DIR)
+        or f"/tmp/experiment-{experiment_id}"
+    )
+    logger.info(
+        "Experiment tracking enabled",
+        experiment_id=experiment_id,
+        artifacts_dir=str(artifacts_dir),
+    )
+    return ExperimentSession(
+        experiment_id=experiment_id,
+        artifacts_dir=artifacts_dir,
+    )

{reactor_runtime-2.7.4 → reactor_runtime-2.7.5}/src/reactor_runtime/profiling/__init__.py RENAMED Viewed

@@ -76,9 +76,11 @@ from reactor_runtime.profiling.profiler import (
 from reactor_runtime.profiling.singleton import get_profiler, set_profiler
 from reactor_runtime.profiling.backends.base import ProfilerBackend
 from reactor_runtime.profiling.helpers import profile_fn
+from reactor_runtime.profiling.torch_chunk_profiler import ChunkRangeProfiler
 __all__ = [
     "BucketPreset",
+    "ChunkRangeProfiler",
     "CudaTimingMode",
     "Profiler",
     "ProfilerSection",

reactor_runtime-2.7.5/src/reactor_runtime/profiling/nvml_sampler.py ADDED Viewed

@@ -0,0 +1,200 @@
+# Copyright (c) 2026 Reactor Technologies, Inc. All rights reserved.
+"""NVML resource sampler — general-purpose GPU observability.
+Polls VRAM use + GPU utilization at 1Hz on a daemon thread and emits them
+as OTLP gauges through the meter pipeline (set up by
+``reactor_machine_metrics``). The exporter ships them to Grafana via the
+same path the rest of the runtime uses.
+Activation is the caller's responsibility — typical pattern is
+"start it when there's a session you care about, stop it when the
+session ends". The sampler doesn't know about experiments or sessions;
+it just samples and emits. Callers that want a snapshot-style summary
+(e.g. for inclusion in a row update) can read ``.state`` at stop time
+and aggregate however they like.
+Cost: ~2 NVML calls per second on a side thread. The main thread and
+CUDA streams are untouched.
+"""
+from __future__ import annotations
+import threading
+from typing import Any
+from opentelemetry import metrics
+from reactor_runtime.utils.log import get_logger
+logger = get_logger(__name__)
+class NVMLSampler:
+    """Background NVML poller emitting OTLP gauges + maintaining state.
+    Parameters
+    ----------
+    device_index:
+        CUDA device to poll. Defaults to 0.
+    interval_s:
+        Poll interval in seconds. 1Hz default.
+    attrs:
+        Label dict applied to every emitted observation. Use this to
+        attach context like ``{"experiment_id": "exp_..."}`` or
+        ``{"model_name": "delta-forcing"}``. ``None`` → empty attrs
+        (gauges still emit, just unlabeled beyond the global meter labels).
+    meter_name:
+        OTel meter name. Defaults to ``reactor.machine`` — keep this
+        consistent with what your dashboards expect.
+    metric_namespace:
+        Prefix for emitted metrics. ``reactor.machine`` → emits
+        ``reactor.machine.vram_used_gb`` + ``reactor.machine.gpu_util_pct``.
+    """
+    def __init__(
+        self,
+        device_index: int = 0,
+        interval_s: float = 1.0,
+        attrs: dict[str, str] | None = None,
+        meter_name: str = "reactor.machine",
+        metric_namespace: str = "reactor.machine",
+    ) -> None:
+        self._device_index = device_index
+        self._interval_s = interval_s
+        self._stop_evt = threading.Event()
+        self._thread: threading.Thread | None = None
+        meter = metrics.get_meter(meter_name)
+        self._attrs: dict[str, str] = dict(attrs or {})
+        # In-memory state read by .state — used for snapshot-style
+        # aggregation at session-stop (avg/peak summaries etc.).
+        self._last_vram_gb: float = 0.0
+        self._last_gpu_util_pct: float = 0.0
+        self._sample_count: int = 0
+        self._vram_sum_gb: float = 0.0
+        self._gpu_util_sum_pct: float = 0.0
+        self._vram_peak_gb: float = 0.0
+        meter.create_observable_gauge(
+            name=f"{metric_namespace}.vram_used_gb",
+            callbacks=[self._observe_vram],
+            description="VRAM used (GB), sampled at 1Hz",
+            unit="GB",
+        )
+        meter.create_observable_gauge(
+            name=f"{metric_namespace}.gpu_util_pct",
+            callbacks=[self._observe_gpu_util],
+            description="GPU utilization (%), sampled at 1Hz",
+            unit="%",
+        )
+    # ------------------------------------------------------------------
+    # OTel callbacks
+    # ------------------------------------------------------------------
+    def _observe_vram(self, _options):  # type: ignore[no-untyped-def]
+        from opentelemetry.metrics import Observation
+        return [Observation(self._last_vram_gb, self._attrs)]
+    def _observe_gpu_util(self, _options):  # type: ignore[no-untyped-def]
+        from opentelemetry.metrics import Observation
+        return [Observation(self._last_gpu_util_pct, self._attrs)]
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+    def start(self) -> None:
+        if self._thread is not None:
+            return
+        self._thread = threading.Thread(
+            target=self._run,
+            name=f"nvml-sampler-{self._device_index}",
+            daemon=True,
+        )
+        self._thread.start()
+    def stop(self) -> None:
+        self._stop_evt.set()
+        thread = self._thread
+        if thread is not None and thread.is_alive():
+            thread.join(timeout=2.0)
+        self._thread = None
+    # ------------------------------------------------------------------
+    # Sampling loop
+    # ------------------------------------------------------------------
+    def _run(self) -> None:
+        try:
+            import pynvml  # type: ignore
+        except ImportError:
+            logger.warning("pynvml not installed; NVML sampler disabled")
+            return
+        try:
+            pynvml.nvmlInit()
+        except Exception as err:
+            logger.warning("nvmlInit failed", error=str(err))
+            return
+        # Once nvmlInit() succeeds we MUST pair it with nvmlShutdown(),
+        # even if handle lookup below fails — otherwise NVML's library
+        # state leaks for the lifetime of the process.
+        try:
+            try:
+                handle = pynvml.nvmlDeviceGetHandleByIndex(self._device_index)
+            except Exception as err:
+                logger.warning(
+                    "nvmlDeviceGetHandleByIndex failed",
+                    device_index=self._device_index,
+                    error=str(err),
+                )
+                return
+            while not self._stop_evt.is_set():
+                try:
+                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
+                    vram_gb = mem.used / 1e9
+                    gpu_pct = float(util.gpu)
+                    self._last_vram_gb = vram_gb
+                    self._last_gpu_util_pct = gpu_pct
+                    self._vram_sum_gb += vram_gb
+                    self._gpu_util_sum_pct += gpu_pct
+                    self._vram_peak_gb = max(self._vram_peak_gb, vram_gb)
+                    self._sample_count += 1
+                except Exception as err:
+                    logger.debug("NVML sample failed", error=str(err))
+                # Sleep on the stop-event so stop() unblocks immediately.
+                self._stop_evt.wait(self._interval_s)
+        finally:
+            try:
+                pynvml.nvmlShutdown()
+            except Exception as err:
+                logger.debug("nvmlShutdown failed", error=str(err))
+    # ------------------------------------------------------------------
+    # State accessor for snapshot aggregation
+    # ------------------------------------------------------------------
+    @property
+    def state(self) -> dict[str, Any]:
+        """Raw counter state — caller aggregates as they wish.
+        Snapshot semantics: returns the current counters at call time. Safe
+        to call from another thread; reads are atomic enough that the
+        worst case is a sample number that doesn't quite match the sums
+        (off by one). For the avg/peak rollups used by experiment-tracking
+        that's fine.
+        """
+        return {
+            "samples": self._sample_count,
+            "vram_sum_gb": self._vram_sum_gb,
+            "vram_peak_gb": self._vram_peak_gb,
+            "gpu_util_sum_pct": self._gpu_util_sum_pct,
+        }

reactor-runtime 2.7.4__tar.gz → 2.7.5__tar.gz

reactor-runtime 2.7.4__tar.gz → 2.7.5__tar.gz