PyPI - runtheta - Versions diffs - 0.1.9__tar.gz - Mend

runtheta 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

runtheta-0.1.9/.dockerignore +11 -0
runtheta-0.1.9/.github/workflows/ci.yml +64 -0
runtheta-0.1.9/.gitignore +16 -0
runtheta-0.1.9/.supabase/functions/telemetry_ingest/index.ts +89 -0
runtheta-0.1.9/.supabase/migrations/1780625094_thermalos_intelligence_network.sql +65 -0
runtheta-0.1.9/Dockerfile +45 -0
runtheta-0.1.9/PKG-INFO +219 -0
runtheta-0.1.9/README.md +198 -0
runtheta-0.1.9/deploy/grafana/dashboards/theta.json +259 -0
runtheta-0.1.9/deploy/grafana/provisioning/dashboards/theta.yml +7 -0
runtheta-0.1.9/deploy/grafana/provisioning/datasources/prometheus.yml +7 -0
runtheta-0.1.9/deploy/grafana/theta_default_dashboard.json +269 -0
runtheta-0.1.9/deploy/prometheus.yml +11 -0
runtheta-0.1.9/deploy/supabase/schema.sql +65 -0
runtheta-0.1.9/deploy/supabase/telemetry_ingest.ts +89 -0
runtheta-0.1.9/deploy/theta-monitor.service +26 -0
runtheta-0.1.9/docker-compose.yml +64 -0
runtheta-0.1.9/pyproject.toml +35 -0
runtheta-0.1.9/sim/README.md +155 -0
runtheta-0.1.9/sim/__init__.py +1 -0
runtheta-0.1.9/sim/elt/__init__.py +14 -0
runtheta-0.1.9/sim/elt/analysis.py +227 -0
runtheta-0.1.9/sim/elt/degradation.py +170 -0
runtheta-0.1.9/sim/elt/detector.py +170 -0
runtheta-0.1.9/sim/elt/experiment.py +188 -0
runtheta-0.1.9/sim/elt/params.py +178 -0
runtheta-0.1.9/sim/elt/run_elt.py +178 -0
runtheta-0.1.9/sim/elt/thermal_model.py +174 -0
runtheta-0.1.9/sim/elt/validate.py +149 -0
runtheta-0.1.9/sim/tests/__init__.py +0 -0
runtheta-0.1.9/sim/tests/test_elt.py +121 -0
runtheta-0.1.9/supabase/.temp/cli-latest +1 -0
runtheta-0.1.9/supabase/.temp/gotrue-version +1 -0
runtheta-0.1.9/supabase/.temp/linked-project.json +1 -0
runtheta-0.1.9/supabase/.temp/pooler-url +1 -0
runtheta-0.1.9/supabase/.temp/postgres-version +1 -0
runtheta-0.1.9/supabase/.temp/project-ref +1 -0
runtheta-0.1.9/supabase/.temp/rest-version +1 -0
runtheta-0.1.9/supabase/.temp/storage-migration +1 -0
runtheta-0.1.9/supabase/.temp/storage-version +1 -0
runtheta-0.1.9/supabase/functions/telemetry_ingest/index.ts +89 -0
runtheta-0.1.9/tests/test_metrics.py +248 -0
runtheta-0.1.9/theta/__init__.py +7 -0
runtheta-0.1.9/theta/agent/__init__.py +0 -0
runtheta-0.1.9/theta/agent/alerter.py +218 -0
runtheta-0.1.9/theta/agent/baseline.py +204 -0
runtheta-0.1.9/theta/agent/calibrate.py +233 -0
runtheta-0.1.9/theta/agent/classifier.py +217 -0
runtheta-0.1.9/theta/agent/collector.py +211 -0
runtheta-0.1.9/theta/agent/correlator.py +91 -0
runtheta-0.1.9/theta/agent/daemon.py +534 -0
runtheta-0.1.9/theta/agent/dcgm_collector.py +151 -0
runtheta-0.1.9/theta/agent/detector.py +185 -0
runtheta-0.1.9/theta/agent/exporter.py +229 -0
runtheta-0.1.9/theta/agent/fault_classifier.py +411 -0
runtheta-0.1.9/theta/agent/health_api.py +176 -0
runtheta-0.1.9/theta/agent/metrics.py +150 -0
runtheta-0.1.9/theta/agent/predictor.py +237 -0
runtheta-0.1.9/theta/agent/redfish_collector.py +171 -0
runtheta-0.1.9/theta/agent/sdc_hunter.py +266 -0
runtheta-0.1.9/theta/agent/silicon.py +339 -0
runtheta-0.1.9/theta/agent/state.py +199 -0
runtheta-0.1.9/theta/agent/telemetry.py +217 -0
runtheta-0.1.9/theta/agent/unsupervised.py +199 -0
runtheta-0.1.9/theta/agent/window.py +116 -0
runtheta-0.1.9/theta/cli.py +445 -0
runtheta-0.1.9/theta/models/__init__.py +0 -0
runtheta-0.1.9/theta/models/bundle/dt_steady_state.pkl +0 -0
runtheta-0.1.9/theta/models/bundle/nb_steady_state.pkl +0 -0
runtheta-0.1.9/theta/models/train.py +130 -0
runtheta-0.1.9/theta/wizard.py +717 -0

runtheta-0.1.9/.dockerignore ADDED Viewed

@@ -0,0 +1,11 @@
+.git
+.github
+__pycache__
+*.pyc
+*.pyo
+.pytest_cache
+dist/
+*.egg-info/
+tests/
+.venv/
+*.pkl.bak

runtheta-0.1.9/.github/workflows/ci.yml ADDED Viewed

@@ -0,0 +1,64 @@
+name: CI
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: Install
+        run: pip install -e ".[dev]"
+      - name: Train bundled models
+        run: |
+          # CI has no GPU data — skip training, rely on hard-coded rule fallback
+          echo "Skipping model training in CI (no Stage 1 CSV available)"
+      - name: Test
+        run: pytest tests/ -v --tb=short
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - run: pip install ruff
+      - run: ruff check theta/ --select E,F,W --ignore E501
+  docker:
+    runs-on: ubuntu-latest
+    needs: test
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build Docker image
+        run: docker build -t runtheta/agent:ci .
+      - name: Smoke test (demo mode)
+        run: docker run --rm runtheta/agent:ci --help
+      - name: Tag and push (main only)
+        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
+        run: |
+          echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin
+          docker tag runtheta/agent:ci runtheta/agent:latest
+          docker push runtheta/agent:latest

runtheta-0.1.9/.gitignore ADDED Viewed

@@ -0,0 +1,16 @@
+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+build/
+dist/
+# Simulation venv + generated artifacts
+sim/.venv/
+sim/elt/out/
+sim/elt/out_*/
+.pytest_cache/
+# OS
+.DS_Store

runtheta-0.1.9/.supabase/functions/telemetry_ingest/index.ts ADDED Viewed

@@ -0,0 +1,89 @@
+// Theta Intelligence Network — Supabase Edge Function
+// Deploy: supabase functions deploy telemetry_ingest
+//
+// Receives anonymized GPU health batches from the theta agent.
+// Validates, normalizes, inserts into gpu_health_hourly.
+// Returns community benchmarks for the agent's GPU generation.
+import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
+const supabase = createClient(
+  Deno.env.get("SUPABASE_URL")!,
+  Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!
+);
+const ALLOWED_GPU_GENS = new Set([
+  "t4-class", "a100-class", "h100-class", "b200-class",
+  "l40-class", "a10-class", "mi300-class", "other",
+]);
+// Validate and normalize one hour-bucket record submitted by an agent.
+//
+// Returns null (the caller drops the record) when `batch` is not an object,
+// when its gpu_gen is not in ALLOWED_GPU_GENS, or when `hour` is not a
+// non-negative integer. Otherwise returns a row holding only the known
+// columns, with every numeric field coerced to a value the table accepts.
+function sanitize(batch: any): any | null {
+  if (typeof batch !== "object" || batch === null) return null;
+  const gpu_gen = String(batch.gpu_gen ?? "other");
+  if (!ALLOWED_GPU_GENS.has(gpu_gen)) return null;
+  const hour = Number(batch.hour);
+  if (!Number.isInteger(hour) || hour < 0) return null;
+  // Optional metrics: absent or non-finite input becomes an explicit null
+  // instead of relying on JSON serialization turning NaN/Infinity into null.
+  const optionalNumber = (value: unknown): number | null => {
+    if (value == null) return null;
+    const parsed = Number(value);
+    return Number.isFinite(parsed) ? parsed : null;
+  };
+  // n_samples lands in an INT NOT NULL column. A NaN would serialize to null
+  // and a fractional value is not a valid integer, either of which can make
+  // the whole bulk insert fail, so coerce to an integer clamped to [0, 10000].
+  const rawSamples = Number(batch.n_samples ?? 0);
+  const n_samples = Number.isNaN(rawSamples)
+    ? 0
+    : Math.min(Math.max(Math.trunc(rawSamples), 0), 10000);
+  // ECC single-bit counter: never negative; garbage input counts as zero.
+  const rawSbit = Number(batch.ecc_sbit_total ?? 0);
+  const ecc_sbit_total = Number.isFinite(rawSbit) ? Math.max(0, rawSbit) : 0;
+  return {
+    gpu_gen,
+    hour,
+    n_samples,
+    rtheta_mean:       optionalNumber(batch.rtheta_mean),
+    rtheta_std_mean:   optionalNumber(batch.rtheta_std_mean),
+    ecc_sbit_total,
+    ecc_dbit_any:      Boolean(batch.ecc_dbit_any),
+    clock_eff_mean:    optionalNumber(batch.clock_eff_mean),
+    // At most 10 alert labels per record, always stored as strings.
+    alert_types:       Array.isArray(batch.alert_types) ? batch.alert_types.slice(0, 10).map(String) : [],
+    recovery_time_p50: optionalNumber(batch.recovery_time_p50),
+  };
+}
+// HTTP entry point: accepts POSTed telemetry, stores it, and replies with the
+// community benchmarks for the GPU generations present in the accepted rows.
+//
+// Responses: 405 for non-POST, 400 for malformed or out-of-bounds payloads,
+// 500 when the normalized rows cannot be stored, and 200 with
+// { accepted, benchmarks } otherwise.
+Deno.serve(async (req: Request) => {
+  if (req.method !== "POST") {
+    return new Response("Method not allowed", { status: 405 });
+  }
+  let body: any;
+  try {
+    body = await req.json();
+  } catch {
+    return new Response("Invalid JSON", { status: 400 });
+  }
+  // Valid JSON can still be null, a scalar or an array; reject those up
+  // front so the property reads below cannot throw on a null body.
+  if (typeof body !== "object" || body === null || Array.isArray(body)) {
+    return new Response("Bad request", { status: 400 });
+  }
+  const install_id    = String(body.install_id ?? "").slice(0, 32);
+  const agent_version = String(body.agent_version ?? "unknown").slice(0, 20);
+  const batches       = Array.isArray(body.batches) ? body.batches : [];
+  if (!install_id || batches.length === 0 || batches.length > 500) {
+    return new Response("Bad request", { status: 400 });
+  }
+  // Store raw batch for audit. This write stays best-effort: a failure is
+  // logged but does not block ingestion of the normalized rows.
+  const { error: auditError } = await supabase.from("telemetry_batches").insert({
+    install_id,
+    agent_version,
+    batch: batches,
+  });
+  if (auditError) {
+    console.error("telemetry_batches insert failed:", auditError.message);
+  }
+  // Normalize and insert health rows
+  const rows = batches
+    .map((b: any) => sanitize(b))
+    .filter(Boolean)
+    .map((r: any) => ({ ...r, install_id }));
+  if (rows.length > 0) {
+    // The client reports failures through `error` rather than throwing, so
+    // check it: otherwise a failed write would still be counted as accepted.
+    const { error: insertError } = await supabase.from("gpu_health_hourly").insert(rows);
+    if (insertError) {
+      console.error("gpu_health_hourly insert failed:", insertError.message);
+      return new Response(
+        JSON.stringify({ accepted: 0, error: "Failed to store telemetry" }),
+        { headers: { "Content-Type": "application/json" }, status: 500 }
+      );
+    }
+  }
+  // Return community benchmarks for the GPU generations present in this batch
+  const gpu_gens = [...new Set(rows.map((r: any) => r.gpu_gen))];
+  let benchmarks: any[] = [];
+  // Nothing to look up when every record was rejected by sanitize().
+  if (gpu_gens.length > 0) {
+    const { data, error: benchError } = await supabase
+      .from("community_benchmarks")
+      .select("*")
+      .in("gpu_gen", gpu_gens);
+    if (benchError) {
+      // Benchmarks are a courtesy payload; ingestion already succeeded.
+      console.error("community_benchmarks query failed:", benchError.message);
+    }
+    benchmarks = data ?? [];
+  }
+  return new Response(
+    JSON.stringify({ accepted: rows.length, benchmarks }),
+    { headers: { "Content-Type": "application/json" }, status: 200 }
+  );
+});

runtheta-0.1.9/.supabase/migrations/1780625094_thermalos_intelligence_network.sql ADDED Viewed

@@ -0,0 +1,65 @@
+-- Theta Intelligence Network — Supabase schema
+-- Run once: supabase db push (or paste into SQL editor)
+-- ── Raw telemetry batches ──────────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS telemetry_batches (
+    id              BIGSERIAL PRIMARY KEY,
+    received_at     TIMESTAMPTZ DEFAULT NOW(),
+    install_id      TEXT NOT NULL,          -- anonymous sha256 of machine UUID (16 chars)
+    agent_version   TEXT NOT NULL,
+    batch           JSONB NOT NULL          -- array of aggregated hour-bucket records
+);
+CREATE INDEX IF NOT EXISTS idx_telemetry_install ON telemetry_batches(install_id);
+CREATE INDEX IF NOT EXISTS idx_telemetry_received ON telemetry_batches(received_at);
+-- ── Normalized GPU health events ──────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS gpu_health_hourly (
+    id              BIGSERIAL PRIMARY KEY,
+    install_id      TEXT NOT NULL,
+    hour            BIGINT NOT NULL,        -- unix epoch // 3600
+    gpu_gen         TEXT NOT NULL,          -- "h100-class", "b200-class", etc.
+    n_samples       INT NOT NULL,
+    rtheta_mean     FLOAT,
+    rtheta_std_mean FLOAT,
+    ecc_sbit_total  FLOAT DEFAULT 0,
+    ecc_dbit_any    BOOLEAN DEFAULT FALSE,
+    clock_eff_mean  FLOAT,
+    alert_types     TEXT[],
+    recovery_time_p50 FLOAT,
+    created_at      TIMESTAMPTZ DEFAULT NOW()
+);
+CREATE INDEX IF NOT EXISTS idx_health_gpu_gen ON gpu_health_hourly(gpu_gen);
+CREATE INDEX IF NOT EXISTS idx_health_hour ON gpu_health_hourly(hour);
+-- ── Community benchmark view ──────────────────────────────────────────────────
+-- This is the "give back" — exposed to agents as /v1/benchmarks?gpu_gen=h100-class
+CREATE OR REPLACE VIEW community_benchmarks AS
+SELECT
+    gpu_gen,
+    COUNT(DISTINCT install_id)                          AS fleet_size,
+    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p25,
+    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p50,
+    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p75,
+    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p95,
+    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY clock_eff_mean) FILTER (WHERE clock_eff_mean IS NOT NULL) AS clock_eff_p50,
+    AVG(ecc_sbit_total)                                AS avg_ecc_sbit_per_hour,
+    SUM(CASE WHEN ecc_dbit_any THEN 1 ELSE 0 END)::FLOAT / COUNT(*) AS dbit_event_rate,
+    MAX(created_at)                                    AS last_updated
+FROM gpu_health_hourly
+WHERE created_at > NOW() - INTERVAL '30 days'
+  AND n_samples >= 5
+GROUP BY gpu_gen;
+-- ── Row-level security ────────────────────────────────────────────────────────
+-- All writes go through the Edge Function (service role). Public can read benchmarks.
+ALTER TABLE telemetry_batches ENABLE ROW LEVEL SECURITY;
+ALTER TABLE gpu_health_hourly ENABLE ROW LEVEL SECURITY;
+-- Edge function uses service role key — full access
+-- Public (agent GET /benchmarks) reads the view only, no direct table access

runtheta-0.1.9/Dockerfile ADDED Viewed

@@ -0,0 +1,45 @@
+# ── Stage 1: build ────────────────────────────────────────────────────────────
+FROM python:3.12-slim AS builder
+WORKDIR /build
+COPY pyproject.toml README.md ./
+COPY theta/ ./theta/
+RUN pip install --upgrade pip --quiet \
+ && pip install build --quiet \
+ && python -m build --wheel --outdir /dist
+# ── Stage 2: runtime ──────────────────────────────────────────────────────────
+FROM python:3.12-slim AS runtime
+LABEL org.opencontainers.image.title="Theta"
+LABEL org.opencontainers.image.description="GPU thermal-power forensics agent"
+LABEL org.opencontainers.image.licenses="MIT"
+LABEL org.opencontainers.image.source="https://github.com/Asomisetty27/theta"
+# Non-root user
+RUN useradd --create-home --shell /bin/bash theta
+WORKDIR /app
+COPY --from=builder /dist/*.whl .
+RUN pip install --quiet *.whl && rm *.whl
+# Config and log dirs (writable by theta user)
+RUN mkdir -p /home/theta/.theta /var/log/theta \
+ && chown -R theta:theta /home/theta/.theta /var/log/theta
+USER theta
+# Prometheus metrics
+EXPOSE 9101
+# Defaults — override via env vars or command args
+ENV THETA_INTERVAL=5 \
+    THETA_PROMETHEUS_PORT=9101 \
+    THETA_LOG=/var/log/theta/alerts.jsonl
+ENTRYPOINT ["theta"]
+CMD ["monitor", \
+     "--interval", "5", \
+     "--port",     "9101", \
+     "--log",      "/var/log/theta/alerts.jsonl"]

runtheta-0.1.9/PKG-INFO ADDED Viewed

@@ -0,0 +1,219 @@
+Metadata-Version: 2.4
+Name: runtheta
+Version: 0.1.9
+Summary: GPU thermal-power forensics agent. Computes R_theta = ΔT/P in real time.
+License: MIT
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.27
+Requires-Dist: joblib>=1.3
+Requires-Dist: numpy>=1.24
+Requires-Dist: nvidia-ml-py>=12.0.0
+Requires-Dist: prometheus-client>=0.20
+Requires-Dist: questionary>=2.0
+Requires-Dist: rich>=13.0
+Requires-Dist: scikit-learn>=1.3
+Requires-Dist: structlog>=24.0
+Requires-Dist: typer>=0.12
+Provides-Extra: dev
+Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Description-Content-Type: text/markdown
+# Theta
+**GPU thermal-power forensics agent.** Computes `R_θ = ΔT / P` in real time from telemetry your GPUs already expose (read via pynvml — DCGM is not required). That ratio is the only signal that separates a busy-hot GPU from a failing-hot one — and no incumbent computes it.
+```
+theta_gpu_rtheta_cwatt{gpu_index="3"} 2.104   # zombie recovery — CUDA context stuck
+theta_gpu_rtheta_cwatt{gpu_index="3"} 0.724   # under load — healthy
+theta_gpu_rtheta_cwatt{gpu_index="3"} 1.281   # clean idle — normal
+```
+---
+## The problem
+A GPU at 82°C could be:
+- **Busy and healthy** — running a job at thermal equilibrium
+- **Cooling path failing** — ambient temperature up, heatsink degrading
+- **CUDA zombie** — process exited but context retained, drawing 31W at 0% utilization
+`nvidia-smi`, DCGM, and Mission Control all expose T and P as separate fields. None of them divide the two. Theta does.
+---
+## Quick start
+### pip (single node, free forever)
+```bash
+pip install runtheta
+theta setup        # interactive wizard — 90 seconds to first R_θ reading
+theta monitor      # start monitoring
+```
+### Docker
+```bash
+docker run --gpus all -p 9101:9101 runtheta/agent:latest
+```
+### Docker Compose (agent + Prometheus + Grafana)
+```bash
+git clone https://github.com/Asomisetty27/theta
+cd theta
+docker compose --profile metrics up
+```
+Open `http://localhost:3000` — Grafana dashboard pre-provisioned, no setup required.
+Login: `admin` / `theta`
+---
+## How it works
+```
+GPU (pynvml)
+  → T_junction, P_GPU, util, P-state every 5s
+  → R_θ = (T_junction − T_ref) / P_GPU
+  → 15s steady-state window  (σ < 0.03 C/W)
+  → Decision Tree classifier  →  {under_load, clean_idle, zombie_recovery, child_exit_recovery}
+  → Rolling baseline + k·σ drift detector
+  → Alert (stdout / Slack webhook / JSONL / Prometheus)
+```
+**Virtual ambient** — `T_ref` is derived from the GPU's own stable idle windows. No thermocouple, no rack modification, no extra hardware.
+**Steady-state filter** — classification only runs on stable windows. This raises Naive Bayes accuracy from 84% to 99.8% and eliminates transient false positives.
+**Classifier** — Decision Tree trained on 4,570 rows of Stage 1 Tesla T4 data. 100% 5-fold CV accuracy on steady-state samples. Rules are human-readable and publishable:
+```
+IF R_θ ≤ 0.87        →  under_load          (n=963, conf=1.00)
+IF R_θ > 0.87, P0    →  zombie_recovery     (n=584, conf=1.00)  ← CUDA zombie
+IF R_θ > 1.50, P8    →  child_exit_recovery (n=696, conf=1.00)
+ELSE                 →  clean_idle / early recovery
+```
+---
+## CLI reference
+```
+theta setup                         Interactive wizard (run this first)
+theta monitor                       Run agent — blocks until Ctrl+C
+theta monitor --interval 2          Sample every 2s
+theta monitor --gpus 0,1,3          Monitor specific GPUs
+theta monitor --webhook <url>       Send alerts to Slack / generic webhook
+theta monitor --log alerts.jsonl    Append alerts to JSONL file
+theta monitor --port 9101           Prometheus metrics port (0 = disabled)
+theta monitor --nb                  Use Naive Bayes instead of Decision Tree
+theta baseline --gpu 0              Lock virtual ambient T_ref from idle window
+theta baseline --gpu 0 --manual 24  Set T_ref manually (°C)
+theta classify                      Snapshot classify all GPUs right now
+theta serve --port 9101             Metrics export only (no stdout alerts)
+theta train /path/data.csv          Retrain bundled models from new data
+```
+---
+## Prometheus metrics
+| Metric | Type | Description |
+|---|---|---|
+| `theta_gpu_rtheta_cwatt` | gauge | R_θ (C/W) — the core signal |
+| `theta_gpu_state_info` | gauge | Current classified state (label: `state`) |
+| `theta_gpu_drift_sigma` | gauge | Deviation from baseline in σ units |
+| `theta_gpu_temperature_celsius` | gauge | Junction temperature |
+| `theta_gpu_power_watts` | gauge | GPU power consumption |
+| `theta_gpu_utilization_ratio` | gauge | 0–1 utilization |
+| `theta_gpu_perf_state` | gauge | P-state (0=max, 8=idle) |
+| `theta_gpu_baseline_tref_celsius` | gauge | Virtual ambient T_ref |
+| `theta_gpu_window_rtheta_std` | gauge | Steady-state window σ |
+| `theta_gpu_alerts_total` | counter | Alerts (labels: `severity`, `state`) |
+All metrics include a `gpu_index` label.
+---
+## Alert payload (webhook / JSONL)
+Every alert includes full forensic context:
+```json
+{
+  "source":    "theta",
+  "severity":  "critical",
+  "gpu_index": 3,
+  "state":     "zombie_recovery",
+  "prev_state": "under_load",
+  "rtheta":    1.541,
+  "rtheta_baseline": 0.724,
+  "drift_sigma": 4.2,
+  "confidence": 1.0,
+  "message":   "[CRITICAL] GPU 3 — CUDA zombie detected. R_θ=1.541 at 0% utilisation. Action: release CUDA context.",
+  "context": {
+    "severity": "critical",
+    "duration_prev": 3842.1,
+    "history": [
+      { "ts": 1748995200.1, "state": "under_load", "r": 0.721, "conf": 0.99 }
+    ]
+  }
+}
+```
+---
+## Why not DCGM / Mission Control / Phaidra?
+| Capability | DCGM | Mission Control | Phaidra | **Theta** |
+|---|:---:|:---:|:---:|:---:|
+| Computes R_θ | ✗ | ✗ | ✗ | **✓** |
+| Separates busy-hot vs failing-hot | ✗ | ✗ | ✗ | **✓** |
+| CUDA zombie detection | ✗ | ✗ | ✗ | **✓** |
+| Drift detection (baseline + k·σ) | ✗ | ✗ | ◐ | **✓** |
+| Virtual ambient (no hardware) | ✗ | ✗ | ✗ | **✓** |
+| Serves neocloud / mixed fleets | ✓ | ✗ | ✗ | **✓** |
+| Open-source agent | ✓ | ✗ | ✗ | **✓** |
+Mission Control ships only on Blackwell DGX/GB200. Theta runs on any NVIDIA GPU reachable by pynvml.
+---
+## Requirements
+- Python 3.10+
+- NVIDIA GPU with driver ≥ 450 (for pynvml)
+- No DCGM required — pynvml only
+For Docker: [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
+---
+## Retrain on your own data
+```bash
+theta train /path/to/measurements.csv
+```
+CSV schema: `phase, trial_second, rtheta_cwatt, power_w, util_pct, perf_state, ...`
+---
+## Research basis
+- **F1** — R_θ separates idle (1.28 C/W) from load (0.72 C/W) with 77.9% margin, Tesla T4
+- **F2** — Ambient sensitivity: 7.1%/°C at idle vs 2.0%/°C at load (3.5× difference)
+- **F6** — CUDA zombie: same-process exit leaves GPU at P0 (~31W), invisible to utilization
+Stage 1: 4,570 rows · Tesla T4 · E001–E004 · 9 child-exit trials
+Stage 2 (in progress): Cal Poly DGX B200 AI Factory · E005–E008
+---
+## License
+MIT — free forever for single-node use.
+Built at Cal Poly SLO · [asomisetty27@gmail.com](mailto:asomisetty27@gmail.com)