runtheta 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. runtheta-0.1.9/.dockerignore +11 -0
  2. runtheta-0.1.9/.github/workflows/ci.yml +64 -0
  3. runtheta-0.1.9/.gitignore +16 -0
  4. runtheta-0.1.9/.supabase/functions/telemetry_ingest/index.ts +89 -0
  5. runtheta-0.1.9/.supabase/migrations/1780625094_thermalos_intelligence_network.sql +65 -0
  6. runtheta-0.1.9/Dockerfile +45 -0
  7. runtheta-0.1.9/PKG-INFO +219 -0
  8. runtheta-0.1.9/README.md +198 -0
  9. runtheta-0.1.9/deploy/grafana/dashboards/theta.json +259 -0
  10. runtheta-0.1.9/deploy/grafana/provisioning/dashboards/theta.yml +7 -0
  11. runtheta-0.1.9/deploy/grafana/provisioning/datasources/prometheus.yml +7 -0
  12. runtheta-0.1.9/deploy/grafana/theta_default_dashboard.json +269 -0
  13. runtheta-0.1.9/deploy/prometheus.yml +11 -0
  14. runtheta-0.1.9/deploy/supabase/schema.sql +65 -0
  15. runtheta-0.1.9/deploy/supabase/telemetry_ingest.ts +89 -0
  16. runtheta-0.1.9/deploy/theta-monitor.service +26 -0
  17. runtheta-0.1.9/docker-compose.yml +64 -0
  18. runtheta-0.1.9/pyproject.toml +35 -0
  19. runtheta-0.1.9/sim/README.md +155 -0
  20. runtheta-0.1.9/sim/__init__.py +1 -0
  21. runtheta-0.1.9/sim/elt/__init__.py +14 -0
  22. runtheta-0.1.9/sim/elt/analysis.py +227 -0
  23. runtheta-0.1.9/sim/elt/degradation.py +170 -0
  24. runtheta-0.1.9/sim/elt/detector.py +170 -0
  25. runtheta-0.1.9/sim/elt/experiment.py +188 -0
  26. runtheta-0.1.9/sim/elt/params.py +178 -0
  27. runtheta-0.1.9/sim/elt/run_elt.py +178 -0
  28. runtheta-0.1.9/sim/elt/thermal_model.py +174 -0
  29. runtheta-0.1.9/sim/elt/validate.py +149 -0
  30. runtheta-0.1.9/sim/tests/__init__.py +0 -0
  31. runtheta-0.1.9/sim/tests/test_elt.py +121 -0
  32. runtheta-0.1.9/supabase/.temp/cli-latest +1 -0
  33. runtheta-0.1.9/supabase/.temp/gotrue-version +1 -0
  34. runtheta-0.1.9/supabase/.temp/linked-project.json +1 -0
  35. runtheta-0.1.9/supabase/.temp/pooler-url +1 -0
  36. runtheta-0.1.9/supabase/.temp/postgres-version +1 -0
  37. runtheta-0.1.9/supabase/.temp/project-ref +1 -0
  38. runtheta-0.1.9/supabase/.temp/rest-version +1 -0
  39. runtheta-0.1.9/supabase/.temp/storage-migration +1 -0
  40. runtheta-0.1.9/supabase/.temp/storage-version +1 -0
  41. runtheta-0.1.9/supabase/functions/telemetry_ingest/index.ts +89 -0
  42. runtheta-0.1.9/tests/test_metrics.py +248 -0
  43. runtheta-0.1.9/theta/__init__.py +7 -0
  44. runtheta-0.1.9/theta/agent/__init__.py +0 -0
  45. runtheta-0.1.9/theta/agent/alerter.py +218 -0
  46. runtheta-0.1.9/theta/agent/baseline.py +204 -0
  47. runtheta-0.1.9/theta/agent/calibrate.py +233 -0
  48. runtheta-0.1.9/theta/agent/classifier.py +217 -0
  49. runtheta-0.1.9/theta/agent/collector.py +211 -0
  50. runtheta-0.1.9/theta/agent/correlator.py +91 -0
  51. runtheta-0.1.9/theta/agent/daemon.py +534 -0
  52. runtheta-0.1.9/theta/agent/dcgm_collector.py +151 -0
  53. runtheta-0.1.9/theta/agent/detector.py +185 -0
  54. runtheta-0.1.9/theta/agent/exporter.py +229 -0
  55. runtheta-0.1.9/theta/agent/fault_classifier.py +411 -0
  56. runtheta-0.1.9/theta/agent/health_api.py +176 -0
  57. runtheta-0.1.9/theta/agent/metrics.py +150 -0
  58. runtheta-0.1.9/theta/agent/predictor.py +237 -0
  59. runtheta-0.1.9/theta/agent/redfish_collector.py +171 -0
  60. runtheta-0.1.9/theta/agent/sdc_hunter.py +266 -0
  61. runtheta-0.1.9/theta/agent/silicon.py +339 -0
  62. runtheta-0.1.9/theta/agent/state.py +199 -0
  63. runtheta-0.1.9/theta/agent/telemetry.py +217 -0
  64. runtheta-0.1.9/theta/agent/unsupervised.py +199 -0
  65. runtheta-0.1.9/theta/agent/window.py +116 -0
  66. runtheta-0.1.9/theta/cli.py +445 -0
  67. runtheta-0.1.9/theta/models/__init__.py +0 -0
  68. runtheta-0.1.9/theta/models/bundle/dt_steady_state.pkl +0 -0
  69. runtheta-0.1.9/theta/models/bundle/nb_steady_state.pkl +0 -0
  70. runtheta-0.1.9/theta/models/train.py +130 -0
  71. runtheta-0.1.9/theta/wizard.py +717 -0
@@ -0,0 +1,11 @@
1
+ .git
2
+ .github
3
+ __pycache__
4
+ *.pyc
5
+ *.pyo
6
+ .pytest_cache
7
+ dist/
8
+ *.egg-info/
9
+ tests/
10
+ .venv/
11
+ *.pkl.bak
@@ -0,0 +1,64 @@
1
# Continuous integration: unit tests on three Python versions, a ruff lint
# pass, and a Docker build that is pushed only for pushes to main.
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so that 3.10 is not parsed as the float 3.1.
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip

      - name: Install
        run: pip install -e ".[dev]"

      - name: Train bundled models
        run: |
          # CI has no GPU data — skip training, rely on hard-coded rule fallback
          echo "Skipping model training in CI (no Stage 1 CSV available)"

      - name: Test
        run: pytest tests/ -v --tb=short

  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip
      - run: pip install ruff
      - run: ruff check theta/ --select E,F,W --ignore E501

  docker:
    runs-on: ubuntu-latest
    needs: test
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker image
        run: docker build -t runtheta/agent:ci .

      - name: Smoke test (demo mode)
        run: docker run --rm runtheta/agent:ci --help

      - name: Tag and push (main only)
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        # Secrets are handed to the script through the environment instead of
        # being expanded into the script text: a password containing quotes,
        # `$` or backticks would otherwise be re-interpreted by the shell.
        env:
          DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
          docker tag runtheta/agent:ci runtheta/agent:latest
          docker push runtheta/agent:latest
@@ -0,0 +1,16 @@
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.egg-info/
6
+ build/
7
+ dist/
8
+
9
+ # Simulation venv + generated artifacts
10
+ sim/.venv/
11
+ sim/elt/out/
12
+ sim/elt/out_*/
13
+ .pytest_cache/
14
+
15
+ # OS
16
+ .DS_Store
@@ -0,0 +1,89 @@
1
+ // Theta Intelligence Network — Supabase Edge Function
2
+ // Deploy: supabase functions deploy telemetry_ingest
3
+ //
4
+ // Receives anonymized GPU health batches from the theta agent.
5
+ // Validates, normalizes, inserts into gpu_health_hourly.
6
+ // Returns community benchmarks for the agent's GPU generation.
7
+
8
+ import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
9
+
10
// Shared Supabase client for this function. Both settings are read from the
// function's environment; the `!` assertions mean a missing variable is only
// noticed when createClient (or the first query) rejects the value.
const supabaseUrl = Deno.env.get("SUPABASE_URL")!;
const serviceRoleKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!;

const supabase = createClient(supabaseUrl, serviceRoleKey);
14
+
15
// GPU generation labels accepted from agents; anything else is rejected.
const ALLOWED_GPU_GENS = new Set([
  "t4-class", "a100-class", "h100-class", "b200-class",
  "l40-class", "a10-class", "mi300-class", "other",
]);

// Upper bound applied to the per-bucket sample count.
const MAX_SAMPLES_PER_BUCKET = 10000;
// At most this many alert type labels are kept per record.
const MAX_ALERT_TYPES = 10;

// Converts an optional numeric field: null/undefined and anything that does
// not coerce to a finite number become null (NaN/Infinity would be serialized
// as JSON null anyway, this just makes it explicit).
function finiteOrNull(value: unknown): number | null {
  if (value == null) return null;
  const n = Number(value);
  return Number.isFinite(n) ? n : null;
}

// Parses the hour bucket. Only real numbers and non-blank numeric strings are
// accepted: Number(null), Number("") and Number(true) coerce to 0/1 and would
// otherwise be stored as a bogus bucket at the epoch.
function parseHour(value: unknown): number | null {
  const isNumber = typeof value === "number";
  const isNumericText = typeof value === "string" && value.trim() !== "";
  if (!isNumber && !isNumericText) return null;
  const hour = Number(value);
  // Safe-integer check also keeps the value inside the BIGINT column range.
  return Number.isSafeInteger(hour) && hour >= 0 ? hour : null;
}

/**
 * Validates one hour-bucket record sent by an agent and maps it onto the
 * column set inserted into gpu_health_hourly.
 *
 * @param batch  Untrusted record taken from the request body.
 * @returns The normalized row, or null when the record is not an object, its
 *          gpu_gen is not in ALLOWED_GPU_GENS, or its hour is not a
 *          non-negative integer.
 */
function sanitize(batch: any): any | null {
  if (typeof batch !== "object" || batch === null) return null;

  const gpu_gen = String(batch.gpu_gen ?? "other");
  if (!ALLOWED_GPU_GENS.has(gpu_gen)) return null;

  const hour = parseHour(batch.hour);
  if (hour === null) return null;

  // n_samples feeds an INT NOT NULL column: a NaN (serialized as null), a
  // negative or a fractional value would make the whole multi-row insert
  // fail, so coerce to an integer in [0, MAX_SAMPLES_PER_BUCKET].
  const rawSamples = Number(batch.n_samples ?? 0);
  const n_samples = Number.isNaN(rawSamples)
    ? 0
    : Math.min(Math.max(Math.trunc(rawSamples), 0), MAX_SAMPLES_PER_BUCKET);

  // Error counts are never negative; unusable values count as zero.
  const rawEcc = Number(batch.ecc_sbit_total ?? 0);
  const ecc_sbit_total = Number.isFinite(rawEcc) ? Math.max(0, rawEcc) : 0;

  return {
    gpu_gen,
    hour,
    n_samples,
    rtheta_mean: finiteOrNull(batch.rtheta_mean),
    rtheta_std_mean: finiteOrNull(batch.rtheta_std_mean),
    ecc_sbit_total,
    ecc_dbit_any: Boolean(batch.ecc_dbit_any),
    clock_eff_mean: finiteOrNull(batch.clock_eff_mean),
    alert_types: Array.isArray(batch.alert_types)
      ? batch.alert_types.slice(0, MAX_ALERT_TYPES).map(String)
      : [],
    recovery_time_p50: finiteOrNull(batch.recovery_time_p50),
  };
}
40
+
41
// HTTP entry point for telemetry ingestion.
// Accepts POST { install_id, agent_version, batches[] }, archives the raw
// batches, inserts the sanitized rows into gpu_health_hourly and replies with
// { accepted, benchmarks } for the GPU generations that were accepted.
// Status codes: 405 non-POST, 400 malformed request, 500 when the health rows
// could not be stored, 200 otherwise.
Deno.serve(async (req: Request) => {
  const jsonResponse = (payload: unknown, status: number) =>
    new Response(JSON.stringify(payload), {
      headers: { "Content-Type": "application/json" },
      status,
    });

  if (req.method !== "POST") {
    return new Response("Method not allowed", { status: 405 });
  }

  let body: any;
  try {
    body = await req.json();
  } catch {
    return new Response("Invalid JSON", { status: 400 });
  }

  // `null`, numbers and strings are valid JSON too; reading a property off
  // `null` would throw and surface as an unhandled 500.
  if (typeof body !== "object" || body === null) {
    return new Response("Bad request", { status: 400 });
  }

  const install_id = String(body.install_id ?? "").slice(0, 32);
  const agent_version = String(body.agent_version ?? "unknown").slice(0, 20);
  const batches = Array.isArray(body.batches) ? body.batches : [];

  if (!install_id || batches.length === 0 || batches.length > 500) {
    return new Response("Bad request", { status: 400 });
  }

  // Store raw batch for audit. The client reports failures through `error`
  // rather than throwing; the audit copy is best effort, so log and carry on.
  const { error: auditError } = await supabase.from("telemetry_batches").insert({
    install_id,
    agent_version,
    batch: batches,
  });
  if (auditError) {
    console.error("telemetry_batches insert failed:", auditError.message);
  }

  // Normalize and insert health rows
  const rows = batches
    .map((b: any) => sanitize(b))
    .filter(Boolean)
    .map((r: any) => ({ ...r, install_id }));

  if (rows.length > 0) {
    const { error: insertError } = await supabase.from("gpu_health_hourly").insert(rows);
    if (insertError) {
      // Do not report rows as accepted when nothing was stored.
      console.error("gpu_health_hourly insert failed:", insertError.message);
      return jsonResponse({ accepted: 0, benchmarks: [], error: "storage_error" }, 500);
    }
  }

  // Return community benchmarks for the GPU generations present in this batch
  const gpu_gens = [...new Set(rows.map((r: any) => r.gpu_gen))];
  let benchmarks: any[] = [];
  if (gpu_gens.length > 0) {
    const { data, error: benchmarkError } = await supabase
      .from("community_benchmarks")
      .select("*")
      .in("gpu_gen", gpu_gens);
    if (benchmarkError) {
      // Benchmarks are a bonus for the caller; the upload itself succeeded.
      console.error("community_benchmarks query failed:", benchmarkError.message);
    }
    benchmarks = data ?? [];
  }

  return jsonResponse({ accepted: rows.length, benchmarks }, 200);
});
@@ -0,0 +1,65 @@
1
-- Theta Intelligence Network: Supabase schema.
-- Apply once with `supabase db push`, or paste into the SQL editor.

-- ---------------------------------------------------------------------------
-- Raw telemetry batches: one row per upload, kept as received.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS telemetry_batches (
    id            bigserial   PRIMARY KEY,
    received_at   timestamptz DEFAULT now(),
    install_id    text        NOT NULL,  -- anonymous sha256 of machine UUID (16 chars)
    agent_version text        NOT NULL,
    batch         jsonb       NOT NULL   -- array of aggregated hour-bucket records
);

CREATE INDEX IF NOT EXISTS idx_telemetry_install  ON telemetry_batches (install_id);
CREATE INDEX IF NOT EXISTS idx_telemetry_received ON telemetry_batches (received_at);

-- ---------------------------------------------------------------------------
-- Normalized GPU health: one row per sanitized hour-bucket record.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS gpu_health_hourly (
    id                bigserial        PRIMARY KEY,
    install_id        text             NOT NULL,
    hour              bigint           NOT NULL,  -- unix epoch // 3600
    gpu_gen           text             NOT NULL,  -- "h100-class", "b200-class", etc.
    n_samples         integer          NOT NULL,
    rtheta_mean       double precision,
    rtheta_std_mean   double precision,
    ecc_sbit_total    double precision DEFAULT 0,
    ecc_dbit_any      boolean          DEFAULT false,
    clock_eff_mean    double precision,
    alert_types       text[],
    recovery_time_p50 double precision,
    created_at        timestamptz      DEFAULT now()
);

CREATE INDEX IF NOT EXISTS idx_health_gpu_gen ON gpu_health_hourly (gpu_gen);
CREATE INDEX IF NOT EXISTS idx_health_hour    ON gpu_health_hourly (hour);

-- ---------------------------------------------------------------------------
-- Community benchmark view: per-generation aggregates over the last 30 days,
-- ignoring buckets with fewer than 5 samples.
-- This is the "give back" — exposed to agents as /v1/benchmarks?gpu_gen=h100-class
-- ---------------------------------------------------------------------------
CREATE OR REPLACE VIEW community_benchmarks AS
SELECT
    gpu_gen,
    count(DISTINCT install_id)                                AS fleet_size,
    percentile_cont(0.25) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p25,
    percentile_cont(0.50) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p50,
    percentile_cont(0.75) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p75,
    percentile_cont(0.95) WITHIN GROUP (ORDER BY rtheta_mean) AS rtheta_p95,
    percentile_cont(0.50) WITHIN GROUP (ORDER BY clock_eff_mean)
        FILTER (WHERE clock_eff_mean IS NOT NULL)             AS clock_eff_p50,
    avg(ecc_sbit_total)                                       AS avg_ecc_sbit_per_hour,
    -- share of buckets that saw at least one double-bit ECC error
    CAST(count(*) FILTER (WHERE ecc_dbit_any) AS double precision)
        / count(*)                                            AS dbit_event_rate,
    max(created_at)                                           AS last_updated
FROM gpu_health_hourly
WHERE created_at > now() - INTERVAL '30 days'
  AND n_samples >= 5
GROUP BY gpu_gen;

-- ---------------------------------------------------------------------------
-- Row-level security
-- All writes go through the Edge Function (service role). Public can read
-- benchmarks.
-- ---------------------------------------------------------------------------
ALTER TABLE telemetry_batches ENABLE ROW LEVEL SECURITY;
ALTER TABLE gpu_health_hourly ENABLE ROW LEVEL SECURITY;

-- No policies are defined here: the Edge Function uses the service role key
-- (full access), and public readers (agent GET /benchmarks) go through the
-- view only, with no direct table access.
@@ -0,0 +1,45 @@
1
# ── Stage 1: build ────────────────────────────────────────────────────────────
# Builds the wheel in a throwaway stage so build tooling stays out of the
# runtime image.
FROM python:3.12-slim AS builder

WORKDIR /build
COPY pyproject.toml README.md ./
COPY theta/ ./theta/

# --no-cache-dir: pip's download cache is never reused inside an image build.
RUN pip install --no-cache-dir --upgrade pip --quiet \
    && pip install --no-cache-dir build --quiet \
    && python -m build --wheel --outdir /dist

# ── Stage 2: runtime ──────────────────────────────────────────────────────────
FROM python:3.12-slim AS runtime

LABEL org.opencontainers.image.title="Theta"
LABEL org.opencontainers.image.description="GPU thermal-power forensics agent"
LABEL org.opencontainers.image.licenses="MIT"
LABEL org.opencontainers.image.source="https://github.com/Asomisetty27/theta"

# Non-root user
RUN useradd --create-home --shell /bin/bash theta

WORKDIR /app
COPY --from=builder /dist/*.whl .
# Without --no-cache-dir the wheels of every dependency would stay in this
# layer under the pip cache directory.
RUN pip install --no-cache-dir --quiet *.whl && rm *.whl

# Config and log dirs (writable by theta user)
RUN mkdir -p /home/theta/.theta /var/log/theta \
    && chown -R theta:theta /home/theta/.theta /var/log/theta

USER theta

# Prometheus metrics
EXPOSE 9101

# Defaults — override via env vars or command args
# NOTE(review): CMD below passes the same values as explicit flags; whether the
# CLI still honours THETA_* when the flags are present is not visible here —
# confirm, otherwise only overriding the command has an effect.
ENV THETA_INTERVAL=5 \
    THETA_PROMETHEUS_PORT=9101 \
    THETA_LOG=/var/log/theta/alerts.jsonl

ENTRYPOINT ["theta"]
CMD ["monitor", \
     "--interval", "5", \
     "--port", "9101", \
     "--log", "/var/log/theta/alerts.jsonl"]
@@ -0,0 +1,219 @@
1
+ Metadata-Version: 2.4
2
+ Name: runtheta
3
+ Version: 0.1.9
4
+ Summary: GPU thermal-power forensics agent. Computes R_theta = ΔT/P in real time.
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: httpx>=0.27
8
+ Requires-Dist: joblib>=1.3
9
+ Requires-Dist: numpy>=1.24
10
+ Requires-Dist: nvidia-ml-py>=12.0.0
11
+ Requires-Dist: prometheus-client>=0.20
12
+ Requires-Dist: questionary>=2.0
13
+ Requires-Dist: rich>=13.0
14
+ Requires-Dist: scikit-learn>=1.3
15
+ Requires-Dist: structlog>=24.0
16
+ Requires-Dist: typer>=0.12
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
19
+ Requires-Dist: pytest>=8.0; extra == 'dev'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # Theta
23
+
24
+ **GPU thermal-power forensics agent.** Computes `R_θ = ΔT / P` in real time from your existing DCGM telemetry. That ratio is the only signal that separates a busy-hot GPU from a failing-hot one — and no incumbent computes it.
25
+
26
+ ```
27
+ theta_gpu_rtheta_cwatt{gpu_index="3"} 2.104 # zombie recovery — CUDA context stuck
28
+ theta_gpu_rtheta_cwatt{gpu_index="3"} 0.724 # under load — healthy
29
+ theta_gpu_rtheta_cwatt{gpu_index="3"} 1.281 # clean idle — normal
30
+ ```
31
+
32
+ ---
33
+
34
+ ## The problem
35
+
36
+ A GPU at 82°C could be:
37
+ - **Busy and healthy** — running a job at thermal equilibrium
38
+ - **Cooling path failing** — ambient temperature up, heatsink degrading
39
+ - **CUDA zombie** — process exited but context retained, drawing 31W at 0% utilization
40
+
41
+ `nvidia-smi`, DCGM, and Mission Control all expose T and P as separate fields. None of them divide the two. Theta does.
42
+
43
+ ---
44
+
45
+ ## Quick start
46
+
47
+ ### pip (single node, free forever)
48
+
49
+ ```bash
50
+ pip install runtheta
51
+ theta setup # interactive wizard — 90 seconds to first R_θ reading
52
+ theta monitor # start monitoring
53
+ ```
54
+
55
+ ### Docker
56
+
57
+ ```bash
58
+ docker run --gpus all -p 9101:9101 runtheta/agent:latest
59
+ ```
60
+
61
+ ### Docker Compose (agent + Prometheus + Grafana)
62
+
63
+ ```bash
64
+ git clone https://github.com/Asomisetty27/theta
65
+ cd theta
66
+ docker compose --profile metrics up
67
+ ```
68
+
69
+ Open `http://localhost:3000` — Grafana dashboard pre-provisioned, no setup required.
70
+ Login: `admin` / `theta`
71
+
72
+ ---
73
+
74
+ ## How it works
75
+
76
+ ```
77
+ GPU (pynvml)
78
+ → T_junction, P_GPU, util, P-state every 5s
79
+ → R_θ = (T_junction − T_ref) / P_GPU
80
+ → 15s steady-state window (σ < 0.03 C/W)
81
+ → Decision Tree classifier → {under_load, clean_idle, zombie_recovery, child_exit_recovery}
82
+ → Rolling baseline + k·σ drift detector
83
+ → Alert (stdout / Slack webhook / JSONL / Prometheus)
84
+ ```
85
+
86
+ **Virtual ambient** — `T_ref` is derived from the GPU's own stable idle windows. No thermocouple, no rack modification, no extra hardware.
87
+
88
+ **Steady-state filter** — classification only runs on stable windows. This raises Naive Bayes accuracy from 84% to 99.8% and eliminates transient false positives.
89
+
90
+ **Classifier** — Decision Tree trained on 4,570 rows of Stage 1 Tesla T4 data. 100% 5-fold CV accuracy on steady-state samples. Rules are human-readable and publishable:
91
+
92
+ ```
93
+ IF R_θ ≤ 0.87 → under_load (n=963, conf=1.00)
94
+ IF R_θ > 0.87, P0 → zombie_recovery (n=584, conf=1.00) ← CUDA zombie
95
+ IF R_θ > 1.50, P8 → child_exit_recovery (n=696, conf=1.00)
96
+ ELSE → clean_idle / early recovery
97
+ ```
98
+
99
+ ---
100
+
101
+ ## CLI reference
102
+
103
+ ```
104
+ theta setup Interactive wizard (run this first)
105
+ theta monitor Run agent — blocks until Ctrl+C
106
+ theta monitor --interval 2 Sample every 2s
107
+ theta monitor --gpus 0,1,3 Monitor specific GPUs
108
+ theta monitor --webhook <url> Send alerts to Slack / generic webhook
109
+ theta monitor --log alerts.jsonl Append alerts to JSONL file
110
+ theta monitor --port 9101 Prometheus metrics port (0 = disabled)
111
+ theta monitor --nb Use Naive Bayes instead of Decision Tree
112
+ theta baseline --gpu 0 Lock virtual ambient T_ref from idle window
113
+ theta baseline --gpu 0 --manual 24 Set T_ref manually (°C)
114
+ theta classify Snapshot classify all GPUs right now
115
+ theta serve --port 9101 Metrics export only (no stdout alerts)
116
+ theta train /path/data.csv Retrain bundled models from new data
117
+ ```
118
+
119
+ ---
120
+
121
+ ## Prometheus metrics
122
+
123
+ | Metric | Type | Description |
124
+ |---|---|---|
125
+ | `theta_gpu_rtheta_cwatt` | gauge | R_θ (C/W) — the core signal |
126
+ | `theta_gpu_state_info` | gauge | Current classified state (label: `state`) |
127
+ | `theta_gpu_drift_sigma` | gauge | Deviation from baseline in σ units |
128
+ | `theta_gpu_temperature_celsius` | gauge | Junction temperature |
129
+ | `theta_gpu_power_watts` | gauge | GPU power consumption |
130
+ | `theta_gpu_utilization_ratio` | gauge | 0–1 utilization |
131
+ | `theta_gpu_perf_state` | gauge | P-state (0=max, 8=idle) |
132
+ | `theta_gpu_baseline_tref_celsius` | gauge | Virtual ambient T_ref |
133
+ | `theta_gpu_window_rtheta_std` | gauge | Steady-state window σ |
134
+ | `theta_gpu_alerts_total` | counter | Alerts (labels: `severity`, `state`) |
135
+
136
+ All metrics include a `gpu_index` label.
137
+
138
+ ---
139
+
140
+ ## Alert payload (webhook / JSONL)
141
+
142
+ Every alert includes full forensic context:
143
+
144
+ ```json
145
+ {
146
+ "source": "theta",
147
+ "severity": "critical",
148
+ "gpu_index": 3,
149
+ "state": "zombie_recovery",
150
+ "prev_state": "under_load",
151
+ "rtheta": 1.541,
152
+ "rtheta_baseline": 0.724,
153
+ "drift_sigma": 4.2,
154
+ "confidence": 1.0,
155
+ "message": "[CRITICAL] GPU 3 — CUDA zombie detected. R_θ=1.541 at 0% utilisation. Action: release CUDA context.",
156
+ "context": {
157
+ "severity": "critical",
158
+ "duration_prev": 3842.1,
159
+ "history": [
160
+ { "ts": 1748995200.1, "state": "under_load", "r": 0.721, "conf": 0.99 }
161
+ ]
162
+ }
163
+ }
164
+ ```
165
+
166
+ ---
167
+
168
+ ## Why not DCGM / Mission Control / Phaidra?
169
+
170
+ | Capability | DCGM | Mission Control | Phaidra | **Theta** |
171
+ |---|:---:|:---:|:---:|:---:|
172
+ | Computes R_θ | ✗ | ✗ | ✗ | **✓** |
173
+ | Separates busy-hot vs failing-hot | ✗ | ✗ | ✗ | **✓** |
174
+ | CUDA zombie detection | ✗ | ✗ | ✗ | **✓** |
175
+ | Drift detection (baseline + k·σ) | ✗ | ✗ | ◐ | **✓** |
176
+ | Virtual ambient (no hardware) | ✗ | ✗ | ✗ | **✓** |
177
+ | Serves neocloud / mixed fleets | ✓ | ✗ | ✗ | **✓** |
178
+ | Open-source agent | ✓ | ✗ | ✗ | **✓** |
179
+
180
+ Mission Control ships only on Blackwell DGX/GB200. Theta runs on any NVIDIA GPU reachable by pynvml.
181
+
182
+ ---
183
+
184
+ ## Requirements
185
+
186
+ - Python 3.10+
187
+ - NVIDIA GPU with driver ≥ 450 (for pynvml)
188
+ - No DCGM required — pynvml only
189
+
190
+ For Docker: [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
191
+
192
+ ---
193
+
194
+ ## Retrain on your own data
195
+
196
+ ```bash
197
+ theta train /path/to/measurements.csv
198
+ ```
199
+
200
+ CSV schema: `phase, trial_second, rtheta_cwatt, power_w, util_pct, perf_state, ...`
201
+
202
+ ---
203
+
204
+ ## Research basis
205
+
206
+ - **F1** — R_θ separates idle (1.28 C/W) from load (0.72 C/W) with 77.9% margin, Tesla T4
207
+ - **F2** — Ambient sensitivity: 7.1%/°C at idle vs 2.0%/°C at load (3.5× difference)
208
+ - **F6** — CUDA zombie: same-process exit leaves GPU at P0 (~31W), invisible to utilization
209
+
210
+ Stage 1: 4,570 rows · Tesla T4 · E001–E004 · 9 child-exit trials
211
+ Stage 2 (in progress): Cal Poly DGX B200 AI Factory · E005–E008
212
+
213
+ ---
214
+
215
+ ## License
216
+
217
+ MIT — free forever for single-node use.
218
+
219
+ Built at Cal Poly SLO · [asomisetty27@gmail.com](mailto:asomisetty27@gmail.com)