runtheta 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runtheta-0.1.9/.dockerignore +11 -0
- runtheta-0.1.9/.github/workflows/ci.yml +64 -0
- runtheta-0.1.9/.gitignore +16 -0
- runtheta-0.1.9/.supabase/functions/telemetry_ingest/index.ts +89 -0
- runtheta-0.1.9/.supabase/migrations/1780625094_thermalos_intelligence_network.sql +65 -0
- runtheta-0.1.9/Dockerfile +45 -0
- runtheta-0.1.9/PKG-INFO +219 -0
- runtheta-0.1.9/README.md +198 -0
- runtheta-0.1.9/deploy/grafana/dashboards/theta.json +259 -0
- runtheta-0.1.9/deploy/grafana/provisioning/dashboards/theta.yml +7 -0
- runtheta-0.1.9/deploy/grafana/provisioning/datasources/prometheus.yml +7 -0
- runtheta-0.1.9/deploy/grafana/theta_default_dashboard.json +269 -0
- runtheta-0.1.9/deploy/prometheus.yml +11 -0
- runtheta-0.1.9/deploy/supabase/schema.sql +65 -0
- runtheta-0.1.9/deploy/supabase/telemetry_ingest.ts +89 -0
- runtheta-0.1.9/deploy/theta-monitor.service +26 -0
- runtheta-0.1.9/docker-compose.yml +64 -0
- runtheta-0.1.9/pyproject.toml +35 -0
- runtheta-0.1.9/sim/README.md +155 -0
- runtheta-0.1.9/sim/__init__.py +1 -0
- runtheta-0.1.9/sim/elt/__init__.py +14 -0
- runtheta-0.1.9/sim/elt/analysis.py +227 -0
- runtheta-0.1.9/sim/elt/degradation.py +170 -0
- runtheta-0.1.9/sim/elt/detector.py +170 -0
- runtheta-0.1.9/sim/elt/experiment.py +188 -0
- runtheta-0.1.9/sim/elt/params.py +178 -0
- runtheta-0.1.9/sim/elt/run_elt.py +178 -0
- runtheta-0.1.9/sim/elt/thermal_model.py +174 -0
- runtheta-0.1.9/sim/elt/validate.py +149 -0
- runtheta-0.1.9/sim/tests/__init__.py +0 -0
- runtheta-0.1.9/sim/tests/test_elt.py +121 -0
- runtheta-0.1.9/supabase/.temp/cli-latest +1 -0
- runtheta-0.1.9/supabase/.temp/gotrue-version +1 -0
- runtheta-0.1.9/supabase/.temp/linked-project.json +1 -0
- runtheta-0.1.9/supabase/.temp/pooler-url +1 -0
- runtheta-0.1.9/supabase/.temp/postgres-version +1 -0
- runtheta-0.1.9/supabase/.temp/project-ref +1 -0
- runtheta-0.1.9/supabase/.temp/rest-version +1 -0
- runtheta-0.1.9/supabase/.temp/storage-migration +1 -0
- runtheta-0.1.9/supabase/.temp/storage-version +1 -0
- runtheta-0.1.9/supabase/functions/telemetry_ingest/index.ts +89 -0
- runtheta-0.1.9/tests/test_metrics.py +248 -0
- runtheta-0.1.9/theta/__init__.py +7 -0
- runtheta-0.1.9/theta/agent/__init__.py +0 -0
- runtheta-0.1.9/theta/agent/alerter.py +218 -0
- runtheta-0.1.9/theta/agent/baseline.py +204 -0
- runtheta-0.1.9/theta/agent/calibrate.py +233 -0
- runtheta-0.1.9/theta/agent/classifier.py +217 -0
- runtheta-0.1.9/theta/agent/collector.py +211 -0
- runtheta-0.1.9/theta/agent/correlator.py +91 -0
- runtheta-0.1.9/theta/agent/daemon.py +534 -0
- runtheta-0.1.9/theta/agent/dcgm_collector.py +151 -0
- runtheta-0.1.9/theta/agent/detector.py +185 -0
- runtheta-0.1.9/theta/agent/exporter.py +229 -0
- runtheta-0.1.9/theta/agent/fault_classifier.py +411 -0
- runtheta-0.1.9/theta/agent/health_api.py +176 -0
- runtheta-0.1.9/theta/agent/metrics.py +150 -0
- runtheta-0.1.9/theta/agent/predictor.py +237 -0
- runtheta-0.1.9/theta/agent/redfish_collector.py +171 -0
- runtheta-0.1.9/theta/agent/sdc_hunter.py +266 -0
- runtheta-0.1.9/theta/agent/silicon.py +339 -0
- runtheta-0.1.9/theta/agent/state.py +199 -0
- runtheta-0.1.9/theta/agent/telemetry.py +217 -0
- runtheta-0.1.9/theta/agent/unsupervised.py +199 -0
- runtheta-0.1.9/theta/agent/window.py +116 -0
- runtheta-0.1.9/theta/cli.py +445 -0
- runtheta-0.1.9/theta/models/__init__.py +0 -0
- runtheta-0.1.9/theta/models/bundle/dt_steady_state.pkl +0 -0
- runtheta-0.1.9/theta/models/bundle/nb_steady_state.pkl +0 -0
- runtheta-0.1.9/theta/models/train.py +130 -0
- runtheta-0.1.9/theta/wizard.py +717 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Continuous integration for the theta agent.
#   test   - installs the package with dev extras and runs pytest on each
#            supported Python version
#   lint   - ruff over theta/ (E, F, W rules; line length ignored)
#   docker - builds the image, smoke-tests the entrypoint, and on pushes to
#            main publishes runtheta/agent:latest
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# Least privilege: no job here needs more than read access to the repository.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so "3.10" is not parsed as the float 3.1.
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip

      - name: Install
        run: pip install -e ".[dev]"

      - name: Train bundled models
        run: |
          # CI has no GPU data — skip training, rely on hard-coded rule fallback
          echo "Skipping model training in CI (no Stage 1 CSV available)"

      - name: Test
        run: pytest tests/ -v --tb=short

  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip
      - run: pip install ruff
      - run: ruff check theta/ --select E,F,W --ignore E501

  docker:
    runs-on: ubuntu-latest
    needs: test
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker image
        run: docker build -t runtheta/agent:ci .

      - name: Smoke test (demo mode)
        run: docker run --rm runtheta/agent:ci --help

      - name: Tag and push (main only)
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        # Secrets are passed through the environment rather than interpolated
        # into the script text, so shell metacharacters in a secret value can
        # never be executed as part of the command line.
        env:
          DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          printf '%s' "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
          docker tag runtheta/agent:ci runtheta/agent:latest
          docker push runtheta/agent:latest
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// Theta Intelligence Network — Supabase Edge Function
|
|
2
|
+
// Deploy: supabase functions deploy telemetry_ingest
|
|
3
|
+
//
|
|
4
|
+
// Receives anonymized GPU health batches from the theta agent.
|
|
5
|
+
// Validates, normalizes, inserts into gpu_health_hourly.
|
|
6
|
+
// Returns community benchmarks for the agent's GPU generation.
|
|
7
|
+
|
|
8
|
+
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
|
|
9
|
+
|
|
10
|
+
// Service-role client used for every table write and the benchmark read below.
// Both values are read from the function's environment; the non-null
// assertions mean a missing variable surfaces inside createClient.
const supabaseUrl = Deno.env.get("SUPABASE_URL")!;
const serviceRoleKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!;
const supabase = createClient(supabaseUrl, serviceRoleKey);
|
|
14
|
+
|
|
15
|
+
// GPU generation buckets accepted from agents; anything else is rejected.
const ALLOWED_GPU_GENS = new Set([
  "t4-class", "a100-class", "h100-class", "b200-class",
  "l40-class", "a10-class", "mi300-class", "other",
]);

// Upper bound applied to the per-bucket sample count.
const MAX_SAMPLES = 10000;

/** Coerce an optional numeric field: null/undefined/NaN/±Infinity all become null. */
function finiteOrNull(value: unknown): number | null {
  if (value == null) return null;
  const n = Number(value);
  return Number.isFinite(n) ? n : null;
}

/** Coerce a counter-like field to a finite number >= 0, falling back to 0. */
function nonNegativeOrZero(value: unknown): number {
  const n = Number(value ?? 0);
  return Number.isFinite(n) ? Math.max(0, n) : 0;
}

/**
 * Validate and normalize one hour-bucket record sent by an agent.
 *
 * Returns a plain row object for the `gpu_health_hourly` insert (without
 * `install_id`, which the caller adds), or `null` when the record must be
 * dropped: not an object, unknown `gpu_gen`, or a missing/invalid `hour`.
 *
 * Every numeric field is forced to a finite value (or null) so that one
 * malformed record cannot make the whole bulk insert fail.
 */
function sanitize(batch: any): any | null {
  if (typeof batch !== "object" || batch === null) return null;

  const gpu_gen = String(batch.gpu_gen ?? "other");
  if (!ALLOWED_GPU_GENS.has(gpu_gen)) return null;

  // Only real numbers or non-empty numeric strings count as an hour.
  // Number(null), Number("") and Number([]) are all 0, which would otherwise
  // be accepted as a valid bucket.
  const rawHour = batch.hour;
  let hour = NaN;
  if (typeof rawHour === "number") {
    hour = rawHour;
  } else if (typeof rawHour === "string" && rawHour.trim() !== "") {
    hour = Number(rawHour);
  }
  // Safe-integer check keeps the value representable in the integer column.
  if (!Number.isSafeInteger(hour) || hour < 0) return null;

  // n_samples is an integer column: truncate fractions, clamp to [0, MAX_SAMPLES].
  const n_samples = Math.min(Math.trunc(nonNegativeOrZero(batch.n_samples)), MAX_SAMPLES);

  return {
    gpu_gen,
    hour,
    n_samples,
    rtheta_mean: finiteOrNull(batch.rtheta_mean),
    rtheta_std_mean: finiteOrNull(batch.rtheta_std_mean),
    ecc_sbit_total: nonNegativeOrZero(batch.ecc_sbit_total),
    ecc_dbit_any: Boolean(batch.ecc_dbit_any),
    clock_eff_mean: finiteOrNull(batch.clock_eff_mean),
    // At most 10 alert labels are kept, each coerced to a string.
    alert_types: Array.isArray(batch.alert_types) ? batch.alert_types.slice(0, 10).map(String) : [],
    recovery_time_p50: finiteOrNull(batch.recovery_time_p50),
  };
}
|
|
40
|
+
|
|
41
|
+
// HTTP entry point: accepts POSTed telemetry batches from an agent.
//
// Request body (JSON object): { install_id, agent_version?, batches: [...] }
// Responses:
//   405 - any method other than POST
//   400 - unparseable JSON, non-object body, missing install_id, or a batch
//         count outside 1..500
//   500 - the normalized rows could not be written
//   200 - { accepted: <rows stored>, benchmarks: [...] }
Deno.serve(async (req: Request) => {
  if (req.method !== "POST") {
    return new Response("Method not allowed", { status: 405 });
  }

  let body: any;
  try {
    body = await req.json();
  } catch {
    return new Response("Invalid JSON", { status: 400 });
  }

  // Valid JSON can still be null or a primitive; reading a property of null
  // would throw and surface as an unhandled error instead of a 400.
  if (typeof body !== "object" || body === null) {
    return new Response("Bad request", { status: 400 });
  }

  const install_id = String(body.install_id ?? "").slice(0, 32);
  const agent_version = String(body.agent_version ?? "unknown").slice(0, 20);
  const batches = Array.isArray(body.batches) ? body.batches : [];

  if (!install_id || batches.length === 0 || batches.length > 500) {
    return new Response("Bad request", { status: 400 });
  }

  // Store raw batch for audit. supabase-js reports failures through the
  // returned `error` rather than by throwing; the audit copy stays
  // best-effort, but a failure is now logged instead of vanishing.
  const { error: auditError } = await supabase.from("telemetry_batches").insert({
    install_id,
    agent_version,
    batch: batches,
  });
  if (auditError) {
    console.error("telemetry_batches insert failed:", auditError.message);
  }

  // Normalize and insert health rows
  const rows = batches
    .map((b: any) => sanitize(b))
    .filter(Boolean)
    .map((r: any) => ({ ...r, install_id }));

  if (rows.length > 0) {
    const { error: insertError } = await supabase.from("gpu_health_hourly").insert(rows);
    if (insertError) {
      // Do not report rows as accepted when nothing was stored.
      console.error("gpu_health_hourly insert failed:", insertError.message);
      return new Response("Failed to store telemetry", { status: 500 });
    }
  }

  // Return community benchmarks for the GPU generations present in this batch.
  // With no accepted rows there is nothing to look up, so skip the query.
  const gpu_gens = [...new Set(rows.map((r: any) => r.gpu_gen))];
  let benchmarks: any[] = [];
  if (gpu_gens.length > 0) {
    const { data, error: benchError } = await supabase
      .from("community_benchmarks")
      .select("*")
      .in("gpu_gen", gpu_gens);
    if (benchError) {
      // Benchmarks are a convenience for the agent; the ingest still succeeded.
      console.error("community_benchmarks query failed:", benchError.message);
    }
    benchmarks = data ?? [];
  }

  return new Response(
    JSON.stringify({ accepted: rows.length, benchmarks }),
    { headers: { "Content-Type": "application/json" }, status: 200 }
  );
});
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
-- Theta Intelligence Network — Supabase schema
|
|
2
|
+
-- Run once: supabase db push (or paste into SQL editor)
|
|
3
|
+
|
|
4
|
+
-- ── Raw telemetry batches ──────────────────────────────────────────────────────
|
|
5
|
+
|
|
6
|
+
-- Audit copy of every upload: one row per request, payload kept as sent.
CREATE TABLE IF NOT EXISTS telemetry_batches (
    id            BIGSERIAL   PRIMARY KEY,
    received_at   TIMESTAMPTZ DEFAULT NOW(),
    install_id    TEXT        NOT NULL,   -- anonymous sha256 of machine UUID (16 chars)
    agent_version TEXT        NOT NULL,
    batch         JSONB       NOT NULL    -- array of aggregated hour-bucket records
);

CREATE INDEX IF NOT EXISTS idx_telemetry_install
    ON telemetry_batches (install_id);
CREATE INDEX IF NOT EXISTS idx_telemetry_received
    ON telemetry_batches (received_at);

-- ── Normalized GPU health events ──────────────────────────────────────────────
-- One row per sanitized hour-bucket record.

CREATE TABLE IF NOT EXISTS gpu_health_hourly (
    id                BIGSERIAL   PRIMARY KEY,
    install_id        TEXT        NOT NULL,
    hour              BIGINT      NOT NULL,   -- unix epoch // 3600
    gpu_gen           TEXT        NOT NULL,   -- "h100-class", "b200-class", etc.
    n_samples         INT         NOT NULL,
    rtheta_mean       FLOAT,
    rtheta_std_mean   FLOAT,
    ecc_sbit_total    FLOAT       DEFAULT 0,
    ecc_dbit_any      BOOLEAN     DEFAULT FALSE,
    clock_eff_mean    FLOAT,
    alert_types       TEXT[],
    recovery_time_p50 FLOAT,
    created_at        TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_health_gpu_gen
    ON gpu_health_hourly (gpu_gen);
CREATE INDEX IF NOT EXISTS idx_health_hour
    ON gpu_health_hourly (hour);

-- ── Community benchmark view ──────────────────────────────────────────────────
-- This is the "give back" — exposed to agents as /v1/benchmarks?gpu_gen=h100-class
-- Aggregates the last 30 days per GPU generation, ignoring buckets with
-- fewer than 5 samples.

CREATE OR REPLACE VIEW community_benchmarks AS
SELECT
    gpu_gen,
    COUNT(DISTINCT install_id)                                 AS fleet_size,
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY rtheta_mean)  AS rtheta_p25,
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY rtheta_mean)  AS rtheta_p50,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY rtheta_mean)  AS rtheta_p75,
    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY rtheta_mean)  AS rtheta_p95,
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY clock_eff_mean)
        FILTER (WHERE clock_eff_mean IS NOT NULL)              AS clock_eff_p50,
    AVG(ecc_sbit_total)                                        AS avg_ecc_sbit_per_hour,
    -- fraction of hour buckets that saw at least one double-bit ECC error
    (COUNT(*) FILTER (WHERE ecc_dbit_any))::FLOAT / COUNT(*)   AS dbit_event_rate,
    MAX(created_at)                                            AS last_updated
FROM gpu_health_hourly
WHERE created_at > NOW() - INTERVAL '30 days'
  AND n_samples >= 5
GROUP BY gpu_gen;

-- ── Row-level security ────────────────────────────────────────────────────────
-- All writes go through the Edge Function (service role). Public can read benchmarks.
-- No policies are created here for either table.

ALTER TABLE telemetry_batches ENABLE ROW LEVEL SECURITY;
ALTER TABLE gpu_health_hourly ENABLE ROW LEVEL SECURITY;
|
|
63
|
+
|
|
64
|
+
-- Edge function uses service role key — full access
|
|
65
|
+
-- Public (agent GET /benchmarks) reads the view only, no direct table access
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# ── Stage 1: build ────────────────────────────────────────────────────────────
# Builds a wheel from the project sources; nothing from this stage except the
# wheel reaches the final image.
FROM python:3.12-slim AS builder

WORKDIR /build
COPY pyproject.toml README.md ./
COPY theta/ ./theta/

RUN pip install --no-cache-dir --upgrade pip --quiet \
    && pip install --no-cache-dir build --quiet \
    && python -m build --wheel --outdir /dist

# ── Stage 2: runtime ──────────────────────────────────────────────────────────
FROM python:3.12-slim AS runtime

LABEL org.opencontainers.image.title="Theta"
LABEL org.opencontainers.image.description="GPU thermal-power forensics agent"
LABEL org.opencontainers.image.licenses="MIT"
LABEL org.opencontainers.image.source="https://github.com/Asomisetty27/theta"

# Non-root user
RUN useradd --create-home --shell /bin/bash theta

WORKDIR /app
# A wildcard source requires a directory destination ending in "/".
COPY --from=builder /dist/*.whl ./
# --no-cache-dir keeps pip's download/wheel cache out of the runtime layer.
RUN pip install --no-cache-dir --quiet *.whl && rm *.whl

# Config and log dirs (writable by theta user)
RUN mkdir -p /home/theta/.theta /var/log/theta \
    && chown -R theta:theta /home/theta/.theta /var/log/theta

USER theta

# Prometheus metrics
EXPOSE 9101

# Defaults — override via env vars or command args
# NOTE(review): CMD below passes the same values explicitly; whether the CLI
# also reads these THETA_* variables is not visible from this file — confirm.
ENV THETA_INTERVAL=5 \
    THETA_PROMETHEUS_PORT=9101 \
    THETA_LOG=/var/log/theta/alerts.jsonl

ENTRYPOINT ["theta"]
CMD ["monitor", \
     "--interval", "5", \
     "--port", "9101", \
     "--log", "/var/log/theta/alerts.jsonl"]
|
runtheta-0.1.9/PKG-INFO
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: runtheta
|
|
3
|
+
Version: 0.1.9
|
|
4
|
+
Summary: GPU thermal-power forensics agent. Computes R_theta = ΔT/P in real time.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: httpx>=0.27
|
|
8
|
+
Requires-Dist: joblib>=1.3
|
|
9
|
+
Requires-Dist: numpy>=1.24
|
|
10
|
+
Requires-Dist: nvidia-ml-py>=12.0.0
|
|
11
|
+
Requires-Dist: prometheus-client>=0.20
|
|
12
|
+
Requires-Dist: questionary>=2.0
|
|
13
|
+
Requires-Dist: rich>=13.0
|
|
14
|
+
Requires-Dist: scikit-learn>=1.3
|
|
15
|
+
Requires-Dist: structlog>=24.0
|
|
16
|
+
Requires-Dist: typer>=0.12
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Theta
|
|
23
|
+
|
|
24
|
+
**GPU thermal-power forensics agent.** Computes `R_θ = ΔT / P` in real time from the telemetry your GPUs already expose (NVML via pynvml; no DCGM required). That ratio is the only signal that separates a busy-hot GPU from a failing-hot one — and no incumbent computes it.
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
theta_gpu_rtheta_cwatt{gpu_index="3"} 2.104 # zombie recovery — CUDA context stuck
|
|
28
|
+
theta_gpu_rtheta_cwatt{gpu_index="3"} 0.724 # under load — healthy
|
|
29
|
+
theta_gpu_rtheta_cwatt{gpu_index="3"} 1.281 # clean idle — normal
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## The problem
|
|
35
|
+
|
|
36
|
+
A GPU at 82°C could be:
|
|
37
|
+
- **Busy and healthy** — running a job at thermal equilibrium
|
|
38
|
+
- **Cooling path failing** — ambient temperature up, heatsink degrading
|
|
39
|
+
- **CUDA zombie** — process exited but context retained, drawing 31W at 0% utilization
|
|
40
|
+
|
|
41
|
+
`nvidia-smi`, DCGM, and Mission Control all expose T and P as separate fields. None of them divide the two. Theta does.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Quick start
|
|
46
|
+
|
|
47
|
+
### pip (single node, free forever)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install runtheta
|
|
51
|
+
theta setup # interactive wizard — 90 seconds to first R_θ reading
|
|
52
|
+
theta monitor # start monitoring
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Docker
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
docker run --gpus all -p 9101:9101 runtheta/agent:latest
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Docker Compose (agent + Prometheus + Grafana)
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git clone https://github.com/Asomisetty27/theta
|
|
65
|
+
cd theta
|
|
66
|
+
docker compose --profile metrics up
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Open `http://localhost:3000` — Grafana dashboard pre-provisioned, no setup required.
|
|
70
|
+
Login: `admin` / `theta`
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## How it works
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
GPU (pynvml)
|
|
78
|
+
→ T_junction, P_GPU, util, P-state every 5s
|
|
79
|
+
→ R_θ = (T_junction − T_ref) / P_GPU
|
|
80
|
+
→ 15s steady-state window (σ < 0.03 C/W)
|
|
81
|
+
→ Decision Tree classifier → {under_load, clean_idle, zombie_recovery, child_exit_recovery}
|
|
82
|
+
→ Rolling baseline + k·σ drift detector
|
|
83
|
+
→ Alert (stdout / Slack webhook / JSONL / Prometheus)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Virtual ambient** — `T_ref` is derived from the GPU's own stable idle windows. No thermocouple, no rack modification, no extra hardware.
|
|
87
|
+
|
|
88
|
+
**Steady-state filter** — classification only runs on stable windows. This raises Naive Bayes accuracy from 84% to 99.8% and eliminates transient false positives.
|
|
89
|
+
|
|
90
|
+
**Classifier** — Decision Tree trained on 4,570 rows of Stage 1 Tesla T4 data. 100% 5-fold CV accuracy on steady-state samples. Rules are human-readable and publishable:
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
IF R_θ ≤ 0.87 → under_load (n=963, conf=1.00)
|
|
94
|
+
IF R_θ > 0.87, P0 → zombie_recovery (n=584, conf=1.00) ← CUDA zombie
|
|
95
|
+
IF R_θ > 1.50, P8 → child_exit_recovery (n=696, conf=1.00)
|
|
96
|
+
ELSE → clean_idle / early recovery
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## CLI reference
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
theta setup Interactive wizard (run this first)
|
|
105
|
+
theta monitor Run agent — blocks until Ctrl+C
|
|
106
|
+
theta monitor --interval 2 Sample every 2s
|
|
107
|
+
theta monitor --gpus 0,1,3 Monitor specific GPUs
|
|
108
|
+
theta monitor --webhook <url> Send alerts to Slack / generic webhook
|
|
109
|
+
theta monitor --log alerts.jsonl Append alerts to JSONL file
|
|
110
|
+
theta monitor --port 9101 Prometheus metrics port (0 = disabled)
|
|
111
|
+
theta monitor --nb Use Naive Bayes instead of Decision Tree
|
|
112
|
+
theta baseline --gpu 0 Lock virtual ambient T_ref from idle window
|
|
113
|
+
theta baseline --gpu 0 --manual 24 Set T_ref manually (°C)
|
|
114
|
+
theta classify Snapshot classify all GPUs right now
|
|
115
|
+
theta serve --port 9101 Metrics export only (no stdout alerts)
|
|
116
|
+
theta train /path/data.csv Retrain bundled models from new data
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Prometheus metrics
|
|
122
|
+
|
|
123
|
+
| Metric | Type | Description |
|
|
124
|
+
|---|---|---|
|
|
125
|
+
| `theta_gpu_rtheta_cwatt` | gauge | R_θ (C/W) — the core signal |
|
|
126
|
+
| `theta_gpu_state_info` | gauge | Current classified state (label: `state`) |
|
|
127
|
+
| `theta_gpu_drift_sigma` | gauge | Deviation from baseline in σ units |
|
|
128
|
+
| `theta_gpu_temperature_celsius` | gauge | Junction temperature |
|
|
129
|
+
| `theta_gpu_power_watts` | gauge | GPU power consumption |
|
|
130
|
+
| `theta_gpu_utilization_ratio` | gauge | 0–1 utilization |
|
|
131
|
+
| `theta_gpu_perf_state` | gauge | P-state (0=max, 8=idle) |
|
|
132
|
+
| `theta_gpu_baseline_tref_celsius` | gauge | Virtual ambient T_ref |
|
|
133
|
+
| `theta_gpu_window_rtheta_std` | gauge | Steady-state window σ |
|
|
134
|
+
| `theta_gpu_alerts_total` | counter | Alerts (labels: `severity`, `state`) |
|
|
135
|
+
|
|
136
|
+
All metrics include a `gpu_index` label.
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Alert payload (webhook / JSONL)
|
|
141
|
+
|
|
142
|
+
Every alert includes full forensic context:
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"source": "theta",
|
|
147
|
+
"severity": "critical",
|
|
148
|
+
"gpu_index": 3,
|
|
149
|
+
"state": "zombie_recovery",
|
|
150
|
+
"prev_state": "under_load",
|
|
151
|
+
"rtheta": 1.541,
|
|
152
|
+
"rtheta_baseline": 0.724,
|
|
153
|
+
"drift_sigma": 4.2,
|
|
154
|
+
"confidence": 1.0,
|
|
155
|
+
"message": "[CRITICAL] GPU 3 — CUDA zombie detected. R_θ=1.541 at 0% utilisation. Action: release CUDA context.",
|
|
156
|
+
"context": {
|
|
157
|
+
"severity": "critical",
|
|
158
|
+
"duration_prev": 3842.1,
|
|
159
|
+
"history": [
|
|
160
|
+
{ "ts": 1748995200.1, "state": "under_load", "r": 0.721, "conf": 0.99 }
|
|
161
|
+
]
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Why not DCGM / Mission Control / Phaidra?
|
|
169
|
+
|
|
170
|
+
| Capability | DCGM | Mission Control | Phaidra | **Theta** |
|
|
171
|
+
|---|:---:|:---:|:---:|:---:|
|
|
172
|
+
| Computes R_θ | ✗ | ✗ | ✗ | **✓** |
|
|
173
|
+
| Separates busy-hot vs failing-hot | ✗ | ✗ | ✗ | **✓** |
|
|
174
|
+
| CUDA zombie detection | ✗ | ✗ | ✗ | **✓** |
|
|
175
|
+
| Drift detection (baseline + k·σ) | ✗ | ✗ | ◐ | **✓** |
|
|
176
|
+
| Virtual ambient (no hardware) | ✗ | ✗ | ✗ | **✓** |
|
|
177
|
+
| Serves neocloud / mixed fleets | ✓ | ✗ | ✗ | **✓** |
|
|
178
|
+
| Open-source agent | ✓ | ✗ | ✗ | **✓** |
|
|
179
|
+
|
|
180
|
+
Mission Control ships only on Blackwell DGX/GB200. Theta runs on any NVIDIA GPU reachable by pynvml.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Requirements
|
|
185
|
+
|
|
186
|
+
- Python 3.10+
|
|
187
|
+
- NVIDIA GPU with driver ≥ 450 (for pynvml)
|
|
188
|
+
- No DCGM required — pynvml only
|
|
189
|
+
|
|
190
|
+
For Docker: [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Retrain on your own data
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
theta train /path/to/measurements.csv
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
CSV schema: `phase, trial_second, rtheta_cwatt, power_w, util_pct, perf_state, ...`
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Research basis
|
|
205
|
+
|
|
206
|
+
- **F1** — R_θ separates idle (1.28 C/W) from load (0.72 C/W) with 77.9% margin, Tesla T4
|
|
207
|
+
- **F2** — Ambient sensitivity: 7.1%/°C at idle vs 2.0%/°C at load (3.5× difference)
|
|
208
|
+
- **F6** — CUDA zombie: same-process exit leaves GPU at P0 (~31W), invisible to utilization
|
|
209
|
+
|
|
210
|
+
Stage 1: 4,570 rows · Tesla T4 · E001–E004 · 9 child-exit trials
|
|
211
|
+
Stage 2 (in progress): Cal Poly DGX B200 AI Factory · E005–E008
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## License
|
|
216
|
+
|
|
217
|
+
MIT — free forever for single-node use.
|
|
218
|
+
|
|
219
|
+
Built at Cal Poly SLO · [asomisetty27@gmail.com](mailto:asomisetty27@gmail.com)
|