qcoder 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. qcoder/__init__.py +3 -0
  2. qcoder/__main__.py +6 -0
  3. qcoder/cli.py +116 -0
  4. qcoder/core/__init__.py +1 -0
  5. qcoder/core/context.py +16 -0
  6. qcoder/core/qasm2/__init__.py +1 -0
  7. qcoder/core/qasm2/adjoint_eligibility.py +128 -0
  8. qcoder/core/qasm2/mirror_build.py +234 -0
  9. qcoder/core/run_config.py +84 -0
  10. qcoder/core/schema.py +26 -0
  11. qcoder/engines/feature_extraction/adapters/__init__.py +1 -0
  12. qcoder/engines/feature_extraction/adapters/qiskit_intake.py +46 -0
  13. qcoder/engines/feature_extraction/extractor.py +43 -0
  14. qcoder/engines/feature_extraction/features/compute_v0.py +157 -0
  15. qcoder/engines/feature_extraction/features/schema_v0.py +84 -0
  16. qcoder/engines/feature_extraction/ir.py +41 -0
  17. qcoder/engines/feature_extraction/labeling.py +68 -0
  18. qcoder/engines/feature_extraction/parsers/__init__.py +21 -0
  19. qcoder/engines/feature_extraction/qasm2_regex_parser.py +184 -0
  20. qcoder/engines/feature_extraction/reps/cut_profile.py +106 -0
  21. qcoder/engines/feature_extraction/reps/depth.py +47 -0
  22. qcoder/engines/feature_extraction/reps/entangling_layers.py +57 -0
  23. qcoder/engines/feature_extraction/reps/gate_set_stats.py +82 -0
  24. qcoder/engines/feature_extraction/reps/interaction_graph.py +30 -0
  25. qcoder/engines/feature_extraction/reps/interaction_graph_metrics.py +113 -0
  26. qcoder/engines/feature_extraction/reps/spans.py +89 -0
  27. qcoder/engines/prediction_model/__init__.py +16 -0
  28. qcoder/engines/prediction_model/artifact.py +85 -0
  29. qcoder/engines/prediction_model/engine.py +209 -0
  30. qcoder/engines/prediction_model/models.py +62 -0
  31. qcoder/engines/prediction_model/policy.py +45 -0
  32. qcoder/engines/prediction_model/schema_alignment.py +41 -0
  33. qcoder/engines/quantumness/__init__.py +8 -0
  34. qcoder/engines/quantumness/scorer.py +254 -0
  35. qcoder/pipelines/analyze.py +131 -0
  36. qcoder/pipelines/batch.py +56 -0
  37. qcoder/tools/analyze.py +88 -0
  38. qcoder/tools/analyze_shot_scaling.py +239 -0
  39. qcoder/tools/batch.py +39 -0
  40. qcoder/tools/generate_corpus.py +491 -0
  41. qcoder/tools/harness.py +15 -0
  42. qcoder/tools/inspect_corpus_features.py +273 -0
  43. qcoder/tools/join_runs_features.py +252 -0
  44. qcoder/tools/mirror.py +15 -0
  45. qcoder/tools/predict_baseline.py +347 -0
  46. qcoder/tools/qr_dll_bootstrap.py +31 -0
  47. qcoder/tools/runner.py +15 -0
  48. qcoder/tools/runners/__init__.py +1 -0
  49. qcoder/tools/runners/quantum_rings/__init__.py +1 -0
  50. qcoder/tools/runners/quantum_rings/v12/__init__.py +1 -0
  51. qcoder/tools/runners/quantum_rings/v12/harness.py +1350 -0
  52. qcoder/tools/runners/quantum_rings/v12/mirror.py +459 -0
  53. qcoder/tools/runners/quantum_rings/v12/runner.py +549 -0
  54. qcoder/tools/train_baseline_models.py +619 -0
  55. qcoder/tools/validate_baseline.py +307 -0
  56. qcoder-0.1.0a0.dist-info/METADATA +86 -0
  57. qcoder-0.1.0a0.dist-info/RECORD +62 -0
  58. qcoder-0.1.0a0.dist-info/WHEEL +5 -0
  59. qcoder-0.1.0a0.dist-info/entry_points.txt +2 -0
  60. qcoder-0.1.0a0.dist-info/licenses/LICENSE +201 -0
  61. qcoder-0.1.0a0.dist-info/licenses/NOTICE +11 -0
  62. qcoder-0.1.0a0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,619 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import datetime as dt
5
+ import hashlib
6
+ import json
7
+ import math
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ try:
12
+ import numpy as np
13
+ except Exception: # pragma: no cover - fallback path
14
+ import jax.numpy as np # type: ignore
15
+
16
+
17
# Default joined-observation JSONL read by train_from_joined / --input.
DEFAULT_INPUT = Path("data/training/joined_observations_ok.jsonl")
# Directory that holds every default model artifact path below.
DEFAULT_OUTPUT_DIR = Path("data/artifacts/models/qr12/schema_0.4.0")
# Default artifact file for each trained model (one JSON document per model).
DEFAULT_RUNTIME_OUT = DEFAULT_OUTPUT_DIR / "runtime_ridge.json"
DEFAULT_BUILD_OUT = DEFAULT_OUTPUT_DIR / "build_ridge.json"
DEFAULT_FIDELITY_OUT = DEFAULT_OUTPUT_DIR / "fidelity_ridge.json"
DEFAULT_MEMORY_OUT = DEFAULT_OUTPUT_DIR / "memory_ridge.json"
23
+
24
+
25
def sha256_file(path: str | Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is read in 64 KiB pieces, so it never has to fit in memory.
    """
    digest = hashlib.sha256()
    with Path(path).open("rb") as stream:
        while True:
            block = stream.read(65536)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
31
+
32
+
33
def _now_utc() -> str:
    """Return the current UTC time as an ISO-8601 string without microseconds."""
    stamp = dt.datetime.now(dt.timezone.utc)
    whole_seconds = stamp.replace(microsecond=0)
    return whole_seconds.isoformat()
35
+
36
+
37
def _load_jsonl(path: str | Path) -> list[dict[str, Any]]:
    """Parse a UTF-8 JSON Lines file into a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, every other line is decoded with
    ``json.loads`` and kept in file order.
    """
    with Path(path).open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]
46
+
47
+
48
def _precision_is_double(precision: Any) -> float:
    """Encode a precision label as a 0/1 indicator feature.

    Returns 1.0 when the label, after trimming and lower-casing, is
    ``"double"`` or ``"fp64"``; any other value (including falsy ones)
    yields 0.0.
    """
    label = str(precision or "").strip().lower()
    if label in ("double", "fp64"):
        return 1.0
    return 0.0
51
+
52
+
53
def _safe_log2_threshold(x: Any) -> float:
    """Return ``log2(x)`` as a feature value, or 0.0 when it is not usable.

    0.0 is returned when *x* cannot be converted to a float, or when the
    converted value is non-positive, NaN, or infinite. Rejecting infinities
    keeps a non-finite value out of the design matrix, where it would turn
    the column mean and standard deviation into NaN/inf.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if not math.isfinite(value) or value <= 0.0:
        return 0.0
    return math.log2(value)
59
+
60
+
61
def _safe_log2_shots_runner(x: Any) -> float:
    """Return ``log2`` of the shot count, or 0.0 for a missing/non-positive count.

    The value is first coerced with ``_safe_int``; anything that fails the
    coercion or is not strictly positive maps to 0.0.
    """
    shots = _safe_int(x)
    if shots is not None and shots > 0:
        return math.log2(float(shots))
    return 0.0
66
+
67
+
68
def _content_hash_is_test(content_hash: str, test_fraction: float) -> bool:
    """Decide deterministically whether a content hash belongs to the test split.

    The first 8 hex digits of SHA-256(content_hash) are scaled by 0xFFFFFFFF
    to a number between 0 and 1; the hash is a test item when that number is
    strictly below *test_fraction*. An empty hash is never a test item.
    """
    if not content_hash:
        return False
    hex_prefix = hashlib.sha256(content_hash.encode("utf-8")).hexdigest()[:8]
    position = int(hex_prefix, 16) / float(0xFFFFFFFF)
    return position < test_fraction
74
+
75
+
76
def _safe_int(value: Any) -> int | None:
    """Convert *value* with ``int()``; return None if the conversion raises."""
    try:
        converted = int(value)
    except Exception:
        converted = None
    return converted
81
+
82
+
83
def _is_total_runtime_row(r: dict[str, Any]) -> bool:
    """Tell whether *r* is a forward-execution row usable as a total-runtime sample.

    The row must have ``run_kind == "forward_runner_execution"``, a non-None
    ``runner_wall_s`` and an integer-convertible ``shots_runner``. When
    ``shots_hist`` is an integer greater than 1, the row qualifies only if
    ``shots_runner`` equals it; otherwise it qualifies when ``shots_runner``
    is greater than 1.
    """
    is_forward = r.get("run_kind") == "forward_runner_execution"
    if not is_forward or r.get("runner_wall_s") is None:
        return False
    runner_shots = _safe_int(r.get("shots_runner"))
    if runner_shots is None:
        return False
    hist_shots = _safe_int(r.get("shots_hist"))
    if hist_shots is None or hist_shots <= 1:
        return runner_shots > 1
    return runner_shots == hist_shots
95
+
96
+
97
def _derive_execution_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Derive execution-time rows from ``forward_runner_execution`` rows.

    Model: total_runtime ≈ build_time + execution_time.

    Forward rows are grouped by (content_hash, backend_id, precision,
    threshold); rows whose threshold is not integer-convertible are ignored.
    Within a group, the wall time of a state row (``shots_runner == 1``) is
    taken as the build time, and every hist row (``shots_runner > 1``) with a
    parseable wall time yields one output row with

        execution_time_s = max(0, runner_wall_s_hist - runner_wall_s_state)

    plus the hist row's ``feature_names`` / ``features``.

    The build time comes from the first state row whose ``runner_wall_s`` is
    present and parseable, so one bad state row no longer discards a group
    that also contains a valid one. Groups without any usable state row are
    skipped.
    """

    def _wall_seconds(row: dict[str, Any]) -> float | None:
        # Parse runner_wall_s; None means "missing or not a number".
        raw = row.get("runner_wall_s")
        if raw is None:
            return None
        try:
            return float(raw)
        except (TypeError, ValueError):
            return None

    groups: dict[tuple[Any, ...], list[dict[str, Any]]] = {}
    for r in rows:
        if r.get("run_kind") != "forward_runner_execution":
            continue
        threshold = _safe_int(r.get("threshold"))
        if threshold is None:
            continue
        key = (
            str(r.get("content_hash") or ""),
            str(r.get("backend_id") or r.get("backend") or ""),
            # NOTE(review): a missing precision is grouped as "double" here,
            # while _precision_is_double treats a missing precision as
            # not-double on raw rows -- confirm which default is intended.
            str(r.get("precision") or "").strip() or "double",
            threshold,
        )
        groups.setdefault(key, []).append(r)

    out: list[dict[str, Any]] = []
    for (content_hash, backend_id, precision, threshold), group in groups.items():
        build_s = None
        for candidate in group:
            if _safe_int(candidate.get("shots_runner")) != 1:
                continue
            build_s = _wall_seconds(candidate)
            if build_s is not None:
                break
        if build_s is None:
            continue

        for hist_row in group:
            shots = _safe_int(hist_row.get("shots_runner"))
            if shots is None or shots <= 1:
                continue
            total_s = _wall_seconds(hist_row)
            if total_s is None:
                continue
            out.append({
                "content_hash": content_hash,
                "backend_id": backend_id,
                "precision": precision,
                "threshold": threshold,
                "shots_runner": shots,
                "execution_time_s": max(0.0, total_s - build_s),
                "feature_names": list(hist_row.get("feature_names") or []),
                "features": list(hist_row.get("features") or []),
            })
    return out
155
+
156
+
157
def _is_build_runtime_row(r: dict[str, Any]) -> bool:
    """Tell whether *r* is a 1-shot forward-execution row with a wall time.

    True only when ``run_kind`` is ``"forward_runner_execution"``,
    ``runner_wall_s`` is not None and ``shots_runner`` converts to exactly 1.
    """
    if r.get("run_kind") != "forward_runner_execution" or r.get("runner_wall_s") is None:
        return False
    return _safe_int(r.get("shots_runner")) == 1
164
+
165
+
166
def _is_memory_row(r: dict[str, Any]) -> bool:
    """Tell whether *r* is a total-runtime row that also carries ``peak_rss_mb``."""
    return _is_total_runtime_row(r) and r.get("peak_rss_mb") is not None
170
+
171
+
172
def _build_xy(
    rows: list[dict[str, Any]],
    *,
    for_runtime: bool,
    eps: float,
    runtime_selector: Any = None,
    include_log2_shots_runner: bool = False,
) -> tuple[np.ndarray, np.ndarray, list[str], np.ndarray]:
    """Build the design matrix and targets for a runtime or fidelity model.

    Row selection:
      * ``for_runtime=True``: rows accepted by *runtime_selector*, or, when no
        selector is given, forward-execution rows with a ``runner_wall_s``.
      * ``for_runtime=False``: ``mirror_threshold_attempt`` rows with a
        ``fidelity`` value.

    Each X row is ``[1.0, log2_threshold, precision_is_double,
    (log2_shots_runner,)] + features``. The training target is
    ``log1p(max(0, runner_wall_s))`` for runtime and the logit of the
    fidelity clipped to ``[eps, 1 - eps]`` otherwise.

    Returns:
        ``(X, y, feature_names, y_eval)`` where ``y_eval`` holds the
        untransformed targets (non-negative seconds, or raw fidelity).

    Raises:
        ValueError: no row is selected, the first selected row has no
            ``feature_names``, or a row's feature count differs from it.
    """

    def _wanted(row: dict[str, Any]) -> bool:
        if not for_runtime:
            return row.get("run_kind") == "mirror_threshold_attempt" and row.get("fidelity") is not None
        if runtime_selector is not None:
            return bool(runtime_selector(row))
        return row.get("run_kind") == "forward_runner_execution" and row.get("runner_wall_s") is not None

    selected = [row for row in rows if _wanted(row)]
    if not selected:
        raise ValueError("no rows selected for model training")

    # The first selected row defines the circuit feature layout for all rows.
    circuit_feature_names = list(selected[0].get("feature_names") or [])
    if not circuit_feature_names:
        raise ValueError("joined rows missing feature_names")

    feature_names = ["bias", "log2_threshold", "precision_is_double"]
    if include_log2_shots_runner:
        feature_names.append("log2_shots_runner")
    feature_names.extend(circuit_feature_names)

    expected_width = len(circuit_feature_names)
    design: list[list[float]] = []
    targets: list[float] = []
    eval_targets: list[float] = []

    for row in selected:
        values = list(row.get("features") or [])
        if len(values) != expected_width:
            raise ValueError("feature length mismatch in joined rows")

        vector = [
            1.0,
            _safe_log2_threshold(row.get("threshold")),
            _precision_is_double(row.get("precision")),
        ]
        if include_log2_shots_runner:
            vector.append(_safe_log2_shots_runner(row.get("shots_runner")))
        vector.extend(float(v) for v in values)
        design.append(vector)

        if for_runtime:
            seconds = max(0.0, float(row["runner_wall_s"]))
            targets.append(math.log1p(seconds))
            eval_targets.append(seconds)
        else:
            raw_fidelity = float(row["fidelity"])
            clipped = min(1.0 - eps, max(eps, raw_fidelity))
            targets.append(math.log(clipped / (1.0 - clipped)))
            eval_targets.append(raw_fidelity)

    return (
        np.asarray(design, dtype=float),
        np.asarray(targets, dtype=float),
        feature_names,
        np.asarray(eval_targets, dtype=float),
    )
235
+
236
+
237
def _build_xy_execution(
    execution_rows: list[dict[str, Any]],
) -> tuple[np.ndarray, np.ndarray, list[str], np.ndarray]:
    """Build X and y from derived execution rows; y = log1p(execution_time_s).

    Each X row is ``[1.0, log2_threshold, precision_is_double,
    log2_shots_runner] + features``.

    Returns:
        ``(X, y, feature_names, y_eval)`` where ``y_eval`` is the execution
        time in seconds clamped at zero.

    Raises:
        ValueError: *execution_rows* is empty, its first row has no
            ``feature_names``, or a row's feature count differs from it.
    """
    if not execution_rows:
        raise ValueError("no execution rows for runtime model training")

    circuit_feature_names = list(execution_rows[0].get("feature_names") or [])
    if not circuit_feature_names:
        raise ValueError("execution rows missing feature_names")

    feature_names = [
        "bias",
        "log2_threshold",
        "precision_is_double",
        "log2_shots_runner",
        *circuit_feature_names,
    ]

    expected_width = len(circuit_feature_names)
    design: list[list[float]] = []
    log_targets: list[float] = []
    second_targets: list[float] = []
    for row in execution_rows:
        values = list(row.get("features") or [])
        if len(values) != expected_width:
            raise ValueError("feature length mismatch in execution rows")
        design.append([
            1.0,
            _safe_log2_threshold(row.get("threshold")),
            _precision_is_double(row.get("precision")),
            _safe_log2_shots_runner(row.get("shots_runner")),
            *(float(v) for v in values),
        ])
        seconds = max(0.0, float(row["execution_time_s"]))
        log_targets.append(math.log1p(seconds))
        second_targets.append(seconds)

    x = np.asarray(design, dtype=float)
    y = np.asarray(log_targets, dtype=float)
    y_eval = np.asarray(second_targets, dtype=float)
    return x, y, feature_names, y_eval
270
+
271
+
272
def _build_xy_memory(
    rows: list[dict[str, Any]],
    *,
    selector: Any,
) -> tuple[np.ndarray, np.ndarray, list[str], np.ndarray]:
    """Build X and y for the peak-memory model; y = log1p(peak_rss_mb).

    Rows are kept when ``selector(row)`` is truthy. Each X row is
    ``[1.0, log2_threshold, precision_is_double] + features``.

    Returns:
        ``(X, y, feature_names, y_eval)``. ``feature_names`` names every
        column of X, including the three leading engineered columns, in the
        same way ``_build_xy`` does. ``y_eval`` is ``peak_rss_mb`` clamped
        at zero.

    Raises:
        ValueError: no row is selected, the first selected row has no
            ``feature_names``, or a row's feature count differs from it.
    """
    selected = [r for r in rows if bool(selector(r))]
    if not selected:
        raise ValueError("no rows selected for memory model training")

    circuit_feature_names = list(selected[0].get("feature_names") or [])
    if not circuit_feature_names:
        raise ValueError("joined rows missing feature_names")

    # One name per X column: the engineered columns come first.
    feature_names = ["bias", "log2_threshold", "precision_is_double"] + circuit_feature_names

    x_rows: list[list[float]] = []
    y_vals: list[float] = []
    y_eval_vals: list[float] = []
    for r in selected:
        feats = list(r.get("features") or [])
        if len(feats) != len(circuit_feature_names):
            raise ValueError("feature length mismatch in joined rows")
        x = [1.0, _safe_log2_threshold(r.get("threshold")), _precision_is_double(r.get("precision"))]
        x.extend(float(v) for v in feats)
        x_rows.append(x)

        y_mb = max(0.0, float(r["peak_rss_mb"]))
        y_vals.append(math.log1p(y_mb))
        y_eval_vals.append(y_mb)

    return (
        np.asarray(x_rows, dtype=float),
        np.asarray(y_vals, dtype=float),
        feature_names,
        np.asarray(y_eval_vals, dtype=float),
    )
307
+
308
+
309
def _split_indices(rows: list[dict[str, Any]], test_fraction: float) -> tuple[list[int], list[int]]:
    """Partition row positions into train/test lists by content hash.

    A row goes to the test list when ``_content_hash_is_test`` accepts its
    ``content_hash``. If that leaves no training rows, every row is used for
    training and the test list is empty.
    """
    is_test_flags = [
        _content_hash_is_test(str(row.get("content_hash") or ""), test_fraction)
        for row in rows
    ]
    train_idx = [pos for pos, is_test in enumerate(is_test_flags) if not is_test]
    if not train_idx:
        return list(range(len(rows))), []
    test_idx = [pos for pos, is_test in enumerate(is_test_flags) if is_test]
    return train_idx, test_idx
322
+
323
+
324
def _standardize_from_train(x: np.ndarray, train_idx: list[int]) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Standardize all rows of *x* using statistics of the training rows only.

    Column 0 is left untouched (mean forced to 0, std forced to 1), and any
    column whose training std is exactly 0 is divided by 1 instead.

    Returns:
        ``(x_scaled, x_mean, x_std)``.
    """
    train_part = x[train_idx]
    x_mean = np.mean(train_part, axis=0)
    x_std = np.std(train_part, axis=0)
    # Column 0 must pass through unchanged.
    x_mean[0] = 0.0
    x_std[0] = 1.0
    # Constant columns: avoid dividing by zero.
    x_std[x_std == 0.0] = 1.0
    return (x - x_mean) / x_std, x_mean, x_std
332
+
333
+
334
def _fit_ridge_closed_form(x_train: np.ndarray, y_train: np.ndarray, alpha: float) -> np.ndarray:
    """Solve ridge regression via the normal equations.

    Solves ``(X^T X + alpha * I') w = X^T y`` where ``I'`` is the identity
    with its [0, 0] entry zeroed, so the first coefficient is not penalized.
    If the direct solve raises, a least-squares solve of the same system is
    used instead.
    """
    width = int(x_train.shape[1])
    penalty = np.eye(width, dtype=float) * float(alpha)
    penalty[0, 0] = 0.0  # first column is the bias: leave it unregularized
    gram = x_train.T @ x_train
    moment = x_train.T @ y_train
    try:
        return np.linalg.solve(gram + penalty, moment)
    except Exception:
        return np.linalg.lstsq(gram + penalty, moment, rcond=None)[0]
345
+
346
+
347
def _runtime_metrics(y_log_true: np.ndarray, y_log_pred: np.ndarray, y_sec_true: np.ndarray) -> dict[str, float | None]:
    """Compute held-out metrics for a log1p-seconds model.

    Returns:
        A dict with ``rmse_log`` (RMSE in log1p space) and
        ``median_multiplicative_error_seconds`` (median of
        ``max(pred/true, true/pred)`` in seconds, with a 1e-9 offset on both
        sides). Both values are None when there are no held-out samples.
    """
    if y_log_true.size == 0:
        return {"rmse_log": None, "median_multiplicative_error_seconds": None}
    rmse_log = float(np.sqrt(np.mean((y_log_pred - y_log_true) ** 2)))
    # A negative log-space prediction maps to negative seconds under expm1,
    # which would make the ratio below negative; clamp to zero seconds so the
    # ratio always stays >= 1.
    y_sec_pred = np.maximum(np.expm1(y_log_pred), 0.0)
    eps = 1e-9
    ratios = np.maximum((y_sec_pred + eps) / (y_sec_true + eps), (y_sec_true + eps) / (y_sec_pred + eps))
    med_mul = float(np.median(ratios))
    return {"rmse_log": rmse_log, "median_multiplicative_error_seconds": med_mul}
356
+
357
+
358
def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``logaddexp`` so that
    large-magnitude negative inputs do not overflow ``exp``.
    """
    return np.exp(-np.logaddexp(0.0, -x))
360
+
361
+
362
def _fidelity_metrics(y_fid_true: np.ndarray, y_logit_pred: np.ndarray) -> dict[str, float | None]:
    """Compute the held-out mean absolute error of predicted fidelity.

    Logit predictions are passed through ``_sigmoid`` before comparison with
    the true fidelities. ``mae_fidelity`` is None when there are no samples.
    """
    if y_fid_true.size == 0:
        return {"mae_fidelity": None}
    abs_errors = np.abs(_sigmoid(y_logit_pred) - y_fid_true)
    return {"mae_fidelity": float(np.mean(abs_errors))}
368
+
369
+
370
def _memory_metrics(y_log_true: np.ndarray, y_log_pred: np.ndarray, y_mb_true: np.ndarray) -> dict[str, float | None]:
    """Compute held-out metrics for a log1p-megabytes model.

    Returns:
        A dict with ``rmse_log`` (RMSE in log1p space) and
        ``median_multiplicative_error_mb`` (median of
        ``max(pred/true, true/pred)`` in MB, with a 1e-9 offset on both
        sides). Both values are None when there are no held-out samples.
    """
    if y_log_true.size == 0:
        return {"rmse_log": None, "median_multiplicative_error_mb": None}
    rmse_log = float(np.sqrt(np.mean((y_log_pred - y_log_true) ** 2)))
    # A negative log-space prediction maps to negative MB under expm1, which
    # would make the ratio below negative; clamp to zero so the ratio always
    # stays >= 1.
    y_mb_pred = np.maximum(np.expm1(y_log_pred), 0.0)
    eps = 1e-9
    ratios = np.maximum((y_mb_pred + eps) / (y_mb_true + eps), (y_mb_true + eps) / (y_mb_pred + eps))
    med_mul = float(np.median(ratios))
    return {"rmse_log": rmse_log, "median_multiplicative_error_mb": med_mul}
379
+
380
+
381
def _build_artifact(
    *,
    schema_version: str,
    feature_names: list[str],
    coef: np.ndarray,
    x_mean: np.ndarray,
    x_std: np.ndarray,
    alpha: float,
    joined_input_path: str,
    joined_input_sha256: str,
    rows_total: int,
    rows_train: int,
    rows_test: int,
    model_kind: str,
    metrics: dict[str, float | None],
) -> dict[str, Any]:
    """Assemble the JSON-serializable artifact dict for one trained model.

    Arrays are converted to lists of Python floats, counts to ints, and the
    creation time is stamped with ``_now_utc()``. Training provenance and
    metrics are nested under ``"training_metadata"``.
    """
    training_metadata = {
        "model_kind": model_kind,
        "created_utc": _now_utc(),
        "joined_input_path": str(joined_input_path),
        "joined_input_sha256": joined_input_sha256,
        "rows_used": int(rows_total),
        "rows_train": int(rows_train),
        "rows_test": int(rows_test),
        "metrics": metrics,
    }
    artifact: dict[str, Any] = {
        "schema_version": schema_version,
        "feature_names": feature_names,
        "coef": list(map(float, coef.tolist())),
        "x_mean": list(map(float, x_mean.tolist())),
        "x_std": list(map(float, x_std.tolist())),
        "alpha": float(alpha),
        "training_metadata": training_metadata,
    }
    return artifact
415
+
416
+
417
def _held_out(values: np.ndarray, test_idx: list[int]) -> np.ndarray:
    """Return the held-out entries of *values*, or an empty float array."""
    return values[test_idx] if test_idx else np.asarray([], dtype=float)


def _fit_on_split(
    x: np.ndarray,
    y: np.ndarray,
    split_rows: list[dict[str, Any]],
    *,
    alpha: float,
    test_fraction: float,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, list[int], list[int], np.ndarray]:
    """Split by content hash, standardize on the train part, fit ridge, predict test.

    *split_rows* must be the rows X was built from, in the same order.
    Returns ``(coef, x_mean, x_std, train_idx, test_idx, y_pred_test)``.
    """
    train_idx, test_idx = _split_indices(split_rows, test_fraction)
    x_scaled, x_mean, x_std = _standardize_from_train(x, train_idx)
    coef = _fit_ridge_closed_form(x_scaled[train_idx], y[train_idx], alpha=float(alpha))
    y_pred_test = x_scaled[test_idx] @ coef if test_idx else np.asarray([], dtype=float)
    return coef, x_mean, x_std, train_idx, test_idx, y_pred_test


def train_from_joined(
    input_path: str | Path = DEFAULT_INPUT,
    *,
    runtime_out: str | Path = DEFAULT_RUNTIME_OUT,
    build_out: str | Path = DEFAULT_BUILD_OUT,
    fidelity_out: str | Path = DEFAULT_FIDELITY_OUT,
    memory_out: str | Path = DEFAULT_MEMORY_OUT,
    alpha: float = 1.0,
    eps: float = 1e-6,
    test_fraction: float = 0.2,
) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], dict[str, Any] | None]:
    """Train the baseline ridge models from a joined-observation JSONL file.

    Four models are fitted, each with a deterministic content-hash holdout:
    execution runtime, build time (1-shot forward rows), fidelity (mirror
    rows) and, when any row carries ``peak_rss_mb``, peak memory.

    Each artifact's ``feature_names`` is taken from the rows that produced
    its design matrix, so names and coefficients always line up even if the
    first row of the file has a different feature layout.

    Args:
        input_path: joined-observation JSONL file.
        runtime_out, build_out, fidelity_out, memory_out: artifact JSON paths;
            parent directories are created as needed.
        alpha: ridge L2 strength (the bias term is not penalized).
        eps: clipping epsilon for the fidelity logit transform.
        test_fraction: fraction threshold for the content-hash holdout.

    Returns:
        ``(runtime, build, fidelity, memory)`` artifact dicts; ``memory`` is
        None (and no memory file is written) when no memory rows exist.

    Raises:
        ValueError: the input has no rows, or a model has no usable rows or
            inconsistent features.
    """
    rows = _load_jsonl(input_path)
    if not rows:
        raise ValueError("no joined rows found")

    schema_version = str(rows[0].get("schema_version") or "")
    joined_sha = sha256_file(input_path)
    base_feature_names = ["bias", "log2_threshold", "precision_is_double"]

    # Execution-time model: total_runtime ≈ build_time + execution_time.
    # execution_time_s = max(0, runner_wall_s_hist - runner_wall_s_state)
    # per (content_hash, backend_id, precision, threshold).
    execution_rows = _derive_execution_rows(rows)
    x_rt, y_rt, rt_feature_names, y_rt_sec = _build_xy_execution(execution_rows)
    rt_coef, x_rt_mean, x_rt_std, rt_train_idx, rt_test_idx, y_rt_pred_test = _fit_on_split(
        x_rt, y_rt, execution_rows, alpha=alpha, test_fraction=test_fraction
    )
    rt_metrics = _runtime_metrics(
        _held_out(y_rt, rt_test_idx),
        y_rt_pred_test,
        _held_out(y_rt_sec, rt_test_idx),
    )
    runtime_artifact = _build_artifact(
        schema_version=schema_version,
        feature_names=rt_feature_names,
        coef=rt_coef,
        x_mean=x_rt_mean,
        x_std=x_rt_std,
        alpha=float(alpha),
        joined_input_path=str(input_path),
        joined_input_sha256=joined_sha,
        rows_total=len(execution_rows),
        rows_train=len(rt_train_idx),
        rows_test=len(rt_test_idx),
        model_kind="execution_runtime_ridge",
        metrics=rt_metrics,
    )

    # Build-time model (1-shot probe forward rows).
    x_bd, y_bd, bd_feature_names, y_bd_sec = _build_xy(
        rows, for_runtime=True, eps=eps, runtime_selector=_is_build_runtime_row
    )
    # Same selection and order as _build_xy used, so indices line up with X.
    bd_rows = [r for r in rows if _is_build_runtime_row(r)]
    bd_coef, x_bd_mean, x_bd_std, bd_train_idx, bd_test_idx, y_bd_pred_test = _fit_on_split(
        x_bd, y_bd, bd_rows, alpha=alpha, test_fraction=test_fraction
    )
    bd_metrics = _runtime_metrics(
        _held_out(y_bd, bd_test_idx),
        y_bd_pred_test,
        _held_out(y_bd_sec, bd_test_idx),
    )
    build_artifact = _build_artifact(
        schema_version=schema_version,
        feature_names=bd_feature_names,
        coef=bd_coef,
        x_mean=x_bd_mean,
        x_std=x_bd_std,
        alpha=float(alpha),
        joined_input_path=str(input_path),
        joined_input_sha256=joined_sha,
        rows_total=len(bd_rows),
        rows_train=len(bd_train_idx),
        rows_test=len(bd_test_idx),
        model_kind="build_ridge_log1p",
        metrics=bd_metrics,
    )

    # Fidelity model.
    x_fd, y_fd, fd_feature_names, y_fd_true = _build_xy(rows, for_runtime=False, eps=eps)
    fd_rows = [
        r
        for r in rows
        if r.get("run_kind") == "mirror_threshold_attempt" and r.get("fidelity") is not None
    ]
    fd_coef, x_fd_mean, x_fd_std, fd_train_idx, fd_test_idx, y_fd_pred_test = _fit_on_split(
        x_fd, y_fd, fd_rows, alpha=alpha, test_fraction=test_fraction
    )
    fd_metrics = _fidelity_metrics(_held_out(y_fd_true, fd_test_idx), y_fd_pred_test)
    fidelity_artifact = _build_artifact(
        schema_version=schema_version,
        feature_names=fd_feature_names,
        coef=fd_coef,
        x_mean=x_fd_mean,
        x_std=x_fd_std,
        alpha=float(alpha),
        joined_input_path=str(input_path),
        joined_input_sha256=joined_sha,
        rows_total=len(fd_rows),
        rows_train=len(fd_train_idx),
        rows_test=len(fd_test_idx),
        model_kind="fidelity_ridge_logit",
        metrics=fd_metrics,
    )

    # Memory model (optional; uses total-runtime row selection).
    memory_artifact: dict[str, Any] | None = None
    mem_rows = [r for r in rows if _is_memory_row(r)]
    if mem_rows:
        x_mb, y_mb, _, y_mb_true = _build_xy_memory(rows, selector=_is_memory_row)
        # Name every X column from the first memory row, which is the row
        # _build_xy_memory uses to define the circuit feature layout.
        mb_feature_names = base_feature_names + list(mem_rows[0].get("feature_names") or [])
        mb_coef, x_mb_mean, x_mb_std, mb_train_idx, mb_test_idx, y_mb_pred_test = _fit_on_split(
            x_mb, y_mb, mem_rows, alpha=alpha, test_fraction=test_fraction
        )
        mb_metrics = _memory_metrics(
            _held_out(y_mb, mb_test_idx),
            y_mb_pred_test,
            _held_out(y_mb_true, mb_test_idx),
        )
        memory_artifact = _build_artifact(
            schema_version=schema_version,
            feature_names=mb_feature_names,
            coef=mb_coef,
            x_mean=x_mb_mean,
            x_std=x_mb_std,
            alpha=float(alpha),
            joined_input_path=str(input_path),
            joined_input_sha256=joined_sha,
            rows_total=len(mem_rows),
            rows_train=len(mb_train_idx),
            rows_test=len(mb_test_idx),
            model_kind="memory_ridge_log1p_mb",
            metrics=mb_metrics,
        )

    runtime_out = Path(runtime_out)
    build_out = Path(build_out)
    fidelity_out = Path(fidelity_out)
    memory_out = Path(memory_out)
    for target in (runtime_out, build_out, fidelity_out, memory_out):
        target.parent.mkdir(parents=True, exist_ok=True)
    runtime_out.write_text(json.dumps(runtime_artifact, indent=2, sort_keys=True), encoding="utf-8")
    build_out.write_text(json.dumps(build_artifact, indent=2, sort_keys=True), encoding="utf-8")
    fidelity_out.write_text(json.dumps(fidelity_artifact, indent=2, sort_keys=True), encoding="utf-8")
    if memory_artifact is not None:
        memory_out.write_text(json.dumps(memory_artifact, indent=2, sort_keys=True), encoding="utf-8")
    return runtime_artifact, build_artifact, fidelity_artifact, memory_artifact
572
+
573
+
574
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse options, train all models, report results.

    Returns 0 after training; one status line is printed per written model,
    or an info line when the memory model was skipped.
    """
    parser = argparse.ArgumentParser(description="Train baseline runtime/fidelity ridge regressors from joined observations.")

    path_options = (
        ("--input", DEFAULT_INPUT, "Joined observation JSONL path."),
        ("--runtime-out", DEFAULT_RUNTIME_OUT, "Runtime artifact JSON path."),
        ("--build-out", DEFAULT_BUILD_OUT, "Build-time artifact JSON path."),
        ("--fidelity-out", DEFAULT_FIDELITY_OUT, "Fidelity artifact JSON path."),
        ("--memory-out", DEFAULT_MEMORY_OUT, "Memory artifact JSON path (optional if labels exist)."),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, default=str(default_path), help=help_text)

    float_options = (
        ("--alpha", 1.0, "Ridge L2 strength."),
        ("--test-fraction", 0.2, "Deterministic content-hash holdout fraction."),
        ("--eps", 1e-6, "Clipping epsilon for fidelity logit transform."),
    )
    for flag, default_value, help_text in float_options:
        parser.add_argument(flag, type=float, default=default_value, help=help_text)

    args = parser.parse_args(argv)

    rt, bd, fd, mb = train_from_joined(
        input_path=args.input,
        runtime_out=args.runtime_out,
        build_out=args.build_out,
        fidelity_out=args.fidelity_out,
        memory_out=args.memory_out,
        alpha=args.alpha,
        eps=args.eps,
        test_fraction=args.test_fraction,
    )

    written = (
        ("runtime", args.runtime_out, rt),
        ("build-time", args.build_out, bd),
        ("fidelity", args.fidelity_out, fd),
    )
    for label, out_path, artifact in written:
        print(
            f"[ok] wrote {label} model: {out_path} (rows={artifact['training_metadata']['rows_used']})",
            flush=True,
        )

    if mb is None:
        print("[info] skipped memory model: no eligible rows with peak_rss_mb labels", flush=True)
    else:
        print(
            f"[ok] wrote memory model: {args.memory_out} (rows={mb['training_metadata']['rows_used']})",
            flush=True,
        )
    return 0
616
+
617
+
618
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())