open-research-protocol 0.4.7 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +15 -0
  2. package/cli/orp.py +1158 -43
  3. package/docs/AGENT_LOOP.md +3 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  5. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  6. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  7. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  8. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  9. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  10. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  11. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  12. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  13. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  14. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  15. package/docs/ORP_YOUTUBE_INSPECT.md +97 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  25. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  26. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  27. package/examples/README.md +2 -0
  28. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  29. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  30. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  31. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  32. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  33. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  34. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  35. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  36. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  37. package/llms.txt +3 -0
  38. package/package.json +4 -1
  39. package/scripts/orp-kernel-agent-pilot.py +673 -0
  40. package/scripts/orp-kernel-agent-replication.py +307 -0
  41. package/scripts/orp-kernel-benchmark.py +471 -2
  42. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  43. package/scripts/orp-kernel-ci-check.py +138 -0
  44. package/scripts/orp-kernel-comparison.py +592 -0
  45. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  46. package/scripts/orp-kernel-pickup.py +401 -0
  47. package/spec/v1/kernel-extension.schema.json +96 -0
  48. package/spec/v1/kernel-proposal.schema.json +115 -0
  49. package/spec/v1/kernel.schema.json +2 -1
  50. package/spec/v1/youtube-source.schema.json +151 -0
@@ -0,0 +1,381 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import importlib.util
6
+ import json
7
+ from pathlib import Path
8
+ import platform
9
+ import re
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ import time
14
+ from typing import Any
15
+
16
+
17
# Repository root, taken as the directory two levels above this resolved file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Script that is loaded at runtime for case loading, artifact rendering and
# the Codex invocation helpers used below.
AGENT_PILOT = REPO_ROOT.joinpath("scripts", "orp-kernel-agent-pilot.py")
# JSON document whose "cases" object holds the expected task fields per case id.
EXPECTED_TASKS = REPO_ROOT.joinpath("examples", "kernel", "comparison", "next-task-continuation.json")
# Artifact renderings that are compared against each other, in evaluation order.
CONDITIONS = ["freeform", "generic_checklist", "kernel"]
# Fields every generated task artifact is scored on.
TASK_FIELDS = ["object", "goal", "boundary", "constraints", "success_criteria"]
# Words ignored when comparing an answer with the expected text.
STOPWORDS = set(
    (
        "a an and the to of for in on with without "
        "is are be by or as that this it into from "
        "than at all one through"
    ).split()
)
27
+
28
+
29
def _load_agent_pilot():
    """Load the agent-pilot script from its file path and return the module.

    The script's filename contains hyphens, so it is loaded through
    ``importlib.util`` from ``AGENT_PILOT`` rather than with an ``import``
    statement. The module body is executed as part of loading.

    Raises:
        RuntimeError: if no import spec or loader can be created for the path.
    """
    module_spec = importlib.util.spec_from_file_location("orp_kernel_agent_pilot_runtime", AGENT_PILOT)
    loader = module_spec.loader if module_spec is not None else None
    if loader is None:
        raise RuntimeError(f"failed to load agent pilot from {AGENT_PILOT}")
    pilot = importlib.util.module_from_spec(module_spec)
    loader.exec_module(pilot)
    return pilot


# Loaded once at import time; the helpers below call into this module.
AGENT_PILOT_MODULE = _load_agent_pilot()
39
+
40
+
41
def _read_json(path: Path) -> Any:
    """Read *path* as UTF-8 text and return the decoded JSON value."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)
43
+
44
+
45
def _load_expected_tasks() -> dict[str, dict[str, str]]:
    """Return the expected task fields for every case, keyed by case id.

    Reads ``EXPECTED_TASKS`` and takes its top-level ``"cases"`` object.
    Entries whose value is not an object are skipped; all keys and values of
    the remaining entries are coerced with ``str``.

    Raises:
        RuntimeError: if ``"cases"`` is absent or is not an object.
    """
    document = _read_json(EXPECTED_TASKS)
    raw_cases = document.get("cases")
    if not isinstance(raw_cases, dict):
        raise RuntimeError(f"expected task continuation cases missing: {EXPECTED_TASKS}")

    expected: dict[str, dict[str, str]] = {}
    for case_id, fields in raw_cases.items():
        if not isinstance(fields, dict):
            continue
        expected[str(case_id)] = {str(name): str(text) for name, text in fields.items()}
    return expected


# Loaded once at import time and used as the scoring reference.
EXPECTED_TASK_MAP = _load_expected_tasks()
54
+
55
+
56
def _response_schema() -> dict[str, Any]:
    """Build the JSON schema written out for each Codex run.

    The schema describes a closed object (no additional properties) in which
    every property is required: ``artifact_class`` (constant ``"task"``),
    ``confidence`` (number), ``missing_required_fields`` (array of strings)
    and one string-or-null property for each entry of ``TASK_FIELDS``.
    """
    fixed_properties: dict[str, Any] = {
        "artifact_class": {"type": "string", "const": "task"},
        "confidence": {"type": "number"},
        "missing_required_fields": {"type": "array", "items": {"type": "string"}},
    }
    required = [*fixed_properties, *TASK_FIELDS]
    # Each task field gets its own dict so the properties never share state.
    task_properties = {name: {"type": ["string", "null"]} for name in TASK_FIELDS}
    return {
        "type": "object",
        "additionalProperties": False,
        "properties": {**fixed_properties, **task_properties},
        "required": required,
    }
72
+
73
+
74
def _build_prompt(case: dict[str, Any], condition: str) -> str:
    """Compose the prompt asking for a task artifact derived from *case*.

    The source artifact text comes from the agent-pilot module's
    ``_render_artifact(case, condition)``. The instructions list the required
    task fields and tell the model to null out, and declare as missing, any
    field the artifact does not support.
    """
    rendered = AGENT_PILOT_MODULE._render_artifact(case, condition)
    required_fields = ", ".join(TASK_FIELDS)
    segments = [
        "You are simulating a fresh downstream Codex session that must convert the source artifact into the next canonical task artifact.\n",
        "Using only the artifact below, produce a JSON object for a kernel task artifact.\n",
        f"The task artifact must include these required fields: {required_fields}.\n",
        "Use a string only when the source artifact gives enough support to carry the field forward safely into a task artifact.\n",
        "If the source artifact does not support a required field strongly enough, set that field to null and include it in `missing_required_fields`.\n",
        "Do not invent unsupported constraints, boundaries, or success criteria.\n\n",
        f"Artifact:\n{rendered}\n",
    ]
    return "".join(segments)
86
+
87
+
88
def _run_codex(case: dict[str, Any], condition: str, *, model: str) -> dict[str, Any]:
    """Run one ``codex exec`` session for *case* under *condition*.

    A temporary directory holds the response schema and the output file. The
    prompt is handed to the agent-pilot module's ``_run_codex_exec`` as
    ``stdin``, and the output file is parsed before the directory is removed.

    Args:
        case: Case record; ``case["id"]`` is used in the error message.
        condition: Artifact rendering to evaluate.
        model: Model override; an empty string omits ``--model``.

    Returns:
        A dict with ``raw_response`` (parsed output JSON), ``elapsed_ms``
        (wall time of the exec call, rounded to 3 decimals), ``session_id``
        and ``tokens_used`` (both extracted from the process stdout by the
        agent-pilot module).

    Raises:
        RuntimeError: if the process exits with a non-zero return code.
    """
    prompt = _build_prompt(case, condition)
    with tempfile.TemporaryDirectory(prefix="orp-kernel-canonical-continuation.") as scratch:
        workdir = Path(scratch)
        schema_file = workdir / "schema.json"
        output_file = workdir / "out.json"
        schema_file.write_text(json.dumps(_response_schema(), indent=2) + "\n", encoding="utf-8")

        command = [
            "codex",
            "exec",
            "--ephemeral",
            "--skip-git-repo-check",
            "-C",
            str(workdir),
            "--output-schema",
            str(schema_file),
            "-o",
            str(output_file),
        ]
        if model:
            command += ["--model", model]
        # NOTE(review): the trailing "-" presumably tells codex to read the
        # prompt from stdin — confirm against the codex CLI.
        command.append("-")

        t0 = time.perf_counter()
        completed = AGENT_PILOT_MODULE._run_codex_exec(command, cwd=REPO_ROOT, stdin=prompt)
        elapsed_ms = round((time.perf_counter() - t0) * 1000.0, 3)

        if completed.returncode != 0:
            raise RuntimeError(
                f"codex exec failed for case={case['id']} condition={condition}\n"
                f"stdout:\n{completed.stdout}\n\nstderr:\n{completed.stderr}"
            )

        # Must be read while the temporary directory still exists.
        parsed = _read_json(output_file)
        return {
            "raw_response": parsed,
            "elapsed_ms": elapsed_ms,
            "session_id": AGENT_PILOT_MODULE._extract_session_id(completed.stdout),
            "tokens_used": AGENT_PILOT_MODULE._extract_tokens_used(completed.stdout),
        }
125
+
126
+
127
def _tokenize(value: str) -> set[str]:
    """Return the distinct content words of *value*.

    The text is lower-cased and split into runs of ``a-z`` / ``0-9``; runs of
    one or two characters and entries of ``STOPWORDS`` are discarded.
    """
    tokens: set[str] = set()
    for word in re.findall(r"[a-z0-9]+", value.lower()):
        if len(word) <= 2 or word in STOPWORDS:
            continue
        tokens.add(word)
    return tokens
130
+
131
+
132
def _similarity(answer: str | None, expected: str) -> float:
    """Return the share of expected tokens that also occur in *answer*.

    The result is ``|answer ∩ expected| / |expected|`` over the token sets
    from ``_tokenize``, rounded to 3 decimals. It is ``0.0`` when *answer* is
    empty or ``None`` or when either side has no tokens.
    """
    if not answer:
        return 0.0
    got = _tokenize(answer)
    wanted = _tokenize(expected)
    if not (got and wanted):
        return 0.0
    shared = got.intersection(wanted)
    return round(len(shared) / len(wanted), 3)
141
+
142
+
143
def _score_case(case: dict[str, Any], response: dict[str, Any]) -> dict[str, Any]:
    """Score one model response against the expected task for *case*.

    For each entry of ``TASK_FIELDS`` the answer is normalised (stripped
    string, or ``None`` when absent, non-string or blank) and compared with
    the expected text via ``_similarity``. A field counts as aligned when its
    similarity is at least 0.45, and as misaligned when it was answered but
    falls below that threshold.

    Derived metrics, each rounded to 3 decimals:
        * ``alignment_score``: aligned fields / number of task fields.
        * ``invention_rate``: misaligned fields / answered fields (0.0 when
          nothing was answered).
        * ``missing_list_match``: share of the unanswered fields that the
          response also listed in ``missing_required_fields`` (1.0 when every
          field was answered).
        * ``total_score``: mean of ``alignment_score``,
          ``1 - invention_rate`` and ``missing_list_match``.

    Raises:
        KeyError: if the case id or a task field has no expected entry, or
            the response lacks ``confidence``.
    """
    reference = EXPECTED_TASK_MAP[case["id"]]

    answers: dict[str, str | None] = {}
    field_similarity: dict[str, float] = {}
    for name in TASK_FIELDS:
        raw = response.get(name)
        text = raw.strip() if isinstance(raw, str) else ""
        answers[name] = text or None
        field_similarity[name] = _similarity(answers[name], reference[name])

    answered = [name for name, text in answers.items() if text is not None]
    aligned = sum(1 for name in TASK_FIELDS if field_similarity[name] >= 0.45)
    misaligned = sum(1 for name in answered if field_similarity[name] < 0.45)

    alignment_score = round(aligned / len(TASK_FIELDS), 3)
    if answered:
        invention_rate = round(misaligned / len(answered), 3)
    else:
        invention_rate = 0.0

    missing_declared = set(response.get("missing_required_fields", []))
    # "Expected" here means the fields this response itself left unanswered.
    missing_expected = {name for name, text in answers.items() if text is None}
    if missing_expected:
        missing_list_match = round(len(missing_declared & missing_expected) / len(missing_expected), 3)
    else:
        missing_list_match = 1.0

    total_score = round((alignment_score + (1.0 - invention_rate) + missing_list_match) / 3.0, 3)
    return {
        "answers": answers,
        "expected": reference,
        "field_similarity": field_similarity,
        "aligned_fields": aligned,
        "alignment_score": alignment_score,
        "misaligned_fields": misaligned,
        "invention_rate": invention_rate,
        "missing_declared": sorted(missing_declared),
        "missing_expected": sorted(missing_expected),
        "missing_list_match": missing_list_match,
        "total_score": total_score,
        "confidence": response["confidence"],
    }
179
+
180
+
181
def _evaluate_case(case: dict[str, Any], *, model: str) -> dict[str, Any]:
    """Run and score *case* under every entry of ``CONDITIONS``.

    Returns the case's ``id``, ``domain``, ``artifact_class`` and ``prompt``
    together with a ``conditions`` mapping that holds, per condition, the raw
    response, its score and the run's timing/session/token data.
    """
    per_condition: dict[str, Any] = {}
    for name in CONDITIONS:
        run = _run_codex(case, name, model=model)
        raw = run["raw_response"]
        entry = {"response": raw, "score": _score_case(case, raw)}
        for key in ("elapsed_ms", "session_id", "tokens_used"):
            entry[key] = run[key]
        per_condition[name] = entry

    summary = {key: case[key] for key in ("id", "domain", "artifact_class", "prompt")}
    summary["conditions"] = per_condition
    return summary
200
+
201
+
202
def _mean(values: list[float]) -> float:
    """Return the arithmetic mean of *values* rounded to 3 decimals (0.0 if empty)."""
    if not values:
        return 0.0
    return round(sum(values) / len(values), 3)
204
+
205
+
206
def _aggregate(cases: list[dict[str, Any]], condition: str) -> dict[str, Any]:
    """Summarise all evaluated *cases* for a single *condition*.

    Returns one row per case (identity plus its scores, answers and per-field
    similarity) and the mean, via ``_mean``, of total score, alignment score,
    invention rate, missing-list match, confidence and elapsed milliseconds.
    """
    entries = [case["conditions"][condition] for case in cases]
    scores = [entry["score"] for entry in entries]

    copied_score_keys = (
        "total_score",
        "alignment_score",
        "invention_rate",
        "missing_list_match",
        "answers",
        "field_similarity",
    )
    rows = [
        {
            "id": case["id"],
            "domain": case["domain"],
            "artifact_class": case["artifact_class"],
            **{key: score[key] for key in copied_score_keys},
        }
        for case, score in zip(cases, scores)
    ]

    def mean_of(key: str) -> float:
        return _mean([score[key] for score in scores])

    return {
        "condition": condition,
        "cases_total": len(rows),
        "rows": rows,
        "mean_total_score": mean_of("total_score"),
        "mean_alignment_score": mean_of("alignment_score"),
        "mean_invention_rate": mean_of("invention_rate"),
        "mean_missing_list_match": mean_of("missing_list_match"),
        "mean_confidence": mean_of("confidence"),
        "mean_elapsed_ms": _mean([entry["elapsed_ms"] for entry in entries]),
    }
247
+
248
+
249
def _pairwise(cases: list[dict[str, Any]], left: str, right: str) -> dict[str, Any]:
    """Compare two conditions case by case on ``total_score``.

    For every case the delta ``left - right`` is rounded to 3 decimals. A
    positive delta is a win for *left*, a negative one a loss, anything else
    a tie. The mean delta is reported alongside the counts.
    """
    def total(case: dict[str, Any], condition: str) -> float:
        return case["conditions"][condition]["score"]["total_score"]

    deltas = [round(total(case, left) - total(case, right), 3) for case in cases]
    wins = sum(1 for delta in deltas if delta > 0)
    losses = sum(1 for delta in deltas if delta < 0)
    return {
        "left": left,
        "right": right,
        "wins": wins,
        "ties": len(deltas) - wins - losses,
        "losses": losses,
        "mean_total_score_delta": _mean(deltas),
    }
273
+
274
+
275
def _git_output(*git_args: str) -> str:
    """Return the stripped stdout of ``git <git_args>`` run in ``REPO_ROOT``.

    Falls back to ``"unknown"`` when git cannot be started or exits non-zero,
    for example when the package is not inside a git checkout.
    """
    try:
        completed = subprocess.run(
            ["git", *git_args],
            cwd=str(REPO_ROOT),
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return "unknown"
    return completed.stdout.strip()


def _gather_metadata(model: str) -> dict[str, Any]:
    """Collect provenance metadata for the benchmark report.

    Records the UTC generation time, git commit and branch, the ``version``
    from ``package.json``, the Python and Codex versions, the platform
    string and the model name (``"default"`` when *model* is empty).

    Git provenance is best-effort and is reported as ``"unknown"`` when it
    cannot be determined. This function is called only after every live
    evaluation has finished, so a missing git checkout must not discard the
    results.

    Raises:
        OSError, KeyError, json.JSONDecodeError: if ``package.json`` cannot
            be read or lacks ``version``.
        subprocess.CalledProcessError: if ``codex --version`` fails.
    """
    package_version = json.loads((REPO_ROOT / "package.json").read_text(encoding="utf-8"))["version"]
    commit = _git_output("rev-parse", "HEAD")
    branch = _git_output("rev-parse", "--abbrev-ref", "HEAD")
    # Codex stays a hard requirement: the benchmark has just invoked it.
    codex_version = subprocess.run(
        ["codex", "--version"],
        cwd=str(REPO_ROOT),
        capture_output=True,
        text=True,
        check=True,
    ).stdout.strip()
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": commit,
        "repo_branch": branch,
        "package_version": package_version,
        "python_version": sys.version.split()[0],
        "codex_version": codex_version,
        "platform": platform.platform(),
        "model": model or "default",
    }
290
+
291
+
292
def build_report(*, model: str, case_ids: set[str] | None = None) -> dict[str, Any]:
    """Evaluate the comparison cases and assemble the benchmark report.

    Args:
        model: Codex model override; an empty string uses the default.
        case_ids: When non-empty, only cases whose ``id`` is in this set are
            evaluated; otherwise every case from the agent-pilot module's
            ``_load_cases()`` is used.

    Returns:
        The report dict: schema/kind markers, run metadata, corpus summary,
        per-condition aggregates, pairwise comparisons, the three claims with
        a ``pass``/``fail`` status each, and a summary of headline numbers.

    Raises:
        RuntimeError: if no case remains after filtering.
    """
    selected = AGENT_PILOT_MODULE._load_cases()
    if case_ids:
        selected = [case for case in selected if case["id"] in case_ids]
    if not selected:
        raise RuntimeError("no comparison cases matched the requested ids")

    evaluated = [_evaluate_case(case, model=model) for case in selected]
    conditions = {name: _aggregate(evaluated, name) for name in CONDITIONS}
    pairwise = {
        "kernel_vs_generic_checklist": _pairwise(evaluated, "kernel", "generic_checklist"),
        "kernel_vs_freeform": _pairwise(evaluated, "kernel", "freeform"),
    }

    kernel = conditions["kernel"]
    checklist = conditions["generic_checklist"]
    freeform = conditions["freeform"]

    def status(passed: bool) -> str:
        return "pass" if passed else "fail"

    # Kernel must at least match the checklist on mean score, never lose a
    # case to it, and not invent more.
    matches_checklist = (
        kernel["mean_total_score"] >= checklist["mean_total_score"]
        and pairwise["kernel_vs_generic_checklist"]["losses"] == 0
        and kernel["mean_invention_rate"] <= checklist["mean_invention_rate"]
    )
    # Kernel must strictly beat free-form on mean score and never lose a case.
    beats_freeform = (
        kernel["mean_total_score"] > freeform["mean_total_score"]
        and pairwise["kernel_vs_freeform"]["losses"] == 0
    )
    # Kernel must have the lowest (or tied-lowest) mean invention rate.
    least_invention = (
        kernel["mean_invention_rate"] <= checklist["mean_invention_rate"]
        and kernel["mean_invention_rate"] <= freeform["mean_invention_rate"]
    )

    claims = [
        {
            "id": "kernel_outscores_generic_checklist_on_canonical_task_continuation",
            "claim": "On the matched live canonical-task continuation benchmark, kernel artifacts produce task artifacts that meet or exceed generic checklist quality without a higher invention rate.",
            "status": status(matches_checklist),
        },
        {
            "id": "kernel_outscores_freeform_on_canonical_task_continuation",
            "claim": "On the matched live canonical-task continuation benchmark, kernel artifacts produce stronger next-task artifacts than free-form artifacts.",
            "status": status(beats_freeform),
        },
        {
            "id": "kernel_minimizes_invention_on_canonical_task_continuation",
            "claim": "On the matched live canonical-task continuation benchmark, kernel artifacts minimize unsupported task-field invention.",
            "status": status(least_invention),
        },
    ]

    metadata = _gather_metadata(model)
    corpus = {
        "cases_total": len(evaluated),
        "domains": sorted({case["domain"] for case in evaluated}),
        "artifact_classes": sorted({case["artifact_class"] for case in evaluated}),
    }
    summary = {
        "all_claims_pass": all(claim["status"] == "pass" for claim in claims),
        "kernel_mean_total_score": kernel["mean_total_score"],
        "generic_checklist_mean_total_score": checklist["mean_total_score"],
        "freeform_mean_total_score": freeform["mean_total_score"],
        "kernel_mean_invention_rate": kernel["mean_invention_rate"],
        "generic_checklist_mean_invention_rate": checklist["mean_invention_rate"],
        "freeform_mean_invention_rate": freeform["mean_invention_rate"],
    }
    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_canonical_continuation_report",
        "metadata": metadata,
        "corpus": corpus,
        "conditions": conditions,
        "pairwise": pairwise,
        "claims": claims,
        "summary": summary,
    }
353
+
354
+
355
def main() -> int:
    """Command-line entry point for the canonical-continuation benchmark.

    Builds the report, optionally writes it to ``--out`` (a relative path is
    taken relative to ``REPO_ROOT``; parent directories are created), and
    always prints it to stdout.

    Returns:
        ``0`` when every claim in the report passes, otherwise ``1``.
    """
    cli = argparse.ArgumentParser(
        description="Run a live Codex canonical-task continuation benchmark across free-form, generic checklist, and kernel artifacts."
    )
    cli.add_argument("--out", default="", help="Optional JSON output path")
    cli.add_argument("--model", default="", help="Optional Codex model override")
    cli.add_argument(
        "--case-id",
        action="append",
        default=[],
        help="Optional case id to evaluate (repeatable). Default: all cases.",
    )
    options = cli.parse_args()

    # An empty selection is passed as None so that every case is evaluated.
    requested_ids = set(options.case_id) or None
    report = build_report(model=options.model, case_ids=requested_ids)
    rendered = json.dumps(report, indent=2) + "\n"

    if options.out:
        destination = Path(options.out)
        if not destination.is_absolute():
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")

    print(rendered, end="")
    if report["summary"]["all_claims_pass"]:
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+ import sys
8
+ from typing import Any
9
+
10
+
11
# Default location of each committed benchmark artifact, keyed by the name
# that also becomes the command-line option (underscores become hyphens).
DEFAULTS = dict(
    comparison="docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json",
    pickup="docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json",
    agent_pilot="docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json",
    replication="docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json",
    canonical_continuation="docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json",
)
18
+
19
+
20
def _load_json(path: Path) -> dict[str, Any]:
    """Read the UTF-8 JSON file at *path* and return its top-level object.

    Raises:
        RuntimeError: if the document's root is not a JSON object.
    """
    document = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise RuntimeError(f"benchmark root must be an object: {path}")
25
+
26
+
27
def _summary_score(payload: dict[str, Any], key: str) -> float:
    """Return the numeric ``summary[key]`` value of a benchmark report.

    Args:
        payload: Parsed benchmark report.
        key: Name of the metric inside the report's ``summary`` object.

    Returns:
        The metric as a ``float``.

    Raises:
        RuntimeError: if ``summary`` is missing or not an object, or if the
            metric is missing or is not an int/float. Booleans are rejected
            as non-numeric.
    """
    summary = payload.get("summary")
    if not isinstance(summary, dict):
        raise RuntimeError("benchmark summary is missing")
    value = summary.get(key)
    # bool is a subclass of int, so a JSON true/false would otherwise pass
    # the numeric check and be compared as 1.0/0.0.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        raise RuntimeError(f"benchmark summary key is missing or non-numeric: {key}")
    return float(value)
35
+
36
+
37
def _claim_failed(payload: dict[str, Any]) -> list[str]:
    """Return the ids of all claims in *payload* that did not pass.

    A claim passes when its ``status``, stripped and lower-cased, equals
    ``"pass"``. A failing claim is reported by its stripped ``id``, or as
    ``"(unknown)"`` when the id is absent or blank.

    The check fails closed on malformed input: a missing or non-list
    ``claims`` value is reported as ``"(claims missing)"`` and a claim row
    that is not an object as ``"(malformed claim)"``. An empty ``claims``
    list still yields no failures.
    """
    claims = payload.get("claims")
    if not isinstance(claims, list):
        return ["(claims missing)"]

    failures: list[str] = []
    for row in claims:
        if not isinstance(row, dict):
            failures.append("(malformed claim)")
            continue
        if str(row.get("status", "")).strip().lower() != "pass":
            failures.append(str(row.get("id", "(unknown)")).strip() or "(unknown)")
    return failures
48
+
49
+
50
def evaluate(paths: dict[str, Path]) -> tuple[bool, list[str]]:
    """Check the committed benchmark artifacts against the CI thresholds.

    Args:
        paths: Locations of the ``comparison``, ``pickup``, ``agent_pilot``,
            ``replication`` and ``canonical_continuation`` reports.

    Returns:
        ``(ok, messages)``. ``ok`` is true when no check failed. ``messages``
        lists the satisfied checks first, then each failed check prefixed
        with ``"FAIL: "``.

    Raises:
        RuntimeError: from the loaders when a report or a summary metric
            that is actually read is malformed.
    """
    comparison = _load_json(paths["comparison"])
    pickup = _load_json(paths["pickup"])
    agent_pilot = _load_json(paths["agent_pilot"])
    replication = _load_json(paths["replication"])
    canonical = _load_json(paths["canonical_continuation"])

    passed: list[str] = []
    failed: list[str] = []

    def record(ok: bool, message: str) -> None:
        if ok:
            passed.append(message)
        else:
            failed.append(message)

    def strictly_ordered(report: dict[str, Any], keys: tuple[str, ...], *, descending: bool) -> bool:
        # Behaves like a chained comparison: metrics are read left to right
        # and reading stops at the first pair that is out of order.
        previous = _summary_score(report, keys[0])
        for key in keys[1:]:
            current = _summary_score(report, key)
            in_order = previous > current if descending else previous < current
            if not in_order:
                return False
            previous = current
        return True

    # Claims recorded inside the reports themselves. The canonical
    # continuation report's claims are not inspected here.
    claim_checks = [
        ("comparison claims pass", _claim_failed(comparison)),
        ("pickup claims pass", _claim_failed(pickup)),
        ("agent pilot claims pass", _claim_failed(agent_pilot)),
        ("replication claims pass", _claim_failed(replication)),
    ]
    for label, failed_ids in claim_checks:
        record(not failed_ids, f"{label} ({', '.join(failed_ids) or 'ok'})")

    total_keys = (
        "kernel_mean_total_score",
        "generic_checklist_mean_total_score",
        "freeform_mean_total_score",
    )
    pickup_keys = (
        "kernel_mean_pickup_score",
        "generic_checklist_mean_pickup_score",
        "freeform_mean_pickup_score",
    )
    invention_keys = (
        "kernel_mean_invention_rate",
        "generic_checklist_mean_invention_rate",
        "freeform_mean_invention_rate",
    )

    record(
        strictly_ordered(comparison, total_keys, descending=True),
        "comparison summary preserves kernel > checklist > free-form",
    )
    record(
        strictly_ordered(pickup, pickup_keys, descending=True),
        "pickup summary preserves kernel > checklist > free-form",
    )
    record(
        strictly_ordered(agent_pilot, pickup_keys, descending=True),
        "agent pilot summary preserves kernel > checklist > free-form",
    )
    record(
        strictly_ordered(replication, pickup_keys, descending=True),
        "replication summary preserves kernel > checklist > free-form",
    )

    kernel_invention = _summary_score(replication, invention_keys[0])
    record(
        kernel_invention <= _summary_score(replication, invention_keys[1])
        and kernel_invention <= _summary_score(replication, invention_keys[2]),
        "replication keeps kernel invention at or below the other conditions",
    )

    record(
        strictly_ordered(canonical, total_keys, descending=True),
        "canonical continuation keeps kernel > checklist > free-form on mean total score",
    )
    record(
        strictly_ordered(canonical, invention_keys, descending=False),
        "canonical continuation keeps kernel < checklist < free-form on invention rate",
    )

    report_lines = passed + [f"FAIL: {message}" for message in failed]
    return (not failed, report_lines)
119
+
120
+
121
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the benchmark CI check.

    One ``--<name>`` option is registered per entry of ``DEFAULTS``
    (underscores in the name become hyphens). Each path is resolved with
    ``Path.resolve()``, so relative paths are taken relative to the current
    working directory. Every message from ``evaluate`` is printed on its own
    line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use the
            process arguments.

    Returns:
        ``0`` when every check passes, otherwise ``1``.
    """
    cli = argparse.ArgumentParser(description="Check committed ORP kernel benchmark artifacts against CI safety thresholds.")
    for name, default_path in DEFAULTS.items():
        flag = f"--{name.replace('_', '-')}"
        cli.add_argument(flag, default=default_path, help=f"Path to the {name} benchmark JSON")
    options = cli.parse_args(argv)

    resolved: dict[str, Path] = {}
    for name in DEFAULTS:
        resolved[name] = Path(getattr(options, name)).resolve()

    ok, messages = evaluate(resolved)
    for line in messages:
        print(line)
    if ok:
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())