open-research-protocol 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +9 -0
  2. package/cli/orp.py +668 -43
  3. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  5. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  6. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  7. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  8. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  9. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  10. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  11. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  12. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  13. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  14. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  15. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  25. package/examples/README.md +2 -0
  26. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  27. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  28. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  29. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  30. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  31. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  32. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  33. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  34. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  35. package/package.json +4 -1
  36. package/scripts/orp-kernel-agent-pilot.py +673 -0
  37. package/scripts/orp-kernel-agent-replication.py +307 -0
  38. package/scripts/orp-kernel-benchmark.py +471 -2
  39. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  40. package/scripts/orp-kernel-ci-check.py +138 -0
  41. package/scripts/orp-kernel-comparison.py +592 -0
  42. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  43. package/scripts/orp-kernel-pickup.py +401 -0
  44. package/spec/v1/kernel-extension.schema.json +96 -0
  45. package/spec/v1/kernel-proposal.schema.json +115 -0
  46. package/spec/v1/kernel.schema.json +2 -1
package/scripts/orp-kernel-agent-pilot.py
@@ -0,0 +1,673 @@
+ #!/usr/bin/env python3
+ from __future__ import annotations
+
+ import argparse
+ import json
+ from pathlib import Path
+ import platform
+ import re
+ import subprocess
+ import sys
+ import tempfile
+ import time
+ from typing import Any
+
+
+ REPO_ROOT = Path(__file__).resolve().parents[1]
+ COMPARISON_CORPUS = REPO_ROOT / "examples" / "kernel" / "comparison" / "comparison-corpus.json"
+ KERNEL_SCHEMA = REPO_ROOT / "spec" / "v1" / "kernel.schema.json"
+ CONDITIONS = ["freeform", "generic_checklist", "kernel"]
+ TRANSIENT_CODEX_FAILURE_SNIPPETS = [
+     "We're currently experiencing high demand",
+     "unexpected status 401 Unauthorized: Missing bearer or basic authentication in header",
+     "failed to connect to websocket",
+     "Warning: no last agent message; wrote empty content",
+ ]
+
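+ # Maps each kernel field to the free-form labels that count as naming it
+ # explicitly when "Label: value" lines are parsed out of free-form markdown.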
+ FREEFORM_LABEL_ALIASES: dict[str, set[str]] = {
+     "artifact_type": {"artifact type", "type"},
+     "object": {"object"},
+     "goal": {"goal"},
+     "boundary": {"boundary", "scope"},
+     "constraints": {"constraints", "constraint"},
+     "success_criteria": {"success criteria", "success", "done when"},
+     "question": {"question"},
+     "chosen_path": {"decision", "chosen path", "recommendation"},
+     "rejected_alternatives": {"rejected alternatives", "alternatives"},
+     "rationale": {"why", "rationale"},
+     "consequences": {"consequences", "tradeoffs", "trade-offs"},
+     "claim": {"claim"},
+     "assumptions": {"assumptions"},
+     "test_path": {"test", "test path"},
+     "falsifiers": {"falsifiers", "would fail if"},
+     "objective": {"objective"},
+     "method": {"method"},
+     "inputs": {"inputs"},
+     "outputs": {"outputs", "evidence"},
+     "evidence_expectations": {"evidence expectations", "evidence"},
+     "interpretation_limits": {"limits", "interpretation limits"},
+     "completed_unit": {"completed", "completed unit"},
+     "current_state": {"current state"},
+     "risks": {"risks", "risk"},
+     "next_handoff_target": {"next", "next handoff target", "handoff"},
+     "artifact_refs": {"artifact refs", "artifacts", "references"},
+     "scope": {"scope"},
+     "rule": {"rule"},
+     "invariants": {"invariants"},
+     "enforcement_surface": {"enforcement", "enforcement surface"},
+     "evidence_paths": {"evidence", "evidence paths"},
+     "status": {"status"},
+     "next_follow_up": {"next follow up", "next follow-up", "next"},
+ }
+
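+ # Maps each artifact class's required kernel fields to generic-checklist keys.
+ # Several kernel fields share one checklist key (e.g. "object" and "goal" both
+ # map to "summary"), so the checklist carries coarser structure than the kernel.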
+ CHECKLIST_FIELD_MAP: dict[str, dict[str, str]] = {
+     "task": {
+         "object": "summary",
+         "goal": "summary",
+         "boundary": "scope",
+         "constraints": "constraints",
+         "success_criteria": "checks",
+     },
+     "decision": {
+         "question": "summary",
+         "chosen_path": "approach",
+         "rejected_alternatives": "notes",
+         "rationale": "notes",
+         "consequences": "risks",
+     },
+     "hypothesis": {
+         "claim": "summary",
+         "boundary": "scope",
+         "assumptions": "notes",
+         "test_path": "checks",
+         "falsifiers": "risks",
+     },
+     "experiment": {
+         "objective": "summary",
+         "method": "approach",
+         "inputs": "scope",
+         "outputs": "checks",
+         "evidence_expectations": "evidence",
+         "interpretation_limits": "risks",
+     },
+     "checkpoint": {
+         "completed_unit": "summary",
+         "current_state": "notes",
+         "risks": "risks",
+         "next_handoff_target": "handoff",
+         "artifact_refs": "evidence",
+     },
+     "policy": {
+         "scope": "scope",
+         "rule": "summary",
+         "rationale": "notes",
+         "invariants": "constraints",
+         "enforcement_surface": "checks",
+     },
+     "result": {
+         "claim": "summary",
+         "evidence_paths": "evidence",
+         "status": "checks",
+         "interpretation_limits": "risks",
+         "next_follow_up": "handoff",
+     },
+ }
+
+
+ def _read_json(path: Path) -> Any:
+     return json.loads(path.read_text(encoding="utf-8"))
+
+
+ def _load_cases() -> list[dict[str, Any]]:
+     payload = _read_json(COMPARISON_CORPUS)
+     cases = payload.get("cases", [])
+     if not isinstance(cases, list) or not cases:
+         raise RuntimeError(f"comparison corpus has no cases: {COMPARISON_CORPUS}")
+     return cases
+
+
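+ # The per-class required-field lists are not hard-coded: they are derived from
+ # the allOf/if/then clauses of spec/v1/kernel.schema.json at import time.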
+ def _load_kernel_requirements() -> dict[str, list[str]]:
+     payload = _read_json(KERNEL_SCHEMA)
+     out: dict[str, list[str]] = {}
+     for clause in payload.get("allOf", []):
+         if not isinstance(clause, dict):
+             continue
+         const = (
+             clause.get("if", {})
+             .get("properties", {})
+             .get("artifact_class", {})
+             .get("const")
+         )
+         required = clause.get("then", {}).get("required")
+         if isinstance(const, str) and isinstance(required, list):
+             out[const] = [str(x) for x in required if isinstance(x, str)]
+     return out
+
+
+ KERNEL_REQUIREMENTS = _load_kernel_requirements()
+
+
+ def _render_artifact(case: dict[str, Any], condition: str) -> str:
+     if condition == "freeform":
+         return case["freeform_markdown"].strip()
+     if condition == "generic_checklist":
+         return json.dumps(case["generic_checklist"], indent=2)
+     if condition == "kernel":
+         return json.dumps(case["kernel_artifact"], indent=2)
+     raise RuntimeError(f"unsupported condition: {condition}")
+
+
+ def _response_schema() -> dict[str, Any]:
+     return {
+         "type": "object",
+         "additionalProperties": False,
+         "properties": {
+             "artifact_type_guess": {"type": "string"},
+             "primary_objective_or_state": {"type": "string"},
+             "limits_or_risks": {"type": "array", "items": {"type": "string"}},
+             "next_action_or_handoff": {"type": "string"},
+             "confidence": {"type": "number"},
+             "ambiguities": {"type": "array", "items": {"type": "string"}},
+             "pickup_targets": {
+                 "type": "array",
+                 "items": {
+                     "type": "object",
+                     "additionalProperties": False,
+                     "properties": {
+                         "field": {"type": "string"},
+                         "value": {"type": ["string", "null"]},
+                     },
+                     "required": ["field", "value"],
+                 },
+             },
+         },
+         "required": [
+             "artifact_type_guess",
+             "primary_objective_or_state",
+             "limits_or_risks",
+             "next_action_or_handoff",
+             "confidence",
+             "ambiguities",
+             "pickup_targets",
+         ],
+     }
+
+
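+ # The prompt instructs the downstream session to answer a required field only
+ # when the artifact states it explicitly and to return null otherwise; the
+ # invention-rate metric below relies on that contract.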
+ def _build_prompt(case: dict[str, Any], condition: str) -> str:
+     required_fields = KERNEL_REQUIREMENTS[case["artifact_class"]]
+     artifact = _render_artifact(case, condition)
+     target_list = ", ".join(required_fields)
+     return (
+         "You are simulating a fresh downstream Codex session with no repo context.\n"
+         "Using only the artifact below, recover the required artifact fields for handoff.\n"
+         "Return JSON matching the provided schema.\n"
+         f"In `pickup_targets`, include one entry for each of these required fields: {target_list}.\n"
+         "Each entry must have `field` and `value` keys.\n"
+         "For each required field, use a string only when the artifact makes that field explicit enough "
+         "to carry forward directly into a canonical artifact. If the artifact does not make it explicit, use null.\n"
+         "A value counts as explicit only when the artifact states it directly as a dedicated field, statement, or close field-level synonym.\n"
+         "If the value would require synthesis across multiple hints, extrapolation from likely intent, or filling in a structurally missing field, use null.\n"
+         "Do not infer missing values from general world knowledge. Do not invent missing structure from likely intent.\n\n"
+         f"Artifact:\n{artifact}\n"
+     )
+
+
+ def _run_cmd(
+     args: list[str],
+     *,
+     cwd: Path,
+     stdin: str | None = None,
+     timeout_seconds: int | None = None,
+ ) -> subprocess.CompletedProcess[str]:
+     return subprocess.run(
+         args,
+         cwd=str(cwd),
+         capture_output=True,
+         text=True,
+         input=stdin,
+         check=False,
+         timeout=timeout_seconds,
+     )
+
+
+ def _is_transient_codex_failure(proc: subprocess.CompletedProcess[str]) -> bool:
+     combined = f"{proc.stdout}\n{proc.stderr}"
+     return any(snippet in combined for snippet in TRANSIENT_CODEX_FAILURE_SNIPPETS)
+
+
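+ # Retry wrapper around `codex exec`: up to `attempts` tries, retrying timeouts
+ # and the transient failure snippets above with capped exponential backoff
+ # (1s, 2s, 4s, ..., at most 30s between tries).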
+ def _run_codex_exec(
+     args: list[str],
+     *,
+     cwd: Path,
+     stdin: str,
+     attempts: int = 6,
+     timeout_seconds: int = 600,
+ ) -> subprocess.CompletedProcess[str]:
+     last_proc: subprocess.CompletedProcess[str] | None = None
+     for attempt in range(1, attempts + 1):
+         try:
+             proc = _run_cmd(args, cwd=cwd, stdin=stdin, timeout_seconds=timeout_seconds)
+         except subprocess.TimeoutExpired as exc:
+             if attempt == attempts:
+                 raise RuntimeError(
+                     f"codex exec timed out after {timeout_seconds}s on attempt {attempt}/{attempts}"
+                 ) from exc
+             time.sleep(float(min(30, 2 ** (attempt - 1))))
+             continue
+         if proc.returncode == 0:
+             return proc
+         last_proc = proc
+         if attempt == attempts or not _is_transient_codex_failure(proc):
+             return proc
+         time.sleep(float(min(30, 2 ** (attempt - 1))))
+     if last_proc is None:
+         raise RuntimeError("codex exec produced no process result")
+     return last_proc
+
+
+ def _extract_session_id(stdout: str) -> str:
+     match = re.search(r"session id:\s*([a-z0-9-]+)", stdout, re.IGNORECASE)
+     return match.group(1) if match else ""
+
+
+ def _extract_tokens_used(stdout: str) -> int | None:
+     match = re.search(r"tokens used\s*\n([0-9,]+)", stdout, re.IGNORECASE)
+     if not match:
+         return None
+     return int(match.group(1).replace(",", ""))
+
+
+ def _normalize_label(value: str) -> str:
+     return re.sub(r"\s+", " ", value.strip().lower())
+
+
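+ # Recovers "Label: value" pairs from free-form markdown and maps recognized
+ # labels onto kernel fields via FREEFORM_LABEL_ALIASES; only fields recovered
+ # this way count as explicitly present in the freeform condition.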
+ def _extract_freeform_answers(body: str) -> dict[str, str]:
+     answers: dict[str, str] = {}
+     for raw_line in body.splitlines():
+         line = raw_line.strip()
+         if not line:
+             continue
+         match = re.match(r"^[#>*\-\s]*([A-Za-z][A-Za-z \-_/]+):\s*(.+?)\s*$", raw_line)
+         if not match:
+             continue
+         label = _normalize_label(match.group(1))
+         value = match.group(2).strip()
+         if not value:
+             continue
+         for field, aliases in FREEFORM_LABEL_ALIASES.items():
+             if label in aliases:
+                 answers[field] = value
+     return answers
+
+
+ def _value_present(value: Any) -> bool:
+     if isinstance(value, str):
+         return bool(value.strip())
+     if isinstance(value, list):
+         if not value:
+             return False
+         return all(isinstance(item, str) and item.strip() for item in value)
+     return False
+
+
+ def _expected_explicit_fields(case: dict[str, Any], condition: str) -> set[str]:
+     targets = set(KERNEL_REQUIREMENTS[case["artifact_class"]])
+     if condition == "kernel":
+         return set(targets)
+     if condition == "generic_checklist":
+         checklist = case["generic_checklist"]
+         field_map = CHECKLIST_FIELD_MAP[case["artifact_class"]]
+         return {
+             field
+             for field in targets
+             if _value_present(checklist.get(field_map.get(field, "")))
+         }
+     if condition == "freeform":
+         parsed = _extract_freeform_answers(case["freeform_markdown"])
+         return {field for field in targets if parsed.get(field)}
+     raise RuntimeError(f"unsupported condition: {condition}")
+
+
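+ # One pickup trial: runs `codex exec` from an ephemeral temp dir with no repo
+ # context, constrains the reply to _response_schema() via --output-schema, and
+ # reads the structured answer back from the -o output file.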
+ def _run_codex_pickup(case: dict[str, Any], condition: str, *, model: str) -> dict[str, Any]:
+     prompt = _build_prompt(case, condition)
+     with tempfile.TemporaryDirectory(prefix="orp-kernel-agent-pilot.") as td:
+         root = Path(td)
+         schema_path = root / "schema.json"
+         out_path = root / "out.json"
+         schema_path.write_text(json.dumps(_response_schema(), indent=2) + "\n", encoding="utf-8")
+
+         args = [
+             "codex",
+             "exec",
+             "--ephemeral",
+             "--skip-git-repo-check",
+             "-C",
+             str(root),
+             "--output-schema",
+             str(schema_path),
+             "-o",
+             str(out_path),
+         ]
+         if model:
+             args.extend(["--model", model])
+         args.append("-")
+
+         started = time.perf_counter()
+         proc = _run_codex_exec(args, cwd=REPO_ROOT, stdin=prompt)
+         elapsed_ms = round((time.perf_counter() - started) * 1000.0, 3)
+         if proc.returncode != 0:
+             raise RuntimeError(
+                 f"codex exec failed for case={case['id']} condition={condition}\n"
+                 f"stdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
+             )
+         payload = _read_json(out_path)
+         return {
+             "raw_response": payload,
+             "elapsed_ms": elapsed_ms,
+             "session_id": _extract_session_id(proc.stdout),
+             "tokens_used": _extract_tokens_used(proc.stdout),
+         }
+
+
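+ # Scoring: pickup_score is the fraction of required fields answered with a
+ # non-empty string; invention_rate is the fraction of answered fields that the
+ # source artifact never made explicit (per _expected_explicit_fields).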
+ def _score_pickup(case: dict[str, Any], response: dict[str, Any]) -> dict[str, Any]:
+     targets = KERNEL_REQUIREMENTS[case["artifact_class"]]
+     pickup_target_entries = response.get("pickup_targets", [])
+     pickup_targets: dict[str, Any] = {}
+     if isinstance(pickup_target_entries, list):
+         for entry in pickup_target_entries:
+             if not isinstance(entry, dict):
+                 continue
+             field = entry.get("field")
+             if isinstance(field, str):
+                 pickup_targets[field] = entry.get("value")
+     answered = 0
+     answers: dict[str, str | None] = {}
+     expected_present = _expected_explicit_fields(case, response.get("_condition", "kernel"))
+     for field in targets:
+         value = pickup_targets.get(field)
+         normalized = value.strip() if isinstance(value, str) and value.strip() else None
+         answers[field] = normalized
+         if normalized is not None:
+             answered += 1
+     invented_fields = [
+         field
+         for field, value in answers.items()
+         if value is not None and field not in expected_present
+     ]
+     pickup_score = round(answered / len(targets), 3)
+     invention_rate = round(len(invented_fields) / answered, 3) if answered else 0.0
+     return {
+         "pickup_targets": targets,
+         "expected_present_fields": sorted(expected_present),
+         "answers": answers,
+         "answered_targets": answered,
+         "pickup_targets_total": len(targets),
+         "pickup_score": pickup_score,
+         "missing_targets": [field for field, value in answers.items() if value is None],
+         "invented_fields": invented_fields,
+         "invented_fields_count": len(invented_fields),
+         "invention_rate": invention_rate,
+         "ambiguity_remaining": round(1.0 - pickup_score, 3),
+         "confidence": response["confidence"],
+         "ambiguities_count": len(response["ambiguities"]),
+     }
+
+
+ def _evaluate_case(case: dict[str, Any], *, model: str) -> dict[str, Any]:
+     conditions: dict[str, Any] = {}
+     for condition in CONDITIONS:
+         result = _run_codex_pickup(case, condition, model=model)
+         score = _score_pickup(case, {**result["raw_response"], "_condition": condition})
+         conditions[condition] = {
+             "response": result["raw_response"],
+             "score": score,
+             "elapsed_ms": result["elapsed_ms"],
+             "session_id": result["session_id"],
+             "tokens_used": result["tokens_used"],
+         }
+     return {
+         "id": case["id"],
+         "domain": case["domain"],
+         "artifact_class": case["artifact_class"],
+         "prompt": case["prompt"],
+         "conditions": conditions,
+     }
+
+
+ def _mean(values: list[float]) -> float:
+     return round(sum(values) / len(values), 3) if values else 0.0
+
+
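+ # Per-condition aggregation: one row per case plus corpus-level means for
+ # pickup, ambiguity, confidence, invention rate, latency, and token usage.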
+ def _aggregate(cases: list[dict[str, Any]], condition: str) -> dict[str, Any]:
+     rows: list[dict[str, Any]] = []
+     pickup_scores: list[float] = []
+     ambiguity: list[float] = []
+     confidence: list[float] = []
+     ambiguity_counts: list[float] = []
+     invention_rates: list[float] = []
+     elapsed: list[float] = []
+     tokens: list[float] = []
+     answered_rates: list[float] = []
+     for case in cases:
+         row = case["conditions"][condition]
+         score = row["score"]
+         rows.append(
+             {
+                 "id": case["id"],
+                 "domain": case["domain"],
+                 "artifact_class": case["artifact_class"],
+                 "pickup_score": score["pickup_score"],
+                 "ambiguity_remaining": score["ambiguity_remaining"],
+                 "answered_targets": score["answered_targets"],
+                 "pickup_targets_total": score["pickup_targets_total"],
+                 "expected_present_fields": score["expected_present_fields"],
+                 "answers": score["answers"],
+                 "invented_fields": score["invented_fields"],
+                 "invention_rate": score["invention_rate"],
+                 "artifact_type_guess": row["response"]["artifact_type_guess"],
+                 "confidence": score["confidence"],
+                 "ambiguities_count": score["ambiguities_count"],
+                 "elapsed_ms": row["elapsed_ms"],
+                 "tokens_used": row["tokens_used"],
+                 "session_id": row["session_id"],
+             }
+         )
+         pickup_scores.append(score["pickup_score"])
+         ambiguity.append(score["ambiguity_remaining"])
+         confidence.append(score["confidence"])
+         ambiguity_counts.append(score["ambiguities_count"])
+         invention_rates.append(score["invention_rate"])
+         elapsed.append(row["elapsed_ms"])
+         if row["tokens_used"] is not None:
+             tokens.append(float(row["tokens_used"]))
+         answered_rates.append(score["answered_targets"] / score["pickup_targets_total"])
+     return {
+         "condition": condition,
+         "cases_total": len(rows),
+         "rows": rows,
+         "mean_pickup_score": _mean(pickup_scores),
+         "mean_ambiguity_remaining": _mean(ambiguity),
+         "mean_answered_target_rate": _mean(answered_rates),
+         "mean_confidence": _mean(confidence),
+         "mean_ambiguities_count": _mean(ambiguity_counts),
+         "mean_invention_rate": _mean(invention_rates),
+         "mean_elapsed_ms": _mean(elapsed),
+         "mean_tokens_used": _mean(tokens) if tokens else None,
+     }
+
+
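+ # Case-by-case comparison of two conditions on pickup_score: win/tie/loss
+ # counts and the mean score delta from the left condition's perspective.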
+ def _pairwise(cases: list[dict[str, Any]], left: str, right: str) -> dict[str, Any]:
+     wins = 0
+     ties = 0
+     losses = 0
+     deltas: list[float] = []
+     by_case: list[dict[str, Any]] = []
+     for case in cases:
+         left_score = case["conditions"][left]["score"]["pickup_score"]
+         right_score = case["conditions"][right]["score"]["pickup_score"]
+         delta = round(left_score - right_score, 3)
+         deltas.append(delta)
+         if delta > 0:
+             wins += 1
+             outcome = "win"
+         elif delta < 0:
+             losses += 1
+             outcome = "loss"
+         else:
+             ties += 1
+             outcome = "tie"
+         by_case.append(
+             {
+                 "id": case["id"],
+                 "domain": case["domain"],
+                 "artifact_class": case["artifact_class"],
+                 "left_score": left_score,
+                 "right_score": right_score,
+                 "delta": delta,
+                 "outcome": outcome,
+             }
+         )
+     return {
+         "left": left,
+         "right": right,
+         "wins": wins,
+         "ties": ties,
+         "losses": losses,
+         "mean_pickup_score_delta": _mean(deltas),
+         "by_case": by_case,
+     }
+
+
+ def _gather_metadata(model: str) -> dict[str, Any]:
+     package_version = _read_json(REPO_ROOT / "package.json")["version"]
+     commit = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(REPO_ROOT), capture_output=True, text=True, check=True).stdout.strip()
+     branch = subprocess.run(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=str(REPO_ROOT), capture_output=True, text=True, check=True).stdout.strip()
+     codex_version = subprocess.run(["codex", "--version"], cwd=str(REPO_ROOT), capture_output=True, text=True, check=True).stdout.strip()
+     return {
+         "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+         "repo_commit": commit,
+         "repo_branch": branch,
+         "package_version": package_version,
+         "python_version": sys.version.split()[0],
+         "codex_version": codex_version,
+         "platform": platform.platform(),
+         "model": model or "default",
+     }
+
+
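+ # Assembles the full report: evaluates every case under all three conditions,
+ # then gates the named claims on the aggregated metrics; main() turns the
+ # all-claims-pass summary into the process exit status.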
+ def build_report(*, model: str, case_ids: set[str] | None = None) -> dict[str, Any]:
+     cases = _load_cases()
+     if case_ids:
+         cases = [case for case in cases if case["id"] in case_ids]
+         if not cases:
+             raise RuntimeError("no comparison cases matched the requested ids")
+     evaluated = [_evaluate_case(case, model=model) for case in cases]
+     domains = sorted({case["domain"] for case in evaluated})
+     classes = sorted({case["artifact_class"] for case in evaluated})
+     conditions = {condition: _aggregate(evaluated, condition) for condition in CONDITIONS}
+     pairwise = {
+         "kernel_vs_generic_checklist": _pairwise(evaluated, "kernel", "generic_checklist"),
+         "kernel_vs_freeform": _pairwise(evaluated, "kernel", "freeform"),
+         "generic_checklist_vs_freeform": _pairwise(evaluated, "generic_checklist", "freeform"),
+     }
+     claims = [
+         {
+             "id": "matched_agent_pilot_corpus_exists",
+             "claim": "ORP ran a matched Codex pickup simulation corpus spanning the requested artifact classes and domains.",
+             "status": "pass" if evaluated else "fail",
+         },
+         {
+             "id": "kernel_outscores_generic_checklist_on_agent_pickup",
+             "claim": "On the matched Codex recoverability simulation, kernel artifacts preserve more explicit required-field recoverability than generic checklist artifacts.",
+             "status": "pass"
+             if conditions["kernel"]["mean_pickup_score"] > conditions["generic_checklist"]["mean_pickup_score"]
+             and pairwise["kernel_vs_generic_checklist"]["losses"] == 0
+             else "fail",
+         },
+         {
+             "id": "kernel_outscores_freeform_on_agent_pickup",
+             "claim": "On the matched Codex recoverability simulation, kernel artifacts preserve more explicit required-field recoverability than free-form artifacts.",
+             "status": "pass"
+             if conditions["kernel"]["mean_pickup_score"] > conditions["freeform"]["mean_pickup_score"]
+             and pairwise["kernel_vs_freeform"]["losses"] == 0
+             else "fail",
+         },
+         {
+             "id": "generic_checklist_improves_on_freeform_on_agent_pickup",
+             "claim": "On the matched Codex recoverability simulation, a generic checklist preserves more explicit required-field recoverability on average than free-form artifacts, but not uniformly case by case.",
+             "status": "pass"
+             if conditions["generic_checklist"]["mean_pickup_score"] > conditions["freeform"]["mean_pickup_score"]
+             else "fail",
+         },
+         {
+             "id": "kernel_preserves_full_pickup_targets_in_agent_simulation",
+             "claim": "On the matched Codex recoverability simulation, kernel artifacts keep all required fields explicitly recoverable.",
+             "status": "pass"
+             if conditions["kernel"]["mean_pickup_score"] == 1.0
+             and conditions["kernel"]["mean_answered_target_rate"] == 1.0
+             else "fail",
+         },
+         {
+             "id": "kernel_minimizes_invention_on_agent_pickup",
+             "claim": "On the matched Codex recoverability simulation, kernel artifacts minimize unsupported field invention relative to free-form and generic checklist artifacts.",
+             "status": "pass"
+             if conditions["kernel"]["mean_invention_rate"] <= conditions["generic_checklist"]["mean_invention_rate"]
+             and conditions["kernel"]["mean_invention_rate"] <= conditions["freeform"]["mean_invention_rate"]
+             else "fail",
+         },
+     ]
+     return {
+         "schema_version": "1.0.0",
+         "kind": "orp_reasoning_kernel_agent_pilot_report",
+         "metadata": _gather_metadata(model),
+         "corpus": {
+             "source": str(COMPARISON_CORPUS.relative_to(REPO_ROOT)),
+             "cases_total": len(evaluated),
+             "domains_total": len(domains),
+             "domains": domains,
+             "artifact_classes_total": len(classes),
+             "artifact_classes": classes,
+         },
+         "conditions": conditions,
+         "pairwise": pairwise,
+         "claims": claims,
+         "summary": {
+             "all_claims_pass": all(claim["status"] == "pass" for claim in claims),
+             "kernel_mean_pickup_score": conditions["kernel"]["mean_pickup_score"],
+             "generic_checklist_mean_pickup_score": conditions["generic_checklist"]["mean_pickup_score"],
+             "freeform_mean_pickup_score": conditions["freeform"]["mean_pickup_score"],
+             "kernel_mean_invention_rate": conditions["kernel"]["mean_invention_rate"],
+             "generic_checklist_mean_invention_rate": conditions["generic_checklist"]["mean_invention_rate"],
+             "freeform_mean_invention_rate": conditions["freeform"]["mean_invention_rate"],
+         },
+     }
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser(
+         description="Run a live Codex recoverability simulation across free-form, generic checklist, and kernel artifacts."
+     )
+     parser.add_argument("--out", default="", help="Optional JSON output path")
+     parser.add_argument("--model", default="", help="Optional Codex model override")
+     parser.add_argument(
+         "--case-id",
+         action="append",
+         default=[],
+         help="Optional case id to evaluate (repeatable). Default: all cases.",
+     )
+     args = parser.parse_args()
+
+     report = build_report(model=args.model, case_ids=set(args.case_id) or None)
+     payload = json.dumps(report, indent=2) + "\n"
+     if args.out:
+         out_path = Path(args.out)
+         if not out_path.is_absolute():
+             out_path = REPO_ROOT / out_path
+         out_path.parent.mkdir(parents=True, exist_ok=True)
+         out_path.write_text(payload, encoding="utf-8")
+     print(payload, end="")
+     return 0 if report["summary"]["all_claims_pass"] else 1
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
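
A minimal invocation sketch (the flags come from the argparse setup above; the model and case-id values are illustrative placeholders, and the commands assume a checkout of the package plus a logged-in codex CLI on PATH):

    python scripts/orp-kernel-agent-pilot.py --out docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json
    python scripts/orp-kernel-agent-pilot.py --model <codex-model> --case-id <case-id> --out /tmp/agent-pilot-smoke.json

Either way the report JSON is printed to stdout, and the exit code is 0 only when every claim in the report passes, so the same invocation can gate automation directly.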