open-research-protocol 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/cli/orp.py +668 -43
- package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
- package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
- package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
- package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
- package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
- package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
- package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
- package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
- package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
- package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
- package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
- package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
- package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
- package/examples/README.md +2 -0
- package/examples/kernel/comparison/comparison-corpus.json +337 -0
- package/examples/kernel/comparison/next-task-continuation.json +55 -0
- package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
- package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
- package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
- package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
- package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
- package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
- package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
- package/package.json +4 -1
- package/scripts/orp-kernel-agent-pilot.py +673 -0
- package/scripts/orp-kernel-agent-replication.py +307 -0
- package/scripts/orp-kernel-benchmark.py +471 -2
- package/scripts/orp-kernel-canonical-continuation.py +381 -0
- package/scripts/orp-kernel-ci-check.py +138 -0
- package/scripts/orp-kernel-comparison.py +592 -0
- package/scripts/orp-kernel-continuation-pilot.py +384 -0
- package/scripts/orp-kernel-pickup.py +401 -0
- package/spec/v1/kernel-extension.schema.json +96 -0
- package/spec/v1/kernel-proposal.schema.json +115 -0
- package/spec/v1/kernel.schema.json +2 -1
|
@@ -0,0 +1,673 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import platform
|
|
8
|
+
import re
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
import tempfile
|
|
12
|
+
import time
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Repository root, resolved relative to this script (scripts/ -> package root).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Matched-condition corpus driving the pilot.
COMPARISON_CORPUS = REPO_ROOT / "examples" / "kernel" / "comparison" / "comparison-corpus.json"
# Kernel schema whose per-class `required` lists define the pickup targets.
KERNEL_SCHEMA = REPO_ROOT / "spec" / "v1" / "kernel.schema.json"
# The three artifact presentation conditions compared by the pilot.
CONDITIONS = ["freeform", "generic_checklist", "kernel"]
# Output substrings that mark a `codex exec` failure as transient and retryable.
TRANSIENT_CODEX_FAILURE_SNIPPETS = [
    "We're currently experiencing high demand",
    "unexpected status 401 Unauthorized: Missing bearer or basic authentication in header",
    "failed to connect to websocket",
    "Warning: no last agent message; wrote empty content",
]
|
|
26
|
+
|
|
27
|
+
# Kernel field name -> label spellings accepted when parsing free-form
# `Label: value` markdown lines (labels are compared after _normalize_label,
# i.e. lowercased with collapsed whitespace). Note some aliases ("evidence",
# "next", "scope") intentionally map to more than one kernel field.
FREEFORM_LABEL_ALIASES: dict[str, set[str]] = {
    "artifact_type": {"artifact type", "type"},
    "object": {"object"},
    "goal": {"goal"},
    "boundary": {"boundary", "scope"},
    "constraints": {"constraints", "constraint"},
    "success_criteria": {"success criteria", "success", "done when"},
    "question": {"question"},
    "chosen_path": {"decision", "chosen path", "recommendation"},
    "rejected_alternatives": {"rejected alternatives", "alternatives"},
    "rationale": {"why", "rationale"},
    "consequences": {"consequences", "tradeoffs", "trade-offs"},
    "claim": {"claim"},
    "assumptions": {"assumptions"},
    "test_path": {"test", "test path"},
    "falsifiers": {"falsifiers", "would fail if"},
    "objective": {"objective"},
    "method": {"method"},
    "inputs": {"inputs"},
    "outputs": {"outputs", "evidence"},
    "evidence_expectations": {"evidence expectations", "evidence"},
    "interpretation_limits": {"limits", "interpretation limits"},
    "completed_unit": {"completed", "completed unit"},
    "current_state": {"current state"},
    "risks": {"risks", "risk"},
    "next_handoff_target": {"next", "next handoff target", "handoff"},
    "artifact_refs": {"artifact refs", "artifacts", "references"},
    "scope": {"scope"},
    "rule": {"rule"},
    "invariants": {"invariants"},
    "enforcement_surface": {"enforcement", "enforcement surface"},
    "evidence_paths": {"evidence", "evidence paths"},
    "status": {"status"},
    "next_follow_up": {"next follow up", "next follow-up", "next"},
}
|
|
62
|
+
|
|
63
|
+
# Per artifact_class: kernel required-field name -> the generic-checklist key
# that carries the equivalent information. Used to decide which required fields
# a generic checklist artifact makes explicit (see _expected_explicit_fields).
CHECKLIST_FIELD_MAP: dict[str, dict[str, str]] = {
    "task": {
        "object": "summary",
        "goal": "summary",
        "boundary": "scope",
        "constraints": "constraints",
        "success_criteria": "checks",
    },
    "decision": {
        "question": "summary",
        "chosen_path": "approach",
        "rejected_alternatives": "notes",
        "rationale": "notes",
        "consequences": "risks",
    },
    "hypothesis": {
        "claim": "summary",
        "boundary": "scope",
        "assumptions": "notes",
        "test_path": "checks",
        "falsifiers": "risks",
    },
    "experiment": {
        "objective": "summary",
        "method": "approach",
        "inputs": "scope",
        "outputs": "checks",
        "evidence_expectations": "evidence",
        "interpretation_limits": "risks",
    },
    "checkpoint": {
        "completed_unit": "summary",
        "current_state": "notes",
        "risks": "risks",
        "next_handoff_target": "handoff",
        "artifact_refs": "evidence",
    },
    "policy": {
        "scope": "scope",
        "rule": "summary",
        "rationale": "notes",
        "invariants": "constraints",
        "enforcement_surface": "checks",
    },
    "result": {
        "claim": "summary",
        "evidence_paths": "evidence",
        "status": "checks",
        "interpretation_limits": "risks",
        "next_follow_up": "handoff",
    },
}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _read_json(path: Path) -> Any:
|
|
118
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _load_cases() -> list[dict[str, Any]]:
    """Load the comparison corpus and return its case list.

    Raises:
        RuntimeError: if the corpus has no non-empty `cases` list.
    """
    corpus = _read_json(COMPARISON_CORPUS)
    cases = corpus.get("cases", [])
    if isinstance(cases, list) and cases:
        return cases
    raise RuntimeError(f"comparison corpus has no cases: {COMPARISON_CORPUS}")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _load_kernel_requirements() -> dict[str, list[str]]:
    """Extract per-artifact-class required field lists from the kernel schema.

    Walks the schema's `allOf` conditionals; each clause whose `if` pins
    `artifact_class` to a const contributes that class's `then.required`
    field names. Malformed clauses are skipped silently.
    """
    schema = _read_json(KERNEL_SCHEMA)
    requirements: dict[str, list[str]] = {}
    for clause in schema.get("allOf", []):
        if not isinstance(clause, dict):
            continue
        artifact_class = (
            clause.get("if", {}).get("properties", {}).get("artifact_class", {}).get("const")
        )
        required = clause.get("then", {}).get("required")
        if isinstance(artifact_class, str) and isinstance(required, list):
            requirements[artifact_class] = [
                str(name) for name in required if isinstance(name, str)
            ]
    return requirements
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Cached artifact_class -> required-field-names mapping, loaded once at import.
KERNEL_REQUIREMENTS = _load_kernel_requirements()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _render_artifact(case: dict[str, Any], condition: str) -> str:
|
|
151
|
+
if condition == "freeform":
|
|
152
|
+
return case["freeform_markdown"].strip()
|
|
153
|
+
if condition == "generic_checklist":
|
|
154
|
+
return json.dumps(case["generic_checklist"], indent=2)
|
|
155
|
+
if condition == "kernel":
|
|
156
|
+
return json.dumps(case["kernel_artifact"], indent=2)
|
|
157
|
+
raise RuntimeError(f"unsupported condition: {condition}")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _response_schema() -> dict[str, Any]:
|
|
161
|
+
return {
|
|
162
|
+
"type": "object",
|
|
163
|
+
"additionalProperties": False,
|
|
164
|
+
"properties": {
|
|
165
|
+
"artifact_type_guess": {"type": "string"},
|
|
166
|
+
"primary_objective_or_state": {"type": "string"},
|
|
167
|
+
"limits_or_risks": {"type": "array", "items": {"type": "string"}},
|
|
168
|
+
"next_action_or_handoff": {"type": "string"},
|
|
169
|
+
"confidence": {"type": "number"},
|
|
170
|
+
"ambiguities": {"type": "array", "items": {"type": "string"}},
|
|
171
|
+
"pickup_targets": {
|
|
172
|
+
"type": "array",
|
|
173
|
+
"items": {
|
|
174
|
+
"type": "object",
|
|
175
|
+
"additionalProperties": False,
|
|
176
|
+
"properties": {
|
|
177
|
+
"field": {"type": "string"},
|
|
178
|
+
"value": {"type": ["string", "null"]},
|
|
179
|
+
},
|
|
180
|
+
"required": ["field", "value"],
|
|
181
|
+
},
|
|
182
|
+
},
|
|
183
|
+
},
|
|
184
|
+
"required": [
|
|
185
|
+
"artifact_type_guess",
|
|
186
|
+
"primary_objective_or_state",
|
|
187
|
+
"limits_or_risks",
|
|
188
|
+
"next_action_or_handoff",
|
|
189
|
+
"confidence",
|
|
190
|
+
"ambiguities",
|
|
191
|
+
"pickup_targets",
|
|
192
|
+
],
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _build_prompt(case: dict[str, Any], condition: str) -> str:
    """Compose the pickup-simulation prompt for one case/condition pair.

    The prompt instructs a context-free session to recover the artifact
    class's required fields, using null whenever a field is not explicit.
    """
    required_fields = KERNEL_REQUIREMENTS[case["artifact_class"]]
    rendered = _render_artifact(case, condition)
    joined_targets = ", ".join(required_fields)
    parts = [
        "You are simulating a fresh downstream Codex session with no repo context.\n",
        "Using only the artifact below, recover the required artifact fields for handoff.\n",
        "Return JSON matching the provided schema.\n",
        f"In `pickup_targets`, include one entry for each of these required fields: {joined_targets}.\n",
        "Each entry must have `field` and `value` keys.\n",
        "For each required field, use a string only when the artifact makes that field explicit enough ",
        "to carry forward directly into a canonical artifact. If the artifact does not make it explicit, use null.\n",
        "A value counts as explicit only when the artifact states it directly as a dedicated field, statement, or close field-level synonym.\n",
        "If the value would require synthesis across multiple hints, extrapolation from likely intent, or filling in a structurally missing field, use null.\n",
        "Do not infer missing values from general world knowledge. Do not invent missing structure from likely intent.\n\n",
        f"Artifact:\n{rendered}\n",
    ]
    return "".join(parts)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _run_cmd(
|
|
216
|
+
args: list[str],
|
|
217
|
+
*,
|
|
218
|
+
cwd: Path,
|
|
219
|
+
stdin: str | None = None,
|
|
220
|
+
timeout_seconds: int | None = None,
|
|
221
|
+
) -> subprocess.CompletedProcess[str]:
|
|
222
|
+
return subprocess.run(
|
|
223
|
+
args,
|
|
224
|
+
cwd=str(cwd),
|
|
225
|
+
capture_output=True,
|
|
226
|
+
text=True,
|
|
227
|
+
input=stdin,
|
|
228
|
+
check=False,
|
|
229
|
+
timeout=timeout_seconds,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _is_transient_codex_failure(proc: subprocess.CompletedProcess[str]) -> bool:
    """Return True when the failed process output matches a known transient error."""
    haystack = "\n".join((proc.stdout, proc.stderr))
    for snippet in TRANSIENT_CODEX_FAILURE_SNIPPETS:
        if snippet in haystack:
            return True
    return False
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _run_codex_exec(
    args: list[str],
    *,
    cwd: Path,
    stdin: str,
    attempts: int = 6,
    timeout_seconds: int = 600,
) -> subprocess.CompletedProcess[str]:
    """Run `codex exec`, retrying transient failures and timeouts with backoff.

    Returns the first successful process, or the last failed one when the
    failure is non-transient or attempts are exhausted. Raises RuntimeError
    if every attempt times out.
    """
    last_proc: subprocess.CompletedProcess[str] | None = None
    for attempt in range(1, attempts + 1):
        try:
            proc = _run_cmd(args, cwd=cwd, stdin=stdin, timeout_seconds=timeout_seconds)
        except subprocess.TimeoutExpired as exc:
            # Final attempt: surface the timeout as a hard error.
            if attempt == attempts:
                raise RuntimeError(
                    f"codex exec timed out after {timeout_seconds}s on attempt {attempt}/{attempts}"
                ) from exc
            # Exponential backoff, capped at 30 seconds.
            time.sleep(float(min(30, 2 ** (attempt - 1))))
            continue
        if proc.returncode == 0:
            return proc
        last_proc = proc
        # Non-transient failures (and the last attempt) are returned as-is
        # so the caller can report stdout/stderr.
        if attempt == attempts or not _is_transient_codex_failure(proc):
            return proc
        time.sleep(float(min(30, 2 ** (attempt - 1))))
    # Defensive: only reachable if the loop body never ran or never returned.
    if last_proc is None:
        raise RuntimeError("codex exec produced no process result")
    return last_proc
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _extract_session_id(stdout: str) -> str:
|
|
269
|
+
match = re.search(r"session id:\s*([a-z0-9-]+)", stdout, re.IGNORECASE)
|
|
270
|
+
return match.group(1) if match else ""
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _extract_tokens_used(stdout: str) -> int | None:
|
|
274
|
+
match = re.search(r"tokens used\s*\n([0-9,]+)", stdout, re.IGNORECASE)
|
|
275
|
+
if not match:
|
|
276
|
+
return None
|
|
277
|
+
return int(match.group(1).replace(",", ""))
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _normalize_label(value: str) -> str:
|
|
281
|
+
return re.sub(r"\s+", " ", value.strip().lower())
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _extract_freeform_answers(body: str) -> dict[str, str]:
    """Scan markdown-ish text for `Label: value` lines and map them to kernel fields.

    Labels are normalized before matching against FREEFORM_LABEL_ALIASES; a
    label that aliases several kernel fields records its value for each.
    """
    label_line = re.compile(r"^[#>*\-\s]*([A-Za-z][A-Za-z \-_/]+):\s*(.+?)\s*$")
    answers: dict[str, str] = {}
    for raw_line in body.splitlines():
        if not raw_line.strip():
            continue
        found = label_line.match(raw_line)
        if found is None:
            continue
        label = _normalize_label(found.group(1))
        value = found.group(2).strip()
        if not value:
            continue
        for field, aliases in FREEFORM_LABEL_ALIASES.items():
            if label in aliases:
                answers[field] = value
    return answers
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _value_present(value: Any) -> bool:
|
|
304
|
+
if isinstance(value, str):
|
|
305
|
+
return bool(value.strip())
|
|
306
|
+
if isinstance(value, list):
|
|
307
|
+
if not value:
|
|
308
|
+
return False
|
|
309
|
+
return all(isinstance(item, str) and item.strip() for item in value)
|
|
310
|
+
return False
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _expected_explicit_fields(case: dict[str, Any], condition: str) -> set[str]:
    """Return the required fields the rendered artifact states explicitly.

    kernel: all required fields (carried by construction).
    generic_checklist: required fields whose mapped checklist key holds a value.
    freeform: required fields recoverable via labeled `Label: value` lines.
    """
    targets = set(KERNEL_REQUIREMENTS[case["artifact_class"]])
    if condition == "kernel":
        return set(targets)
    if condition == "generic_checklist":
        checklist = case["generic_checklist"]
        field_map = CHECKLIST_FIELD_MAP[case["artifact_class"]]
        explicit: set[str] = set()
        for field in targets:
            if _value_present(checklist.get(field_map.get(field, ""))):
                explicit.add(field)
        return explicit
    if condition == "freeform":
        parsed = _extract_freeform_answers(case["freeform_markdown"])
        return {field for field in targets if parsed.get(field)}
    raise RuntimeError(f"unsupported condition: {condition}")
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _run_codex_pickup(case: dict[str, Any], condition: str, *, model: str) -> dict[str, Any]:
    """Execute one live `codex exec` pickup simulation and collect its output.

    Runs codex in an ephemeral temp directory with the strict response
    schema, feeding the prompt on stdin. Returns the parsed JSON response
    plus timing/session/token telemetry. Raises RuntimeError on a nonzero
    codex exit (after _run_codex_exec's retry policy is exhausted).
    """
    prompt = _build_prompt(case, condition)
    with tempfile.TemporaryDirectory(prefix="orp-kernel-agent-pilot.") as td:
        root = Path(td)
        schema_path = root / "schema.json"
        out_path = root / "out.json"
        schema_path.write_text(json.dumps(_response_schema(), indent=2) + "\n", encoding="utf-8")

        # -C points codex at the empty temp dir so no repo context leaks in;
        # -o captures the structured response for parsing below.
        args = [
            "codex",
            "exec",
            "--ephemeral",
            "--skip-git-repo-check",
            "-C",
            str(root),
            "--output-schema",
            str(schema_path),
            "-o",
            str(out_path),
        ]
        if model:
            args.extend(["--model", model])
        args.append("-")  # read the prompt from stdin

        started = time.perf_counter()
        proc = _run_codex_exec(args, cwd=REPO_ROOT, stdin=prompt)
        elapsed_ms = round((time.perf_counter() - started) * 1000.0, 3)
        if proc.returncode != 0:
            raise RuntimeError(
                f"codex exec failed for case={case['id']} condition={condition}\n"
                f"stdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
            )
        payload = _read_json(out_path)
        return {
            "raw_response": payload,
            "elapsed_ms": elapsed_ms,
            "session_id": _extract_session_id(proc.stdout),
            "tokens_used": _extract_tokens_used(proc.stdout),
        }
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _score_pickup(case: dict[str, Any], response: dict[str, Any]) -> dict[str, Any]:
    """Score one model response against the case's required pickup fields.

    An answered field is a non-blank string value for a required field;
    an "invented" field is answered despite not being explicit in the
    artifact for this condition (taken from the injected `_condition` key,
    defaulting to "kernel").
    """
    targets = KERNEL_REQUIREMENTS[case["artifact_class"]]

    # Flatten [{field, value}, ...] into a field -> value lookup, ignoring
    # malformed entries.
    reported: dict[str, Any] = {}
    raw_entries = response.get("pickup_targets", [])
    if isinstance(raw_entries, list):
        for entry in raw_entries:
            if isinstance(entry, dict) and isinstance(entry.get("field"), str):
                reported[entry["field"]] = entry.get("value")

    expected_present = _expected_explicit_fields(case, response.get("_condition", "kernel"))

    answers: dict[str, str | None] = {}
    for field in targets:
        value = reported.get(field)
        if isinstance(value, str) and value.strip():
            answers[field] = value.strip()
        else:
            answers[field] = None

    answered = sum(1 for value in answers.values() if value is not None)
    invented_fields = [
        field
        for field, value in answers.items()
        if value is not None and field not in expected_present
    ]
    pickup_score = round(answered / len(targets), 3)
    invention_rate = round(len(invented_fields) / answered, 3) if answered else 0.0
    return {
        "pickup_targets": targets,
        "expected_present_fields": sorted(expected_present),
        "answers": answers,
        "answered_targets": answered,
        "pickup_targets_total": len(targets),
        "pickup_score": pickup_score,
        "missing_targets": [field for field, value in answers.items() if value is None],
        "invented_fields": invented_fields,
        "invented_fields_count": len(invented_fields),
        "invention_rate": invention_rate,
        "ambiguity_remaining": round(1.0 - pickup_score, 3),
        "confidence": response["confidence"],
        "ambiguities_count": len(response["ambiguities"]),
    }
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _evaluate_case(case: dict[str, Any], *, model: str) -> dict[str, Any]:
    """Run every presentation condition for one case and bundle the scored results."""
    condition_results: dict[str, Any] = {}
    for condition in CONDITIONS:
        outcome = _run_codex_pickup(case, condition, model=model)
        response = outcome["raw_response"]
        # Tag the response with its condition so scoring can compute the
        # matching expected-explicit-field baseline.
        scored = _score_pickup(case, {**response, "_condition": condition})
        condition_results[condition] = {
            "response": response,
            "score": scored,
            "elapsed_ms": outcome["elapsed_ms"],
            "session_id": outcome["session_id"],
            "tokens_used": outcome["tokens_used"],
        }
    return {
        "id": case["id"],
        "domain": case["domain"],
        "artifact_class": case["artifact_class"],
        "prompt": case["prompt"],
        "conditions": condition_results,
    }
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _mean(values: list[float]) -> float:
|
|
438
|
+
return round(sum(values) / len(values), 3) if values else 0.0
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def _aggregate(cases: list[dict[str, Any]], condition: str) -> dict[str, Any]:
    """Aggregate per-case scores for one condition into row data and means.

    Produces one flattened row per case plus condition-level means of every
    scored metric; token counts are averaged only over cases where codex
    reported them.
    """
    rows: list[dict[str, Any]] = []
    pickup_scores: list[float] = []
    ambiguity: list[float] = []
    confidence: list[float] = []
    ambiguity_counts: list[float] = []
    invention_rates: list[float] = []
    elapsed: list[float] = []
    tokens: list[float] = []
    answered_rates: list[float] = []
    for case in cases:
        row = case["conditions"][condition]
        score = row["score"]
        # One flat, report-friendly row per case.
        rows.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "pickup_score": score["pickup_score"],
                "ambiguity_remaining": score["ambiguity_remaining"],
                "answered_targets": score["answered_targets"],
                "pickup_targets_total": score["pickup_targets_total"],
                "expected_present_fields": score["expected_present_fields"],
                "answers": score["answers"],
                "invented_fields": score["invented_fields"],
                "invention_rate": score["invention_rate"],
                "artifact_type_guess": row["response"]["artifact_type_guess"],
                "confidence": score["confidence"],
                "ambiguities_count": score["ambiguities_count"],
                "elapsed_ms": row["elapsed_ms"],
                "tokens_used": row["tokens_used"],
                "session_id": row["session_id"],
            }
        )
        pickup_scores.append(score["pickup_score"])
        ambiguity.append(score["ambiguity_remaining"])
        confidence.append(score["confidence"])
        ambiguity_counts.append(score["ambiguities_count"])
        invention_rates.append(score["invention_rate"])
        elapsed.append(row["elapsed_ms"])
        # tokens_used may be None when codex stdout lacked the counter.
        if row["tokens_used"] is not None:
            tokens.append(float(row["tokens_used"]))
        answered_rates.append(score["answered_targets"] / score["pickup_targets_total"])
    return {
        "condition": condition,
        "cases_total": len(rows),
        "rows": rows,
        "mean_pickup_score": _mean(pickup_scores),
        "mean_ambiguity_remaining": _mean(ambiguity),
        "mean_answered_target_rate": _mean(answered_rates),
        "mean_confidence": _mean(confidence),
        "mean_ambiguities_count": _mean(ambiguity_counts),
        "mean_invention_rate": _mean(invention_rates),
        "mean_elapsed_ms": _mean(elapsed),
        "mean_tokens_used": _mean(tokens) if tokens else None,
    }
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def _pairwise(cases: list[dict[str, Any]], left: str, right: str) -> dict[str, Any]:
    """Compare pickup scores between two conditions, case by case.

    A positive delta (left minus right) counts as a win for *left*; zero is
    a tie, negative a loss. Returns the tallies, mean delta, and a per-case
    breakdown.
    """
    tallies = {"win": 0, "tie": 0, "loss": 0}
    deltas: list[float] = []
    by_case: list[dict[str, Any]] = []
    for case in cases:
        left_score = case["conditions"][left]["score"]["pickup_score"]
        right_score = case["conditions"][right]["score"]["pickup_score"]
        delta = round(left_score - right_score, 3)
        deltas.append(delta)
        if delta > 0:
            outcome = "win"
        elif delta < 0:
            outcome = "loss"
        else:
            outcome = "tie"
        tallies[outcome] += 1
        by_case.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "left_score": left_score,
                "right_score": right_score,
                "delta": delta,
                "outcome": outcome,
            }
        )
    return {
        "left": left,
        "right": right,
        "wins": tallies["win"],
        "ties": tallies["tie"],
        "losses": tallies["loss"],
        "mean_pickup_score_delta": _mean(deltas),
        "by_case": by_case,
    }
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _gather_metadata(model: str) -> dict[str, Any]:
    """Collect provenance metadata (repo state, runtime, codex, model) for the report."""

    def _capture(cmd: list[str]) -> str:
        # check=True: missing git/codex should abort the report loudly.
        proc = subprocess.run(cmd, cwd=str(REPO_ROOT), capture_output=True, text=True, check=True)
        return proc.stdout.strip()

    package_version = _read_json(REPO_ROOT / "package.json")["version"]
    commit = _capture(["git", "rev-parse", "HEAD"])
    branch = _capture(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    codex_version = _capture(["codex", "--version"])
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": commit,
        "repo_branch": branch,
        "package_version": package_version,
        "python_version": sys.version.split()[0],
        "codex_version": codex_version,
        "platform": platform.platform(),
        "model": model or "default",
    }
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def build_report(*, model: str, case_ids: set[str] | None = None) -> dict[str, Any]:
    """Run the full agent-pilot evaluation and assemble the JSON report.

    Evaluates every corpus case (optionally filtered to *case_ids*) under
    all three conditions, aggregates per-condition stats, computes pairwise
    comparisons, and grades the pilot's five published claims.

    Raises:
        RuntimeError: if the id filter matches no corpus cases.
    """
    cases = _load_cases()
    if case_ids:
        cases = [case for case in cases if case["id"] in case_ids]
    if not cases:
        raise RuntimeError("no comparison cases matched the requested ids")
    evaluated = [_evaluate_case(case, model=model) for case in cases]
    domains = sorted({case["domain"] for case in evaluated})
    classes = sorted({case["artifact_class"] for case in evaluated})
    conditions = {condition: _aggregate(evaluated, condition) for condition in CONDITIONS}
    pairwise = {
        "kernel_vs_generic_checklist": _pairwise(evaluated, "kernel", "generic_checklist"),
        "kernel_vs_freeform": _pairwise(evaluated, "kernel", "freeform"),
        "generic_checklist_vs_freeform": _pairwise(evaluated, "generic_checklist", "freeform"),
    }
    # Each claim is graded pass/fail from the aggregates; the kernel-vs-X
    # claims additionally require zero case-level losses.
    claims = [
        {
            "id": "matched_agent_pilot_corpus_exists",
            "claim": "ORP ran a matched Codex pickup simulation corpus spanning the requested artifact classes and domains.",
            "status": "pass" if evaluated else "fail",
        },
        {
            "id": "kernel_outscores_generic_checklist_on_agent_pickup",
            "claim": "On the matched Codex recoverability simulation, kernel artifacts preserve more explicit required-field recoverability than generic checklist artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_pickup_score"] > conditions["generic_checklist"]["mean_pickup_score"]
            and pairwise["kernel_vs_generic_checklist"]["losses"] == 0
            else "fail",
        },
        {
            "id": "kernel_outscores_freeform_on_agent_pickup",
            "claim": "On the matched Codex recoverability simulation, kernel artifacts preserve more explicit required-field recoverability than free-form artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_pickup_score"] > conditions["freeform"]["mean_pickup_score"]
            and pairwise["kernel_vs_freeform"]["losses"] == 0
            else "fail",
        },
        {
            "id": "generic_checklist_improves_on_freeform_on_agent_pickup",
            "claim": "On the matched Codex recoverability simulation, a generic checklist preserves more explicit required-field recoverability on average than free-form artifacts, but not uniformly case by case.",
            "status": "pass"
            if conditions["generic_checklist"]["mean_pickup_score"] > conditions["freeform"]["mean_pickup_score"]
            else "fail",
        },
        {
            "id": "kernel_preserves_full_pickup_targets_in_agent_simulation",
            "claim": "On the matched Codex recoverability simulation, kernel artifacts keep all required fields explicitly recoverable.",
            "status": "pass"
            if conditions["kernel"]["mean_pickup_score"] == 1.0
            and conditions["kernel"]["mean_answered_target_rate"] == 1.0
            else "fail",
        },
        {
            "id": "kernel_minimizes_invention_on_agent_pickup",
            "claim": "On the matched Codex recoverability simulation, kernel artifacts minimize unsupported field invention relative to free-form and generic checklist artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_invention_rate"] <= conditions["generic_checklist"]["mean_invention_rate"]
            and conditions["kernel"]["mean_invention_rate"] <= conditions["freeform"]["mean_invention_rate"]
            else "fail",
        },
    ]
    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_agent_pilot_report",
        "metadata": _gather_metadata(model),
        "corpus": {
            "source": str(COMPARISON_CORPUS.relative_to(REPO_ROOT)),
            "cases_total": len(evaluated),
            "domains_total": len(domains),
            "domains": domains,
            "artifact_classes_total": len(classes),
            "artifact_classes": classes,
        },
        "conditions": conditions,
        "pairwise": pairwise,
        "claims": claims,
        "summary": {
            "all_claims_pass": all(claim["status"] == "pass" for claim in claims),
            "kernel_mean_pickup_score": conditions["kernel"]["mean_pickup_score"],
            "generic_checklist_mean_pickup_score": conditions["generic_checklist"]["mean_pickup_score"],
            "freeform_mean_pickup_score": conditions["freeform"]["mean_pickup_score"],
            "kernel_mean_invention_rate": conditions["kernel"]["mean_invention_rate"],
            "generic_checklist_mean_invention_rate": conditions["generic_checklist"]["mean_invention_rate"],
            "freeform_mean_invention_rate": conditions["freeform"]["mean_invention_rate"],
        },
    }
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def main() -> int:
    """CLI entry point: run the pilot, print (and optionally save) the report.

    Returns 0 when every report claim passed, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Run a live Codex recoverability simulation across free-form, generic checklist, and kernel artifacts."
    )
    parser.add_argument("--out", default="", help="Optional JSON output path")
    parser.add_argument("--model", default="", help="Optional Codex model override")
    parser.add_argument(
        "--case-id",
        action="append",
        default=[],
        help="Optional case id to evaluate (repeatable). Default: all cases.",
    )
    options = parser.parse_args()

    selected_ids = set(options.case_id) or None
    report = build_report(model=options.model, case_ids=selected_ids)
    payload = json.dumps(report, indent=2) + "\n"
    if options.out:
        destination = Path(options.out)
        # Relative paths are anchored at the repo root, not the CWD.
        if not destination.is_absolute():
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(payload, encoding="utf-8")
    print(payload, end="")
    return 0 if report["summary"]["all_claims_pass"] else 1
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
# Script entry point: exit status mirrors main()'s claim-pass result.
if __name__ == "__main__":
    raise SystemExit(main())
|