open-research-protocol 0.4.7 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +15 -0
  2. package/cli/orp.py +1158 -43
  3. package/docs/AGENT_LOOP.md +3 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  5. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  6. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  7. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  8. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  9. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  10. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  11. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  12. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  13. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  14. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  15. package/docs/ORP_YOUTUBE_INSPECT.md +97 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  25. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  26. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  27. package/examples/README.md +2 -0
  28. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  29. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  30. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  31. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  32. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  33. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  34. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  35. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  36. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  37. package/llms.txt +3 -0
  38. package/package.json +4 -1
  39. package/scripts/orp-kernel-agent-pilot.py +673 -0
  40. package/scripts/orp-kernel-agent-replication.py +307 -0
  41. package/scripts/orp-kernel-benchmark.py +471 -2
  42. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  43. package/scripts/orp-kernel-ci-check.py +138 -0
  44. package/scripts/orp-kernel-comparison.py +592 -0
  45. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  46. package/scripts/orp-kernel-pickup.py +401 -0
  47. package/spec/v1/kernel-extension.schema.json +96 -0
  48. package/spec/v1/kernel-proposal.schema.json +115 -0
  49. package/spec/v1/kernel.schema.json +2 -1
  50. package/spec/v1/youtube-source.schema.json +151 -0
@@ -0,0 +1,384 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import importlib.util
6
+ import json
7
+ from pathlib import Path
8
+ import platform
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import time
13
+ from typing import Any
14
+
15
+
16
# Two path levels above this file: the directory that contains this script's
# own parent directory.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Location of the agent-pilot script that is loaded as a module at import time.
AGENT_PILOT = REPO_ROOT / "scripts" / "orp-kernel-agent-pilot.py"
# Artifact conditions evaluated for every case, in this order.
CONDITIONS = ["freeform", "generic_checklist", "kernel"]

# Handoff-critical fields per artifact class. A case's `artifact_class` selects
# the fields requested in the prompt and the fields counted when scoring.
HANDOFF_FIELDS: dict[str, list[str]] = {
    "task": ["object", "constraints", "success_criteria"],
    "decision": ["question", "chosen_path", "consequences"],
    "hypothesis": ["claim", "boundary", "test_path"],
    "experiment": ["objective", "method", "outputs"],
    "checkpoint": ["current_state", "risks", "next_handoff_target"],
    "policy": ["scope", "rule", "enforcement_surface"],
    "result": ["claim", "status", "next_follow_up"],
}
29
+
30
+
31
def _load_agent_pilot():
    """Import the agent-pilot script from its file path and return the module.

    The script is executed under the module name
    ``orp_kernel_agent_pilot_runtime``.

    Raises:
        RuntimeError: if no import spec, or no loader, can be created for
            ``AGENT_PILOT``.
    """
    module_spec = importlib.util.spec_from_file_location("orp_kernel_agent_pilot_runtime", AGENT_PILOT)
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise RuntimeError(f"failed to load agent pilot from {AGENT_PILOT}")
    pilot = importlib.util.module_from_spec(module_spec)
    loader.exec_module(pilot)
    return pilot
38
+
39
+
40
# Loaded once when this module is imported; the functions below call its
# private helpers (rendering, codex execution, stdout parsing, case loading).
AGENT_PILOT_MODULE = _load_agent_pilot()
41
+
42
+
43
+ def _read_json(path: Path) -> Any:
44
+ return json.loads(path.read_text(encoding="utf-8"))
45
+
46
+
47
+ def _response_schema() -> dict[str, Any]:
48
+ return {
49
+ "type": "object",
50
+ "additionalProperties": False,
51
+ "properties": {
52
+ "artifact_type_guess": {"type": "string"},
53
+ "recommended_next_action": {"type": "string"},
54
+ "carry_forward": {
55
+ "type": "array",
56
+ "items": {
57
+ "type": "object",
58
+ "additionalProperties": False,
59
+ "properties": {
60
+ "field": {"type": "string"},
61
+ "value": {"type": ["string", "null"]},
62
+ },
63
+ "required": ["field", "value"],
64
+ },
65
+ },
66
+ "explicitly_missing": {"type": "array", "items": {"type": "string"}},
67
+ "confidence": {"type": "number"},
68
+ },
69
+ "required": [
70
+ "artifact_type_guess",
71
+ "recommended_next_action",
72
+ "carry_forward",
73
+ "explicitly_missing",
74
+ "confidence",
75
+ ],
76
+ }
77
+
78
+
79
def _build_prompt(case: dict[str, Any], condition: str) -> str:
    """Compose the continuation prompt for one case under one condition.

    The prompt names the handoff-critical fields for the case's
    ``artifact_class`` and embeds the artifact text produced by the agent-pilot
    module's ``_render_artifact`` for the given condition.
    """
    handoff_fields = HANDOFF_FIELDS[case["artifact_class"]]
    artifact = AGENT_PILOT_MODULE._render_artifact(case, condition)
    target_list = ", ".join(handoff_fields)
    sections = (
        "You are simulating a fresh downstream Codex session that must continue the work safely.\n",
        "Using only the artifact below, propose the next action and identify the key fields that must be carried forward.\n",
        "Return JSON matching the provided schema.\n",
        f"In `carry_forward`, include one entry for each of these handoff-critical fields: {target_list}.\n",
        "Use a string only when the artifact makes the field explicit enough to carry forward safely. Otherwise use null.\n",
        "Do not invent missing structure. If information is missing, put it in `explicitly_missing` rather than fabricating it.\n\n",
        f"Artifact:\n{artifact}\n",
    )
    return "".join(sections)
92
+
93
+
94
def _run_codex_continuation(case: dict[str, Any], condition: str, *, model: str) -> dict[str, Any]:
    """Run one `codex exec` continuation for *case* under *condition*.

    The response schema is written into a temporary directory, and the command
    is told to write its output to ``out.json`` in that same directory. The
    prompt is handed to the agent-pilot helper ``_run_codex_exec`` as ``stdin``.

    Args:
        case: Case record; ``case["id"]`` is used in the failure message.
        condition: Artifact condition to render into the prompt.
        model: Optional model override; ``--model`` is only passed when truthy.

    Returns:
        A dict with the parsed response (``raw_response``), wall-clock
        ``elapsed_ms``, and the ``session_id`` / ``tokens_used`` values the
        agent-pilot helpers extract from the process stdout.

    Raises:
        RuntimeError: if the process exits with a non-zero return code.
    """
    prompt = _build_prompt(case, condition)
    with tempfile.TemporaryDirectory(prefix="orp-kernel-continuation.") as scratch:
        workdir = Path(scratch)
        schema_file = workdir / "schema.json"
        response_file = workdir / "out.json"
        schema_text = json.dumps(_response_schema(), indent=2) + "\n"
        schema_file.write_text(schema_text, encoding="utf-8")

        model_flags = ["--model", model] if model else []
        command = [
            "codex",
            "exec",
            "--ephemeral",
            "--skip-git-repo-check",
            "-C",
            str(workdir),
            "--output-schema",
            str(schema_file),
            "-o",
            str(response_file),
            *model_flags,
            "-",
        ]

        t0 = time.perf_counter()
        completed = AGENT_PILOT_MODULE._run_codex_exec(command, cwd=REPO_ROOT, stdin=prompt)
        elapsed_ms = round((time.perf_counter() - t0) * 1000.0, 3)
        if completed.returncode != 0:
            raise RuntimeError(
                f"codex exec failed for case={case['id']} condition={condition}\n"
                f"stdout:\n{completed.stdout}\n\nstderr:\n{completed.stderr}"
            )
        # The output file must be read before the temporary directory is removed.
        return {
            "raw_response": _read_json(response_file),
            "elapsed_ms": elapsed_ms,
            "session_id": AGENT_PILOT_MODULE._extract_session_id(completed.stdout),
            "tokens_used": AGENT_PILOT_MODULE._extract_tokens_used(completed.stdout),
        }
133
+
134
+
135
def _score_continuation(case: dict[str, Any], response: dict[str, Any]) -> dict[str, Any]:
    """Score one continuation response against the case's handoff fields.

    Args:
        case: Case record; ``case["artifact_class"]`` selects the target fields.
        response: Parsed model response. An optional ``"_condition"`` key
            (default ``"kernel"``) tells the agent-pilot helper which condition
            to compute the expected explicit fields for.

    Returns:
        A dict of metrics. ``carry_forward_score`` is the share of target
        fields given a non-blank string value. ``invention_rate`` is the share
        of those answered fields that are not in the expected explicit set.
        ``continuation_score`` is the mean of ``carry_forward_score``,
        ``1 - invention_rate`` and a 1/0 flag for a non-blank recommended next
        action. All three are rounded to 3 places.

    Raises:
        KeyError: if ``response`` has no ``"confidence"`` key or the case's
            artifact class is unknown.
    """
    targets = HANDOFF_FIELDS[case["artifact_class"]]
    expected_present = AGENT_PILOT_MODULE._expected_explicit_fields(case, response.get("_condition", "kernel"))

    # Collect field -> cleaned value, ignoring malformed entries. Blank or
    # non-string values count as "not carried forward" (None). A later entry
    # for the same field overrides an earlier one.
    carry_forward: dict[str, str | None] = {}
    entries = response.get("carry_forward", [])
    if isinstance(entries, list):
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            field = entry.get("field")
            if not isinstance(field, str):
                continue
            value = entry.get("value")
            cleaned = value.strip() if isinstance(value, str) else ""
            carry_forward[field] = cleaned or None

    answers: dict[str, str | None] = {field: carry_forward.get(field) for field in targets}
    answered = sum(1 for field in targets if carry_forward.get(field) is not None)
    invented_fields = [
        field
        for field, value in answers.items()
        if value is not None and field not in expected_present
    ]
    carry_forward_score = round(answered / len(targets), 3)
    invention_rate = round(len(invented_fields) / answered, 3) if answered else 0.0

    # Apply the same type tolerance used for `carry_forward` above: a null or
    # non-string next action is simply "absent" rather than an AttributeError.
    next_action = response.get("recommended_next_action")
    next_action_present = isinstance(next_action, str) and bool(next_action.strip())

    # Only a real list is counted; a null raised TypeError before, and a bare
    # string would have been counted per character.
    missing = response.get("explicitly_missing")
    explicitly_missing_count = len(missing) if isinstance(missing, list) else 0

    continuation_score = round(
        (carry_forward_score + (1.0 - invention_rate) + (1.0 if next_action_present else 0.0)) / 3.0,
        3,
    )
    return {
        "handoff_fields": targets,
        "expected_present_fields": sorted(field for field in targets if field in expected_present),
        "answers": answers,
        "answered_targets": answered,
        "handoff_fields_total": len(targets),
        "carry_forward_score": carry_forward_score,
        "invented_fields": invented_fields,
        "invented_fields_count": len(invented_fields),
        "invention_rate": invention_rate,
        "next_action_present": next_action_present,
        "continuation_score": continuation_score,
        "explicitly_missing_count": explicitly_missing_count,
        "confidence": response["confidence"],
    }
183
+
184
+
185
def _evaluate_case(case: dict[str, Any], *, model: str) -> dict[str, Any]:
    """Run and score *case* under every entry of ``CONDITIONS``.

    Returns the case's identifying keys (``id``, ``domain``,
    ``artifact_class``, ``prompt``) plus a ``conditions`` mapping from
    condition name to its response, score and run statistics.
    """
    per_condition: dict[str, Any] = {}
    for name in CONDITIONS:
        run = _run_codex_continuation(case, name, model=model)
        raw = run["raw_response"]
        # The condition name travels to the scorer through a private key.
        scored = _score_continuation(case, {**raw, "_condition": name})
        per_condition[name] = {
            "response": raw,
            "score": scored,
            "elapsed_ms": run["elapsed_ms"],
            "session_id": run["session_id"],
            "tokens_used": run["tokens_used"],
        }

    evaluated: dict[str, Any] = {key: case[key] for key in ("id", "domain", "artifact_class", "prompt")}
    evaluated["conditions"] = per_condition
    return evaluated
204
+
205
+
206
+ def _mean(values: list[float]) -> float:
207
+ return round(sum(values) / len(values), 3) if values else 0.0
208
+
209
+
210
def _aggregate(cases: list[dict[str, Any]], condition: str) -> dict[str, Any]:
    """Summarize every evaluated case for a single *condition*.

    Returns one row per case (identifiers, scores, answers, next action)
    together with the rounded means of continuation score, carry-forward
    score, invention rate, confidence and elapsed milliseconds.
    """
    selected = [(case, case["conditions"][condition]) for case in cases]

    rows = []
    for case, entry in selected:
        score = entry["score"]
        rows.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "continuation_score": score["continuation_score"],
                "carry_forward_score": score["carry_forward_score"],
                "invention_rate": score["invention_rate"],
                "next_action_present": score["next_action_present"],
                "answers": score["answers"],
                "explicitly_missing_count": score["explicitly_missing_count"],
                "recommended_next_action": entry["response"]["recommended_next_action"],
            }
        )

    def _column(metric: str) -> list[float]:
        # One value per case, taken from that case's score record.
        return [entry["score"][metric] for _, entry in selected]

    return {
        "condition": condition,
        "cases_total": len(rows),
        "rows": rows,
        "mean_continuation_score": _mean(_column("continuation_score")),
        "mean_carry_forward_score": _mean(_column("carry_forward_score")),
        "mean_invention_rate": _mean(_column("invention_rate")),
        "mean_confidence": _mean(_column("confidence")),
        "mean_elapsed_ms": _mean([entry["elapsed_ms"] for _, entry in selected]),
    }
249
+
250
+
251
def _pairwise(cases: list[dict[str, Any]], left: str, right: str) -> dict[str, Any]:
    """Compare two conditions case by case on continuation score.

    A case is a win when *left* scores higher than *right*, a loss when it
    scores lower, and a tie otherwise. Per-case deltas are rounded to 3 places
    before being classified and averaged.
    """

    def _score(case: dict[str, Any], condition: str) -> float:
        return case["conditions"][condition]["score"]["continuation_score"]

    deltas: list[float] = [round(_score(case, left) - _score(case, right), 3) for case in cases]
    wins = sum(1 for delta in deltas if delta > 0)
    losses = sum(1 for delta in deltas if delta < 0)
    return {
        "left": left,
        "right": right,
        "wins": wins,
        # Everything that is neither a win nor a loss is a tie.
        "ties": len(deltas) - wins - losses,
        "losses": losses,
        "mean_continuation_score_delta": _mean(deltas),
    }
275
+
276
+
277
def _gather_metadata(model: str) -> dict[str, Any]:
    """Collect provenance metadata for the report.

    Reads the package version from ``package.json`` under ``REPO_ROOT`` and
    asks ``git`` and ``codex`` for the commit, branch and tool version.

    ``build_report`` calls this only after every live evaluation has finished,
    so a failing probe must not discard the run. When a probe command is
    missing or exits non-zero (for example when running from an unpacked
    package that is not a git checkout), its value is reported as
    ``"unknown"`` instead of raising.

    Args:
        model: Model override used for the run; an empty value is reported as
            ``"default"``.

    Returns:
        A dict with timestamp, repo commit and branch, package version, Python
        and codex versions, platform string and model name.
    """

    def _probe(command: list[str]) -> str:
        # OSError covers a missing executable; CalledProcessError covers a
        # non-zero exit such as `git rev-parse` outside a repository.
        try:
            completed = subprocess.run(
                command,
                cwd=str(REPO_ROOT),
                capture_output=True,
                text=True,
                check=True,
            )
        except (OSError, subprocess.CalledProcessError):
            return "unknown"
        return completed.stdout.strip()

    package_version = json.loads((REPO_ROOT / "package.json").read_text(encoding="utf-8"))["version"]
    commit = _probe(["git", "rev-parse", "HEAD"])
    branch = _probe(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    codex_version = _probe(["codex", "--version"])
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": commit,
        "repo_branch": branch,
        "package_version": package_version,
        "python_version": sys.version.split()[0],
        "codex_version": codex_version,
        "platform": platform.platform(),
        "model": model or "default",
    }
292
+
293
+
294
def build_report(*, model: str, case_ids: set[str] | None = None) -> dict[str, Any]:
    """Evaluate the corpus and assemble the continuation pilot report.

    Args:
        model: Optional Codex model override forwarded to every run.
        case_ids: When given and non-empty, only cases whose ``id`` is in the
            set are evaluated.

    Returns:
        The report dict: metadata, corpus summary, per-condition aggregates,
        pairwise kernel comparisons, the three pass/fail claims and a summary.

    Raises:
        RuntimeError: if no cases remain to evaluate.
    """
    corpus = AGENT_PILOT_MODULE._load_cases()
    if case_ids:
        corpus = [case for case in corpus if case["id"] in case_ids]
    if not corpus:
        raise RuntimeError("no comparison cases matched the requested ids")

    evaluated = [_evaluate_case(case, model=model) for case in corpus]
    conditions = {condition: _aggregate(evaluated, condition) for condition in CONDITIONS}
    pairwise = {
        "kernel_vs_generic_checklist": _pairwise(evaluated, "kernel", "generic_checklist"),
        "kernel_vs_freeform": _pairwise(evaluated, "kernel", "freeform"),
    }

    kernel = conditions["kernel"]
    checklist = conditions["generic_checklist"]
    freeform = conditions["freeform"]

    def _status(passed: bool) -> str:
        return "pass" if passed else "fail"

    # Against the checklist the kernel may tie on mean score; against free-form
    # it must be strictly better. Neither comparison tolerates a per-case loss.
    beats_checklist = (
        kernel["mean_continuation_score"] >= checklist["mean_continuation_score"]
        and pairwise["kernel_vs_generic_checklist"]["losses"] == 0
        and kernel["mean_invention_rate"] <= checklist["mean_invention_rate"]
    )
    beats_freeform = (
        kernel["mean_continuation_score"] > freeform["mean_continuation_score"]
        and pairwise["kernel_vs_freeform"]["losses"] == 0
    )
    least_invention = (
        kernel["mean_invention_rate"] <= checklist["mean_invention_rate"]
        and kernel["mean_invention_rate"] <= freeform["mean_invention_rate"]
    )
    claims = [
        {
            "id": "kernel_outscores_generic_checklist_on_continuation",
            "claim": "On the matched live continuation simulation, kernel artifacts support a downstream continuation score that meets or exceeds generic checklist artifacts without a higher invention rate.",
            "status": _status(beats_checklist),
        },
        {
            "id": "kernel_outscores_freeform_on_continuation",
            "claim": "On the matched live continuation simulation, kernel artifacts support a stronger downstream continuation score than free-form artifacts.",
            "status": _status(beats_freeform),
        },
        {
            "id": "kernel_minimizes_continuation_invention",
            "claim": "On the matched live continuation simulation, kernel artifacts minimize unsupported carry-forward invention.",
            "status": _status(least_invention),
        },
    ]

    summary = {
        "all_claims_pass": all(claim["status"] == "pass" for claim in claims),
        "kernel_mean_continuation_score": kernel["mean_continuation_score"],
        "generic_checklist_mean_continuation_score": checklist["mean_continuation_score"],
        "freeform_mean_continuation_score": freeform["mean_continuation_score"],
        "kernel_mean_invention_rate": kernel["mean_invention_rate"],
        "generic_checklist_mean_invention_rate": checklist["mean_invention_rate"],
        "freeform_mean_invention_rate": freeform["mean_invention_rate"],
    }
    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_continuation_pilot_report",
        "metadata": _gather_metadata(model),
        "corpus": {
            "cases_total": len(evaluated),
            "domains": sorted({case["domain"] for case in evaluated}),
            "artifact_classes": sorted({case["artifact_class"] for case in evaluated}),
        },
        "conditions": conditions,
        "pairwise": pairwise,
        "claims": claims,
        "summary": summary,
    }
355
+
356
+
357
def main() -> int:
    """Parse CLI options, run the pilot, and emit the report as JSON.

    The report is always printed to stdout. With ``--out`` it is also written
    to that path, and a relative path is resolved against ``REPO_ROOT``.

    Returns:
        0 when every claim in the report passes, otherwise 1.
    """
    parser = argparse.ArgumentParser(
        description="Run a live Codex continuation simulation across free-form, generic checklist, and kernel artifacts."
    )
    parser.add_argument("--out", default="", help="Optional JSON output path")
    parser.add_argument("--model", default="", help="Optional Codex model override")
    parser.add_argument(
        "--case-id",
        action="append",
        default=[],
        help="Optional case id to evaluate (repeatable). Default: all cases.",
    )
    options = parser.parse_args()

    # An empty selection means "all cases", signalled to build_report as None.
    selected_ids = set(options.case_id) or None
    report = build_report(model=options.model, case_ids=selected_ids)
    rendered = json.dumps(report, indent=2) + "\n"

    if options.out:
        destination = Path(options.out)
        if not destination.is_absolute():
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")

    print(rendered, end="")
    if report["summary"]["all_claims_pass"]:
        return 0
    return 1
381
+
382
+
383
# Script entry point: exit with main()'s return value (0 when all claims pass,
# 1 otherwise).
if __name__ == "__main__":
    raise SystemExit(main())