open-research-protocol 0.4.7 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +15 -0
  2. package/cli/orp.py +1158 -43
  3. package/docs/AGENT_LOOP.md +3 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  5. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  6. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  7. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  8. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  9. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  10. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  11. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  12. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  13. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  14. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  15. package/docs/ORP_YOUTUBE_INSPECT.md +97 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  25. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  26. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  27. package/examples/README.md +2 -0
  28. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  29. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  30. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  31. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  32. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  33. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  34. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  35. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  36. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  37. package/llms.txt +3 -0
  38. package/package.json +4 -1
  39. package/scripts/orp-kernel-agent-pilot.py +673 -0
  40. package/scripts/orp-kernel-agent-replication.py +307 -0
  41. package/scripts/orp-kernel-benchmark.py +471 -2
  42. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  43. package/scripts/orp-kernel-ci-check.py +138 -0
  44. package/scripts/orp-kernel-comparison.py +592 -0
  45. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  46. package/scripts/orp-kernel-pickup.py +401 -0
  47. package/spec/v1/kernel-extension.schema.json +96 -0
  48. package/spec/v1/kernel-proposal.schema.json +115 -0
  49. package/spec/v1/kernel.schema.json +2 -1
  50. package/spec/v1/youtube-source.schema.json +151 -0
@@ -0,0 +1,307 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import importlib.util
6
+ import json
7
+ import math
8
+ from pathlib import Path
9
+ import platform
10
+ import statistics
11
+ import subprocess
12
+ import sys
13
+ import time
14
+ from typing import Any
15
+
16
+
17
# Repository root: two directory levels above this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Path of the sibling agent-pilot script that performs a single live run.
AGENT_PILOT = REPO_ROOT.joinpath("scripts", "orp-kernel-agent-pilot.py")
# Condition names aggregated by this script, in reporting order.
CONDITIONS = [
    "freeform",
    "generic_checklist",
    "kernel",
]
20
+
21
+
22
def _load_agent_pilot():
    """Import the agent-pilot script from its file path and return the module.

    The script's file name contains hyphens, so it cannot be pulled in with a
    plain ``import`` statement; it is executed from ``AGENT_PILOT`` instead.

    Raises:
        RuntimeError: If no import spec, or no loader for that spec, can be
            obtained for ``AGENT_PILOT``.
    """
    pilot_spec = importlib.util.spec_from_file_location(
        "orp_kernel_agent_pilot_runtime",
        AGENT_PILOT,
    )
    pilot_loader = None if pilot_spec is None else pilot_spec.loader
    if pilot_loader is None:
        raise RuntimeError(f"failed to load agent pilot from {AGENT_PILOT}")
    pilot_module = importlib.util.module_from_spec(pilot_spec)
    pilot_loader.exec_module(pilot_module)
    return pilot_module


# Loaded once at import time; build_report() calls into it for every repeat.
AGENT_PILOT_MODULE = _load_agent_pilot()
32
+
33
+
34
+ def _mean(values: list[float]) -> float:
35
+ return round(sum(values) / len(values), 3) if values else 0.0
36
+
37
+
38
+ def _pstdev(values: list[float]) -> float:
39
+ if len(values) < 2:
40
+ return 0.0
41
+ return round(statistics.pstdev(values), 3)
42
+
43
+
44
+ def _ci95_half_width(values: list[float]) -> float:
45
+ if len(values) < 2:
46
+ return 0.0
47
+ return round(1.96 * statistics.pstdev(values) / math.sqrt(len(values)), 3)
48
+
49
+
50
def _gather_metadata(model: str, repeats: int) -> dict[str, Any]:
    """Collect provenance metadata stored alongside a replication report.

    Args:
        model: Codex model override; an empty value is recorded as "default".
        repeats: Number of runs the report covers.

    Returns:
        A dict with the UTC generation timestamp, git commit and branch, the
        version from ``package.json``, the Python version, the output of
        ``codex --version``, the platform string, the model and the repeat
        count.

    The ``git`` and ``codex`` probes are best-effort: if the executable is
    missing or exits non-zero (for example when running from an installed
    package with no ``.git`` directory, or when merging reports on a machine
    without the Codex CLI) the value is recorded as "unknown". Metadata is
    gathered after the runs have completed, so failing here would discard
    results that were already computed.
    """

    def _probe(command: list[str]) -> str:
        # Run a short informational command from the repo root.
        try:
            completed = subprocess.run(
                command,
                cwd=str(REPO_ROOT),
                capture_output=True,
                text=True,
                check=True,
            )
        except (OSError, subprocess.CalledProcessError):
            return "unknown"
        return completed.stdout.strip()

    package_manifest = json.loads((REPO_ROOT / "package.json").read_text(encoding="utf-8"))
    package_version = package_manifest["version"]
    commit = _probe(["git", "rev-parse", "HEAD"])
    branch = _probe(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    codex_version = _probe(["codex", "--version"])
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": commit,
        "repo_branch": branch,
        "package_version": package_version,
        "python_version": sys.version.split()[0],
        "codex_version": codex_version,
        "platform": platform.platform(),
        "model": model or "default",
        "repeats": repeats,
    }
66
+
67
+
68
def _aggregate_runs(runs: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarize each condition's headline metrics across all runs.

    For every name in ``CONDITIONS`` the per-run values of
    ``mean_pickup_score``, ``mean_invention_rate``, ``mean_confidence`` and
    ``mean_elapsed_ms`` are collected from ``run["conditions"][condition]``
    and reduced to a mean and population standard deviation; pickup score and
    invention rate additionally get a 95% half-width.

    Returns:
        A dict keyed by condition name holding the aggregated statistics.
    """

    def _series(condition: str, metric: str) -> list[float]:
        # One value per run for the given condition/metric pair.
        return [run["conditions"][condition][metric] for run in runs]

    summary: dict[str, Any] = {}
    for condition in CONDITIONS:
        pickup = _series(condition, "mean_pickup_score")
        invention = _series(condition, "mean_invention_rate")
        confidence = _series(condition, "mean_confidence")
        elapsed = _series(condition, "mean_elapsed_ms")
        summary[condition] = {
            "mean_pickup_score": _mean(pickup),
            "pickup_score_stdev": _pstdev(pickup),
            "pickup_score_ci95_half_width": _ci95_half_width(pickup),
            "mean_invention_rate": _mean(invention),
            "invention_rate_stdev": _pstdev(invention),
            "invention_rate_ci95_half_width": _ci95_half_width(invention),
            "mean_confidence": _mean(confidence),
            "confidence_stdev": _pstdev(confidence),
            "mean_elapsed_ms": _mean(elapsed),
            "elapsed_ms_stdev": _pstdev(elapsed),
        }
    return summary
88
+
89
+
90
def _aggregate_per_field_stability(runs: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
    """Compute per-field answer statistics for each condition across all runs.

    Every row under ``run["conditions"][condition]["rows"]`` contributes one
    case for each key of its ``answers`` mapping. Cases are grouped by
    ``(artifact_class, field)`` and counted as:

    * expected -- the field is listed in the row's ``expected_present_fields``
    * answered -- the row's answer for the field is truthy
    * invented -- the field is listed in the row's ``invented_fields``

    Returns:
        A dict keyed by condition name. Each value is a list of rate rows,
        sorted by ``(artifact_class, field)``, with every rate rounded to 3
        decimals. ``stability_gap`` is ``(answered - expected) / cases``.
    """

    def _rate(count: int, total: int) -> float:
        # Guarded division so a zero total yields 0.0.
        return round(count / total, 3) if total else 0.0

    stability: dict[str, list[dict[str, Any]]] = {}
    for condition in CONDITIONS:
        tallies: dict[tuple[str, str], dict[str, int]] = {}
        for run in runs:
            for row in run["conditions"][condition].get("rows", []):
                artifact_class = row["artifact_class"]
                answers = row["answers"]
                invented = set(row.get("invented_fields", []))
                expected = set(row.get("expected_present_fields", []))
                for field in sorted(answers):
                    pair = (artifact_class, field)
                    if pair not in tallies:
                        tallies[pair] = {"cases": 0, "expected": 0, "answered": 0, "invented": 0}
                    tally = tallies[pair]
                    tally["cases"] += 1
                    tally["expected"] += int(field in expected)
                    tally["answered"] += int(bool(answers.get(field)))
                    tally["invented"] += int(field in invented)

        condition_rows: list[dict[str, Any]] = []
        for pair in sorted(tallies):
            artifact_class, field = pair
            tally = tallies[pair]
            cases = tally["cases"]
            condition_rows.append(
                {
                    "artifact_class": artifact_class,
                    "field": field,
                    "expected_present_rate": _rate(tally["expected"], cases),
                    "answered_rate": _rate(tally["answered"], cases),
                    "invented_rate": _rate(tally["invented"], cases),
                    "stability_gap": _rate(tally["answered"] - tally["expected"], cases),
                }
            )
        stability[condition] = condition_rows
    return stability
137
+
138
+
139
def _build_report_from_runs(*, runs: list[dict[str, Any]], model: str) -> dict[str, Any]:
    """Assemble the replication report for an already-collected list of runs.

    Args:
        runs: Run records, each carrying at least ``summary`` and
            ``conditions`` entries.
        model: Model name forwarded to the metadata section.

    Returns:
        The report dict: schema/kind markers, metadata, the runs themselves,
        aggregated condition statistics, per-field stability rows, three
        pass/fail claims, and a summary whose ``all_claims_pass`` is true only
        when every claim passed.
    """

    def _status(passed: bool) -> str:
        return "pass" if passed else "fail"

    condition_summary = _aggregate_runs(runs)
    per_field_stability = _aggregate_per_field_stability(runs)
    kernel_stats = condition_summary["kernel"]
    checklist_stats = condition_summary["generic_checklist"]
    freeform_stats = condition_summary["freeform"]

    # Claim 1: never below the checklist in any single run, and strictly
    # above it once all runs are averaged.
    beats_checklist = (
        all(
            run["summary"]["kernel_mean_pickup_score"]
            >= run["summary"]["generic_checklist_mean_pickup_score"]
            for run in runs
        )
        and kernel_stats["mean_pickup_score"] > checklist_stats["mean_pickup_score"]
    )
    # Claim 2: strictly above free-form in every single run.
    beats_freeform = all(
        run["summary"]["kernel_mean_pickup_score"] > run["summary"]["freeform_mean_pickup_score"]
        for run in runs
    )
    # Claim 3: invention rate no higher than either alternative in every run.
    lowest_invention = all(
        run["summary"]["kernel_mean_invention_rate"]
        <= run["summary"]["generic_checklist_mean_invention_rate"]
        and run["summary"]["kernel_mean_invention_rate"]
        <= run["summary"]["freeform_mean_invention_rate"]
        for run in runs
    )

    claims = [
        {
            "id": "kernel_stays_above_generic_checklist_across_replication",
            "claim": "Across repeated live Codex runs, kernel mean pickup stays at or above generic checklist mean pickup, and above it on the aggregated sample.",
            "status": _status(beats_checklist),
        },
        {
            "id": "kernel_stays_above_freeform_across_replication",
            "claim": "Across repeated live Codex runs, kernel mean pickup stays above free-form mean pickup.",
            "status": _status(beats_freeform),
        },
        {
            "id": "kernel_keeps_lowest_or_equal_invention_rate_across_replication",
            "claim": "Across repeated live Codex runs, kernel mean invention rate stays at or below the other conditions.",
            "status": _status(lowest_invention),
        },
    ]
    metadata = _gather_metadata(model, len(runs))
    all_claims_pass = all(claim["status"] == "pass" for claim in claims)
    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_agent_replication_report",
        "metadata": metadata,
        "runs": runs,
        "conditions": condition_summary,
        "per_field_stability": per_field_stability,
        "claims": claims,
        "summary": {
            "all_claims_pass": all_claims_pass,
            "kernel_mean_pickup_score": kernel_stats["mean_pickup_score"],
            "generic_checklist_mean_pickup_score": checklist_stats["mean_pickup_score"],
            "freeform_mean_pickup_score": freeform_stats["mean_pickup_score"],
            "kernel_mean_invention_rate": kernel_stats["mean_invention_rate"],
            "generic_checklist_mean_invention_rate": checklist_stats["mean_invention_rate"],
            "freeform_mean_invention_rate": freeform_stats["mean_invention_rate"],
        },
    }
199
+
200
+
201
def build_report(
    *,
    model: str,
    repeats: int,
    case_ids: set[str] | None = None,
    progress: bool = False,
) -> dict[str, Any]:
    """Run the agent pilot ``repeats`` times and build the replication report.

    Args:
        model: Model name passed through to every pilot run and the metadata.
        repeats: How many pilot runs to execute.
        case_ids: Optional case-id filter forwarded to the pilot.
        progress: When true, print one line to stderr after each repeat.

    Returns:
        The replication report built from the collected runs.
    """
    collected: list[dict[str, Any]] = []
    for run_number in range(1, repeats + 1):
        pilot_report = AGENT_PILOT_MODULE.build_report(model=model, case_ids=case_ids)
        # Keep only the sections the replication report aggregates over.
        record: dict[str, Any] = {"run_index": run_number}
        for section in ("summary", "conditions", "pairwise"):
            record[section] = pilot_report[section]
        collected.append(record)
        if not progress:
            continue
        print(
            f"[orp-kernel-agent-replication] completed repeat {run_number}/{repeats}",
            file=sys.stderr,
            flush=True,
        )
    return _build_report_from_runs(runs=collected, model=model)
226
+
227
+
228
def merge_reports(paths: list[Path], *, model: str) -> dict[str, Any]:
    """Merge the runs of existing replication reports into one fresh report.

    Runs are concatenated in the order the paths are given and renumbered
    from 1. All aggregates and claims are then recomputed over the combined
    runs.

    Args:
        paths: JSON replication reports to read.
        model: Model name recorded in the merged report's metadata.

    Returns:
        The merged report, with ``metadata["source_reports"]`` listing the
        input paths as strings.

    Raises:
        RuntimeError: If a file's top-level JSON value is not an object with a
            ``runs`` list, or if no runs are found across all inputs.
        KeyError: If a run object lacks ``summary``, ``conditions`` or
            ``pairwise``.
    """
    merged_runs: list[dict[str, Any]] = []
    for path in paths:
        payload = json.loads(path.read_text(encoding="utf-8"))
        # A non-object payload or a missing "runs" key is not a replication
        # report; fail with the offending path rather than crashing on .get()
        # or silently contributing nothing to the merge.
        payload_runs = payload.get("runs") if isinstance(payload, dict) else None
        if not isinstance(payload_runs, list):
            raise RuntimeError(f"replication report has no runs list: {path}")
        for run in payload_runs:
            if not isinstance(run, dict):
                # Stray non-object entries are skipped.
                continue
            merged_runs.append(
                {
                    "run_index": len(merged_runs) + 1,
                    "summary": run["summary"],
                    "conditions": run["conditions"],
                    "pairwise": run["pairwise"],
                }
            )
    if not merged_runs:
        raise RuntimeError("no runs found across merged replication reports")
    report = _build_report_from_runs(runs=merged_runs, model=model)
    report["metadata"]["source_reports"] = [str(path) for path in paths]
    return report
251
+
252
+
253
def main() -> int:
    """Command-line entry point: run live repeats or merge existing reports.

    Without ``--merge-report`` the agent pilot is executed ``--repeats`` times
    (default 3); with it, the named reports are merged instead. The resulting
    JSON is always printed to stdout and, when ``--out`` is given, also
    written to that path. Relative ``--out`` and ``--merge-report`` paths are
    resolved against the repository root.

    Returns:
        0 when every claim in the report passes, otherwise 1.

    Raises:
        SystemExit: If ``--repeats`` is combined with ``--merge-report`` or is
            below 1 for a live run.
    """
    parser = argparse.ArgumentParser(
        description="Run repeated live Codex kernel pickup simulations and summarize stability."
    )
    parser.add_argument("--out", default="", help="Optional JSON output path")
    parser.add_argument("--model", default="", help="Optional Codex model override")
    # The default is None rather than 3 so that an explicit "--repeats 3" can
    # still be told apart from "not given" when checking for the conflict
    # with --merge-report; the effective default of 3 is applied below.
    parser.add_argument("--repeats", type=int, default=None, help="Number of repeated live runs. Default: 3")
    parser.add_argument(
        "--merge-report",
        action="append",
        default=[],
        help="Merge existing replication JSON reports instead of running live repeats (repeatable).",
    )
    parser.add_argument(
        "--progress",
        action="store_true",
        help="Print per-repeat progress lines to stderr during live runs.",
    )
    parser.add_argument(
        "--case-id",
        action="append",
        default=[],
        help="Optional case id to evaluate (repeatable). Default: all cases.",
    )
    args = parser.parse_args()
    if args.merge_report and args.repeats is not None:
        raise SystemExit("--repeats cannot be combined with --merge-report")
    repeats = 3 if args.repeats is None else args.repeats
    if not args.merge_report and repeats < 1:
        raise SystemExit("--repeats must be at least 1")

    if args.merge_report:
        report = merge_reports(
            [Path(path) if Path(path).is_absolute() else REPO_ROOT / path for path in args.merge_report],
            model=args.model,
        )
    else:
        report = build_report(
            model=args.model,
            repeats=repeats,
            case_ids=set(args.case_id) or None,
            progress=args.progress,
        )
    payload = json.dumps(report, indent=2) + "\n"
    if args.out:
        out_path = Path(args.out)
        if not out_path.is_absolute():
            out_path = REPO_ROOT / out_path
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(payload, encoding="utf-8")
    print(payload, end="")
    return 0 if report["summary"]["all_claims_pass"] else 1


if __name__ == "__main__":
    raise SystemExit(main())