open-research-protocol 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +9 -0
  2. package/cli/orp.py +668 -43
  3. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  5. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  6. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  7. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  8. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  9. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  10. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  11. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  12. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  13. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  14. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  15. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  25. package/examples/README.md +2 -0
  26. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  27. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  28. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  29. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  30. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  31. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  32. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  33. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  34. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  35. package/package.json +4 -1
  36. package/scripts/orp-kernel-agent-pilot.py +673 -0
  37. package/scripts/orp-kernel-agent-replication.py +307 -0
  38. package/scripts/orp-kernel-benchmark.py +471 -2
  39. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  40. package/scripts/orp-kernel-ci-check.py +138 -0
  41. package/scripts/orp-kernel-comparison.py +592 -0
  42. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  43. package/scripts/orp-kernel-pickup.py +401 -0
  44. package/spec/v1/kernel-extension.schema.json +96 -0
  45. package/spec/v1/kernel-proposal.schema.json +115 -0
  46. package/spec/v1/kernel.schema.json +2 -1
@@ -0,0 +1,592 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+ import platform
8
+ import re
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import time
13
+ from typing import Any
14
+
15
+
16
# --- Paths and fixed configuration -----------------------------------------

# This script lives in scripts/, so the package root is one level up.
REPO_ROOT = Path(__file__).resolve().parents[1]
# The real ORP CLI entry point; invoked through node so that scoring
# exercises the shipped code path rather than a Python re-implementation.
CLI = ["node", "bin/orp.js"]
COMPARISON_CORPUS = REPO_ROOT / "examples" / "kernel" / "comparison" / "comparison-corpus.json"
KERNEL_SCHEMA = REPO_ROOT / "spec" / "v1" / "kernel.schema.json"
# The three authoring conditions compared by this benchmark.
CONDITIONS = ["freeform", "generic_checklist", "kernel"]

# Kernel field -> lowercase labels accepted on "Label: value" lines in
# free-form markdown. NOTE: alias sets overlap deliberately (e.g. "next",
# "scope", "evidence"), so one detected label may credit several fields.
FREEFORM_LABEL_ALIASES: dict[str, set[str]] = {
    "artifact_type": {"artifact type", "type"},
    "object": {"object"},
    "goal": {"goal"},
    "boundary": {"boundary", "scope"},
    "constraints": {"constraints", "constraint"},
    "success_criteria": {"success criteria", "success", "done when"},
    "question": {"question"},
    "chosen_path": {"decision", "chosen path", "recommendation"},
    "rejected_alternatives": {"rejected alternatives", "alternatives"},
    "rationale": {"why", "rationale"},
    "consequences": {"consequences", "tradeoffs", "trade-offs"},
    "claim": {"claim"},
    "assumptions": {"assumptions"},
    "test_path": {"test", "test path"},
    "falsifiers": {"falsifiers", "would fail if"},
    "objective": {"objective"},
    "method": {"method"},
    "inputs": {"inputs"},
    "outputs": {"outputs"},
    "evidence_expectations": {"evidence expectations"},
    "interpretation_limits": {"limits", "interpretation limits"},
    "completed_unit": {"completed", "completed unit"},
    "current_state": {"current state"},
    "risks": {"risks", "risk"},
    "next_handoff_target": {"next", "next handoff target", "handoff"},
    "artifact_refs": {"artifact refs", "artifacts", "references"},
    "scope": {"scope"},
    "rule": {"rule"},
    "invariants": {"invariants"},
    "enforcement_surface": {"enforcement", "enforcement surface"},
    "evidence_paths": {"evidence", "evidence paths"},
    "status": {"status"},
    "next_follow_up": {"next follow up", "next follow-up", "next"},
}

# Per artifact class: kernel field -> the generic-checklist slot that best
# approximates it. Fields absent here get no checklist credit at all.
CHECKLIST_FIELD_MAP: dict[str, dict[str, str]] = {
    "task": {
        "object": "summary",
        "goal": "summary",
        "boundary": "scope",
        "constraints": "constraints",
        "success_criteria": "checks",
    },
    "decision": {
        "question": "summary",
        "chosen_path": "approach",
        "rejected_alternatives": "notes",
        "rationale": "notes",
        "consequences": "risks",
    },
    "hypothesis": {
        "claim": "summary",
        "boundary": "scope",
        "assumptions": "notes",
        "test_path": "checks",
        "falsifiers": "risks",
    },
    "experiment": {
        "objective": "summary",
        "method": "approach",
        "inputs": "scope",
        "outputs": "checks",
        "evidence_expectations": "evidence",
        "interpretation_limits": "risks",
    },
    "checkpoint": {
        "completed_unit": "summary",
        "current_state": "notes",
        "risks": "risks",
        "next_handoff_target": "handoff",
        "artifact_refs": "evidence",
    },
    "policy": {
        "scope": "scope",
        "rule": "summary",
        "rationale": "notes",
        "invariants": "constraints",
        "enforcement_surface": "checks",
    },
    "result": {
        "claim": "summary",
        "evidence_paths": "evidence",
        "status": "checks",
        "interpretation_limits": "risks",
        "next_follow_up": "handoff",
    },
}

# The next four tables list which kernel fields feed each scoring dimension
# (see _score_dimensions), keyed by artifact class.
OBJECTIVE_FIELDS: dict[str, list[str]] = {
    "task": ["object", "goal"],
    "decision": ["question", "chosen_path"],
    "hypothesis": ["claim"],
    "experiment": ["objective", "method"],
    "checkpoint": ["completed_unit", "current_state"],
    "policy": ["rule", "scope"],
    "result": ["claim", "status"],
}

LIMIT_FIELDS: dict[str, list[str]] = {
    "task": ["boundary", "constraints"],
    "decision": ["rejected_alternatives", "consequences"],
    "hypothesis": ["boundary", "assumptions"],
    "experiment": ["inputs", "interpretation_limits"],
    "checkpoint": ["risks"],
    "policy": ["invariants"],
    "result": ["interpretation_limits"],
}

EVALUATION_FIELDS: dict[str, list[str]] = {
    "task": ["success_criteria"],
    "decision": ["rationale"],
    "hypothesis": ["test_path", "falsifiers"],
    "experiment": ["outputs", "evidence_expectations"],
    "checkpoint": ["artifact_refs"],
    "policy": ["enforcement_surface", "rationale"],
    "result": ["evidence_paths"],
}

HANDOFF_FIELDS: dict[str, list[str]] = {
    "task": ["object", "goal", "success_criteria"],
    "decision": ["question", "chosen_path", "consequences"],
    "hypothesis": ["claim", "boundary", "test_path"],
    "experiment": ["objective", "method", "outputs"],
    "checkpoint": ["current_state", "next_handoff_target"],
    "policy": ["rule", "scope", "enforcement_surface"],
    "result": ["claim", "status", "next_follow_up"],
}

# Partial credit when a checklist slot stands in for a kernel field; a
# kernel artifact scores 1.0 per present field, so these are all < 1.0.
CHECKLIST_SOURCE_WEIGHTS: dict[str, float] = {
    "summary": 0.55,
    "scope": 0.8,
    "constraints": 0.8,
    "approach": 0.7,
    "checks": 0.7,
    "risks": 0.65,
    "evidence": 0.75,
    "handoff": 0.8,
    "notes": 0.5,
}

# Flat per-field and type-detection credit for the two weaker conditions.
FREEFORM_FIELD_WEIGHT = 0.45
FREEFORM_TYPE_WEIGHT = 0.35
CHECKLIST_TYPE_WEIGHT = 0.85
166
+
167
+
168
def _run(args: list[str], *, cwd: Path = REPO_ROOT, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run *args* as a subprocess with captured text output.

    When *check* is true, a nonzero exit raises RuntimeError carrying the
    command line plus captured stdout/stderr for diagnosis.
    """
    completed = subprocess.run(args, cwd=str(cwd), capture_output=True, text=True)
    failed = completed.returncode != 0
    if check and failed:
        raise RuntimeError(
            f"command failed: {' '.join(args)}\nstdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
        )
    return completed
180
+
181
+
182
def _run_orp(repo_root: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Invoke the Node ORP CLI against *repo_root* with the given arguments."""
    command = [*CLI, "--repo-root", str(repo_root), *args]
    return _run(command, check=check)
184
+
185
+
186
def _timed_orp(repo_root: Path, *args: str, check: bool = True) -> tuple[float, subprocess.CompletedProcess[str]]:
    """Run the CLI and return ``(elapsed_milliseconds, completed_process)``."""
    t0 = time.perf_counter()
    completed = _run_orp(repo_root, *args, check=check)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0
    return elapsed_ms, completed
190
+
191
+
192
+ def _read_json(path: Path) -> Any:
193
+ return json.loads(path.read_text(encoding="utf-8"))
194
+
195
+
196
def _load_cases() -> list[dict[str, Any]]:
    """Load the comparison corpus and return its case list, which must be non-empty."""
    payload = _read_json(COMPARISON_CORPUS)
    cases = payload.get("cases", [])
    if isinstance(cases, list) and cases:
        return cases
    raise RuntimeError(f"comparison corpus has no cases: {COMPARISON_CORPUS}")
202
+
203
+
204
def _load_kernel_requirements() -> dict[str, list[str]]:
    """Extract required fields per artifact class from the kernel JSON schema.

    Walks the schema's ``allOf`` conditionals, pairing each
    ``if.properties.artifact_class.const`` with its ``then.required`` list.
    Malformed clauses are skipped rather than treated as errors.
    """
    schema = _read_json(KERNEL_SCHEMA)
    requirements: dict[str, list[str]] = {}
    for clause in schema.get("allOf", []):
        if not isinstance(clause, dict):
            continue
        condition = clause.get("if", {}).get("properties", {}).get("artifact_class", {})
        const = condition.get("const")
        required = clause.get("then", {}).get("required")
        if isinstance(const, str) and isinstance(required, list):
            requirements[const] = [str(item) for item in required if isinstance(item, str)]
    return requirements
220
+
221
+
222
+ def _normalize_label(value: str) -> str:
223
+ return re.sub(r"\s+", " ", value.strip().lower())
224
+
225
+
226
+ def _value_present(value: Any) -> bool:
227
+ if isinstance(value, str):
228
+ return bool(value.strip())
229
+ if isinstance(value, list):
230
+ if not value:
231
+ return False
232
+ return all(isinstance(item, str) and item.strip() for item in value)
233
+ return False
234
+
235
+
236
+ def _coverage(fields: list[str], present_map: dict[str, float]) -> float:
237
+ if not fields:
238
+ return 1.0
239
+ hits = sum(present_map.get(field, 0.0) for field in fields)
240
+ return hits / len(fields)
241
+
242
+
243
+ def _mean(values: list[float]) -> float:
244
+ return round(sum(values) / len(values), 3) if values else 0.0
245
+
246
+
247
def _score_dimensions(artifact_class: str, present_map: dict[str, float], *, type_clarity: float) -> dict[str, float]:
    """Turn per-field presence weights into the rubric's dimension scores.

    Returns the six component dimensions plus ``total_score`` (their
    unweighted mean) and ``ambiguity_remaining`` (the complement of
    class-specific completeness), all rounded to 3 places.
    """
    required = KERNEL_REQUIREMENTS[artifact_class]
    scored = {
        "artifact_type_clarity": round(type_clarity, 3),
        "objective_clarity": round(_coverage(OBJECTIVE_FIELDS[artifact_class], present_map), 3),
        "limits_clarity": round(_coverage(LIMIT_FIELDS[artifact_class], present_map), 3),
        "evaluation_clarity": round(_coverage(EVALUATION_FIELDS[artifact_class], present_map), 3),
        "handoff_readiness": round(_coverage(HANDOFF_FIELDS[artifact_class], present_map), 3),
        "class_specific_completeness": round(_coverage(required, present_map), 3),
    }
    component_keys = (
        "artifact_type_clarity",
        "objective_clarity",
        "limits_clarity",
        "evaluation_clarity",
        "handoff_readiness",
        "class_specific_completeness",
    )
    scored["total_score"] = round(sum(scored[key] for key in component_keys) / 6.0, 3)
    scored["ambiguity_remaining"] = round(1.0 - scored["class_specific_completeness"], 3)
    return scored
271
+
272
+
273
def _parse_freeform_fields(body: str) -> dict[str, bool]:
    """Detect kernel fields in free-form markdown via labelled ``Label: value`` lines.

    Lines may carry markdown decoration (#, >, *, -) before the label; a
    label only counts when a non-blank value follows the colon.
    """
    label_pattern = re.compile(r"^[#>*\-\s]*([A-Za-z][A-Za-z \-_/]+):\s*(.+?)\s*$")
    found: dict[str, bool] = {}
    for raw_line in body.splitlines():
        if not raw_line.strip():
            continue
        match = label_pattern.match(raw_line)
        if match is None:
            continue
        label = _normalize_label(match.group(1))
        if not match.group(2).strip():
            continue
        # One label may satisfy several fields: alias sets overlap (e.g. "next").
        for field, aliases in FREEFORM_LABEL_ALIASES.items():
            if label in aliases:
                found[field] = True
    return found
290
+
291
+
292
def _score_freeform(case: dict[str, Any]) -> dict[str, Any]:
    """Score the free-form markdown condition for one corpus case.

    Each required field detected via labelled lines earns the flat
    FREEFORM_FIELD_WEIGHT; type clarity earns FREEFORM_TYPE_WEIGHT only when
    an explicit artifact-type label was found.
    """
    artifact_class = case["artifact_class"]
    detected = _parse_freeform_fields(case["freeform_markdown"])
    required = KERNEL_REQUIREMENTS[artifact_class]
    present: dict[str, float] = {}
    for field in required:
        present[field] = FREEFORM_FIELD_WEIGHT if detected.get(field, False) else 0.0
    type_clarity = FREEFORM_TYPE_WEIGHT if detected.get("artifact_type", False) else 0.0
    return {
        "condition": "freeform",
        "artifact_class": artifact_class,
        "present_fields": sorted(name for name, weight in present.items() if weight > 0),
        "missing_fields": [name for name in required if present.get(name, 0.0) == 0.0],
        "field_scores": {name: round(weight, 3) for name, weight in present.items()},
        "dimensions": _score_dimensions(artifact_class, present, type_clarity=type_clarity),
    }
310
+
311
+
312
def _score_checklist(case: dict[str, Any]) -> dict[str, Any]:
    """Score the generic-checklist condition for one corpus case.

    A required field earns the weight of the checklist slot that stands in
    for it (CHECKLIST_SOURCE_WEIGHTS) when that slot holds a present value;
    type clarity requires the checklist's artifact_type to match exactly.
    """
    artifact_class = case["artifact_class"]
    checklist = case["generic_checklist"]
    required = KERNEL_REQUIREMENTS[artifact_class]
    mapping = CHECKLIST_FIELD_MAP[artifact_class]
    present: dict[str, float] = {}
    for field in required:
        source_field = mapping.get(field, "")
        weight = 0.0
        if source_field and _value_present(checklist.get(source_field)):
            weight = CHECKLIST_SOURCE_WEIGHTS.get(source_field, 0.5)
        present[field] = weight
    type_clarity = CHECKLIST_TYPE_WEIGHT if checklist.get("artifact_type") == artifact_class else 0.0
    return {
        "condition": "generic_checklist",
        "artifact_class": artifact_class,
        "present_fields": sorted(name for name, weight in present.items() if weight > 0),
        "missing_fields": [name for name in required if present.get(name, 0.0) == 0.0],
        "field_scores": {name: round(weight, 3) for name, weight in present.items()},
        "dimensions": _score_dimensions(artifact_class, present, type_clarity=type_clarity),
    }
331
+
332
+
333
def _score_kernel(case: dict[str, Any]) -> dict[str, Any]:
    """Score the kernel condition by validating the artifact through the real CLI.

    Writes the case's kernel artifact into a throwaway repo root, runs
    ``orp kernel validate --json`` against it (timed), and derives field
    presence directly from the artifact plus the CLI's reported result.

    Raises RuntimeError when the CLI produces no parseable JSON.
    """
    artifact_class = case["artifact_class"]
    kernel_artifact = case["kernel_artifact"]
    with tempfile.TemporaryDirectory(prefix="orp-kernel-comparison.") as td:
        root = Path(td)
        target = root / "analysis" / f"{case['id']}.kernel.json"
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(kernel_artifact, indent=2) + "\n", encoding="utf-8")
        validate_ms, proc = _timed_orp(
            root,
            "kernel",
            "validate",
            str(target.relative_to(root)),
            "--artifact-class",
            artifact_class,
            "--json",
            check=False,  # invalid artifacts exit nonzero but still emit JSON
        )
    # check=False means the CLI may have crashed without emitting JSON at all;
    # surface its stderr instead of an opaque JSONDecodeError.
    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as err:
        raise RuntimeError(
            f"kernel validate produced no JSON for case {case['id']!r} "
            f"(exit {proc.returncode}):\n{proc.stderr}"
        ) from err
    artifact_result = payload["artifact_result"]
    # Kernel fields score full credit (1.0) when present, unlike the other
    # conditions' partial weights.
    present = {
        field: (1.0 if _value_present(kernel_artifact.get(field)) else 0.0)
        for field in KERNEL_REQUIREMENTS[artifact_class]
    }
    dimensions = _score_dimensions(
        artifact_class,
        present,
        type_clarity=1.0 if artifact_result["artifact_class"] == artifact_class else 0.0,
    )
    return {
        "condition": "kernel",
        "artifact_class": artifact_class,
        "present_fields": sorted(field for field, score in present.items() if score > 0),
        "missing_fields": artifact_result.get("missing_fields", []),
        "field_scores": {field: round(score, 3) for field, score in present.items()},
        "dimensions": dimensions,
        "validate_ms": round(validate_ms, 3),
        "valid": bool(payload.get("ok")),
        "issues": artifact_result.get("issues", []),
        "exit_code": proc.returncode,
    }
374
+
375
+
376
def _score_case(case: dict[str, Any]) -> dict[str, Any]:
    """Evaluate one corpus case under all three authoring conditions."""
    scored_conditions = {
        "freeform": _score_freeform(case),
        "generic_checklist": _score_checklist(case),
        "kernel": _score_kernel(case),
    }
    return {
        "id": case["id"],
        "domain": case["domain"],
        "artifact_class": case["artifact_class"],
        "prompt": case["prompt"],
        "conditions": scored_conditions,
    }
391
+
392
+
393
def _aggregate_condition(cases: list[dict[str, Any]], condition: str) -> dict[str, Any]:
    """Summarize one condition across all scored cases.

    Produces per-case rows (id, scores, present/missing fields) plus mean
    total score, completeness, ambiguity, and per-dimension means.
    """
    dimension_keys = (
        "artifact_type_clarity",
        "objective_clarity",
        "limits_clarity",
        "evaluation_clarity",
        "handoff_readiness",
        "class_specific_completeness",
    )
    dims: dict[str, list[float]] = {key: [] for key in dimension_keys}
    rows: list[dict[str, Any]] = []
    totals: list[float] = []
    completeness: list[float] = []
    ambiguity: list[float] = []
    for case in cases:
        row = case["conditions"][condition]
        dimensions = row["dimensions"]
        rows.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "total_score": dimensions["total_score"],
                "class_specific_completeness": dimensions["class_specific_completeness"],
                "ambiguity_remaining": dimensions["ambiguity_remaining"],
                "present_fields": row["present_fields"],
                "missing_fields": row["missing_fields"],
            }
        )
        totals.append(dimensions["total_score"])
        completeness.append(dimensions["class_specific_completeness"])
        ambiguity.append(dimensions["ambiguity_remaining"])
        for key in dims:
            dims[key].append(dimensions[key])
    return {
        "condition": condition,
        "cases_total": len(rows),
        "rows": rows,
        "mean_total_score": _mean(totals),
        "mean_class_specific_completeness": _mean(completeness),
        "mean_ambiguity_remaining": _mean(ambiguity),
        "mean_dimension_scores": {key: _mean(values) for key, values in dims.items()},
    }
434
+
435
+
436
def _pairwise(cases: list[dict[str, Any]], left: str, right: str) -> dict[str, Any]:
    """Compare two conditions case-by-case on total_score.

    A positive delta (left minus right, rounded to 3 places) is a win for
    *left*, negative a loss, zero a tie.
    """
    tallies = {"win": 0, "tie": 0, "loss": 0}
    deltas: list[float] = []
    by_case: list[dict[str, Any]] = []
    for case in cases:
        left_score = case["conditions"][left]["dimensions"]["total_score"]
        right_score = case["conditions"][right]["dimensions"]["total_score"]
        delta = round(left_score - right_score, 3)
        deltas.append(delta)
        if delta > 0:
            outcome = "win"
        elif delta < 0:
            outcome = "loss"
        else:
            outcome = "tie"
        tallies[outcome] += 1
        by_case.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "left_score": left_score,
                "right_score": right_score,
                "delta": delta,
                "outcome": outcome,
            }
        )
    return {
        "left": left,
        "right": right,
        "wins": tallies["win"],
        "ties": tallies["tie"],
        "losses": tallies["loss"],
        "mean_total_score_delta": _mean(deltas),
        "by_case": by_case,
    }
476
+
477
+
478
def _gather_metadata() -> dict[str, Any]:
    """Capture report provenance: package version, git state, and runtime versions."""
    version = _read_json(REPO_ROOT / "package.json")["version"]
    head_commit = _run(["git", "rev-parse", "HEAD"]).stdout.strip()
    head_branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]).stdout.strip()
    node = _run(["node", "--version"]).stdout.strip()
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": head_commit,
        "repo_branch": head_branch,
        "package_version": version,
        "python_version": sys.version.split()[0],
        "node_version": node,
        "platform": platform.platform(),
    }
492
+
493
+
494
def build_report() -> dict[str, Any]:
    """Score every corpus case under all three conditions and assemble the
    full comparison report: provenance metadata, corpus summary, per-condition
    aggregates, pairwise win/loss tables, and machine-checked claims."""
    cases = [_score_case(case) for case in _load_cases()]
    domains = sorted({case["domain"] for case in cases})
    classes = sorted({case["artifact_class"] for case in cases})

    conditions = {condition: _aggregate_condition(cases, condition) for condition in CONDITIONS}
    pairwise = {
        "kernel_vs_generic_checklist": _pairwise(cases, "kernel", "generic_checklist"),
        "kernel_vs_freeform": _pairwise(cases, "kernel", "freeform"),
        "generic_checklist_vs_freeform": _pairwise(cases, "generic_checklist", "freeform"),
    }

    # Each claim's status is computed from the aggregates above; the
    # "losses == 0" clauses require the favored condition to win or tie
    # on every individual case, not just on the mean.
    claims = [
        {
            "id": "matched_internal_corpus_exists",
            "claim": "ORP has a matched internal comparison corpus spanning multiple domains and all seven kernel artifact classes.",
            "status": "pass" if len(cases) >= 7 and len(domains) >= 5 and len(classes) >= 7 else "fail",
        },
        {
            "id": "kernel_outscores_generic_checklist_on_matched_corpus",
            "claim": "On the matched internal comparison corpus, kernel artifacts achieve higher mean structural scores than generic checklist artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_total_score"] > conditions["generic_checklist"]["mean_total_score"]
            and pairwise["kernel_vs_generic_checklist"]["losses"] == 0
            else "fail",
        },
        {
            "id": "kernel_outscores_freeform_on_matched_corpus",
            "claim": "On the matched internal comparison corpus, kernel artifacts achieve higher mean structural scores than free-form artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_total_score"] > conditions["freeform"]["mean_total_score"]
            and pairwise["kernel_vs_freeform"]["losses"] == 0
            else "fail",
        },
        {
            "id": "generic_checklist_improves_on_freeform_for_structure",
            "claim": "On the matched internal comparison corpus, a generic checklist condition improves structural scores over free-form artifacts.",
            "status": "pass"
            if conditions["generic_checklist"]["mean_total_score"] > conditions["freeform"]["mean_total_score"]
            and pairwise["generic_checklist_vs_freeform"]["losses"] == 0
            else "fail",
        },
        {
            "id": "kernel_preserves_full_required_coverage",
            "claim": "On the matched internal comparison corpus, kernel artifacts preserve full class-specific required-field coverage.",
            "status": "pass"
            if conditions["kernel"]["mean_class_specific_completeness"] == 1.0
            else "fail",
        },
    ]

    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_comparison_report",
        "metadata": _gather_metadata(),
        "corpus": {
            "source": str(COMPARISON_CORPUS.relative_to(REPO_ROOT)),
            "cases_total": len(cases),
            "domains_total": len(domains),
            "domains": domains,
            "artifact_classes_total": len(classes),
            "artifact_classes": classes,
        },
        "conditions": conditions,
        "pairwise": pairwise,
        "claims": claims,
        "summary": {
            "all_claims_pass": all(claim["status"] == "pass" for claim in claims),
            "kernel_mean_total_score": conditions["kernel"]["mean_total_score"],
            "generic_checklist_mean_total_score": conditions["generic_checklist"]["mean_total_score"],
            "freeform_mean_total_score": conditions["freeform"]["mean_total_score"],
        },
    }
567
+
568
+
569
def main() -> int:
    """CLI entry point.

    Builds the comparison report, optionally writes it to ``--out`` (resolved
    relative to the repo root when not absolute), prints it to stdout, and
    returns 0 only when every claim passes.
    """
    parser = argparse.ArgumentParser(
        description="Run a matched internal comparison between free-form, generic checklist, and ORP kernel artifacts."
    )
    parser.add_argument("--out", default="", help="Optional JSON output path")
    options = parser.parse_args()

    report = build_report()
    serialized = json.dumps(report, indent=2) + "\n"
    if options.out:
        destination = Path(options.out)
        if not destination.is_absolute():
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(serialized, encoding="utf-8")
    print(serialized, end="")
    return 0 if report["summary"]["all_claims_pass"] else 1
586
+
587
+
588
# Loaded once at import time, after the helpers above are defined: maps each
# artifact_class to the field names the kernel schema marks as required.
KERNEL_REQUIREMENTS = _load_kernel_requirements()
589
+
590
+
591
# SystemExit carries main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())