open-research-protocol 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/cli/orp.py +668 -43
- package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
- package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
- package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
- package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
- package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
- package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
- package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
- package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
- package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
- package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
- package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
- package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
- package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
- package/examples/README.md +2 -0
- package/examples/kernel/comparison/comparison-corpus.json +337 -0
- package/examples/kernel/comparison/next-task-continuation.json +55 -0
- package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
- package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
- package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
- package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
- package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
- package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
- package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
- package/package.json +4 -1
- package/scripts/orp-kernel-agent-pilot.py +673 -0
- package/scripts/orp-kernel-agent-replication.py +307 -0
- package/scripts/orp-kernel-benchmark.py +471 -2
- package/scripts/orp-kernel-canonical-continuation.py +381 -0
- package/scripts/orp-kernel-ci-check.py +138 -0
- package/scripts/orp-kernel-comparison.py +592 -0
- package/scripts/orp-kernel-continuation-pilot.py +384 -0
- package/scripts/orp-kernel-pickup.py +401 -0
- package/spec/v1/kernel-extension.schema.json +96 -0
- package/spec/v1/kernel-proposal.schema.json +115 -0
- package/spec/v1/kernel.schema.json +2 -1
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""Matched-corpus comparison harness.

Scores the same underlying content rendered three ways (free-form markdown,
generic checklist, ORP kernel artifact) against the kernel schema's
per-class required fields, and emits a JSON comparison report.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
import platform
import re
import subprocess
import sys
import tempfile
import time
from typing import Any


# Package root (this script lives one directory below it, in scripts/).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Command prefix for the Node-based ORP CLI.
CLI = ["node", "bin/orp.js"]
# Corpus of matched comparison cases, one artifact per case.
COMPARISON_CORPUS = REPO_ROOT / "examples" / "kernel" / "comparison" / "comparison-corpus.json"
# JSON Schema whose allOf clauses define per-class required fields.
KERNEL_SCHEMA = REPO_ROOT / "spec" / "v1" / "kernel.schema.json"
# The three conditions compared by this harness.
CONDITIONS = ["freeform", "generic_checklist", "kernel"]

# Normalized "Label:" strings accepted as evidence that a free-form markdown
# body expresses a given kernel field.  Keys are kernel field names; values
# are the accepted label spellings (lowercased, whitespace-collapsed).
# NOTE: some labels alias more than one field (e.g. "next", "scope").
FREEFORM_LABEL_ALIASES: dict[str, set[str]] = {
    "artifact_type": {"artifact type", "type"},
    "object": {"object"},
    "goal": {"goal"},
    "boundary": {"boundary", "scope"},
    "constraints": {"constraints", "constraint"},
    "success_criteria": {"success criteria", "success", "done when"},
    "question": {"question"},
    "chosen_path": {"decision", "chosen path", "recommendation"},
    "rejected_alternatives": {"rejected alternatives", "alternatives"},
    "rationale": {"why", "rationale"},
    "consequences": {"consequences", "tradeoffs", "trade-offs"},
    "claim": {"claim"},
    "assumptions": {"assumptions"},
    "test_path": {"test", "test path"},
    "falsifiers": {"falsifiers", "would fail if"},
    "objective": {"objective"},
    "method": {"method"},
    "inputs": {"inputs"},
    "outputs": {"outputs"},
    "evidence_expectations": {"evidence expectations"},
    "interpretation_limits": {"limits", "interpretation limits"},
    "completed_unit": {"completed", "completed unit"},
    "current_state": {"current state"},
    "risks": {"risks", "risk"},
    "next_handoff_target": {"next", "next handoff target", "handoff"},
    "artifact_refs": {"artifact refs", "artifacts", "references"},
    "scope": {"scope"},
    "rule": {"rule"},
    "invariants": {"invariants"},
    "enforcement_surface": {"enforcement", "enforcement surface"},
    "evidence_paths": {"evidence", "evidence paths"},
    "status": {"status"},
    "next_follow_up": {"next follow up", "next follow-up", "next"},
}

# For each artifact class, maps a kernel field name to the generic-checklist
# slot that (partially) carries the same information.
CHECKLIST_FIELD_MAP: dict[str, dict[str, str]] = {
    "task": {
        "object": "summary",
        "goal": "summary",
        "boundary": "scope",
        "constraints": "constraints",
        "success_criteria": "checks",
    },
    "decision": {
        "question": "summary",
        "chosen_path": "approach",
        "rejected_alternatives": "notes",
        "rationale": "notes",
        "consequences": "risks",
    },
    "hypothesis": {
        "claim": "summary",
        "boundary": "scope",
        "assumptions": "notes",
        "test_path": "checks",
        "falsifiers": "risks",
    },
    "experiment": {
        "objective": "summary",
        "method": "approach",
        "inputs": "scope",
        "outputs": "checks",
        "evidence_expectations": "evidence",
        "interpretation_limits": "risks",
    },
    "checkpoint": {
        "completed_unit": "summary",
        "current_state": "notes",
        "risks": "risks",
        "next_handoff_target": "handoff",
        "artifact_refs": "evidence",
    },
    "policy": {
        "scope": "scope",
        "rule": "summary",
        "rationale": "notes",
        "invariants": "constraints",
        "enforcement_surface": "checks",
    },
    "result": {
        "claim": "summary",
        "evidence_paths": "evidence",
        "status": "checks",
        "interpretation_limits": "risks",
        "next_follow_up": "handoff",
    },
}

# Field groups feeding each scoring dimension (see _score_dimensions).
OBJECTIVE_FIELDS: dict[str, list[str]] = {
    "task": ["object", "goal"],
    "decision": ["question", "chosen_path"],
    "hypothesis": ["claim"],
    "experiment": ["objective", "method"],
    "checkpoint": ["completed_unit", "current_state"],
    "policy": ["rule", "scope"],
    "result": ["claim", "status"],
}

LIMIT_FIELDS: dict[str, list[str]] = {
    "task": ["boundary", "constraints"],
    "decision": ["rejected_alternatives", "consequences"],
    "hypothesis": ["boundary", "assumptions"],
    "experiment": ["inputs", "interpretation_limits"],
    "checkpoint": ["risks"],
    "policy": ["invariants"],
    "result": ["interpretation_limits"],
}

EVALUATION_FIELDS: dict[str, list[str]] = {
    "task": ["success_criteria"],
    "decision": ["rationale"],
    "hypothesis": ["test_path", "falsifiers"],
    "experiment": ["outputs", "evidence_expectations"],
    "checkpoint": ["artifact_refs"],
    "policy": ["enforcement_surface", "rationale"],
    "result": ["evidence_paths"],
}

HANDOFF_FIELDS: dict[str, list[str]] = {
    "task": ["object", "goal", "success_criteria"],
    "decision": ["question", "chosen_path", "consequences"],
    "hypothesis": ["claim", "boundary", "test_path"],
    "experiment": ["objective", "method", "outputs"],
    "checkpoint": ["current_state", "next_handoff_target"],
    "policy": ["rule", "scope", "enforcement_surface"],
    "result": ["claim", "status", "next_follow_up"],
}

# Presence weight credited when a field is carried by a given checklist slot
# (a checklist slot conveys the field less precisely than a kernel field).
CHECKLIST_SOURCE_WEIGHTS: dict[str, float] = {
    "summary": 0.55,
    "scope": 0.8,
    "constraints": 0.8,
    "approach": 0.7,
    "checks": 0.7,
    "risks": 0.65,
    "evidence": 0.75,
    "handoff": 0.8,
    "notes": 0.5,
}

# Flat presence/type-clarity weights for the two non-kernel conditions
# (kernel fields score 1.0 — see _score_kernel).
FREEFORM_FIELD_WEIGHT = 0.45
FREEFORM_TYPE_WEIGHT = 0.35
CHECKLIST_TYPE_WEIGHT = 0.85
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _run(args: list[str], *, cwd: Path = REPO_ROOT, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run *args* in *cwd*, capturing text output.

    Raises RuntimeError (with stdout/stderr attached) on a non-zero exit
    when *check* is true; otherwise the completed process is returned as-is.
    """
    result = subprocess.run(
        args,
        cwd=str(cwd),
        capture_output=True,
        text=True,
    )
    if check and result.returncode != 0:
        command = " ".join(args)
        raise RuntimeError(
            f"command failed: {command}\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
        )
    return result
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _run_orp(repo_root: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Invoke the ORP CLI against *repo_root* with the given subcommand args."""
    command = [*CLI, "--repo-root", str(repo_root), *args]
    return _run(command, check=check)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _timed_orp(repo_root: Path, *args: str, check: bool = True) -> tuple[float, subprocess.CompletedProcess[str]]:
    """Run the ORP CLI and return ``(elapsed_milliseconds, completed_process)``."""
    t0 = time.perf_counter()
    result = _run_orp(repo_root, *args, check=check)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0
    return elapsed_ms, result
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _read_json(path: Path) -> Any:
|
|
193
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _load_cases() -> list[dict[str, Any]]:
    """Load the comparison corpus; fail loudly when it has no cases."""
    corpus = _read_json(COMPARISON_CORPUS)
    cases = corpus.get("cases", [])
    if isinstance(cases, list) and cases:
        return cases
    raise RuntimeError(f"comparison corpus has no cases: {COMPARISON_CORPUS}")
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _load_kernel_requirements() -> dict[str, list[str]]:
    """Extract per-artifact-class required field lists from the kernel schema.

    Walks the schema's ``allOf`` conditional clauses of the shape
    ``{"if": {"properties": {"artifact_class": {"const": <class>}}},
    "then": {"required": [...]}}`` and maps each class const to its
    required-field names.

    Returns:
        Mapping of artifact class name -> list of required field names.
        Clauses that do not match the expected shape are skipped.
    """
    payload = _read_json(KERNEL_SCHEMA)
    out: dict[str, list[str]] = {}
    clauses = payload.get("allOf", []) if isinstance(payload, dict) else []
    for clause in clauses:
        if not isinstance(clause, dict):
            continue
        # Guard every nesting level: the previous chained-.get() version
        # raised AttributeError when any intermediate value was not a dict.
        if_clause = clause.get("if")
        then_clause = clause.get("then")
        if not isinstance(if_clause, dict) or not isinstance(then_clause, dict):
            continue
        properties = if_clause.get("properties")
        if not isinstance(properties, dict):
            continue
        artifact_class = properties.get("artifact_class")
        if not isinstance(artifact_class, dict):
            continue
        const = artifact_class.get("const")
        required = then_clause.get("required")
        if isinstance(const, str) and isinstance(required, list):
            out[const] = [str(x) for x in required if isinstance(x, str)]
    return out
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _normalize_label(value: str) -> str:
|
|
223
|
+
return re.sub(r"\s+", " ", value.strip().lower())
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _value_present(value: Any) -> bool:
|
|
227
|
+
if isinstance(value, str):
|
|
228
|
+
return bool(value.strip())
|
|
229
|
+
if isinstance(value, list):
|
|
230
|
+
if not value:
|
|
231
|
+
return False
|
|
232
|
+
return all(isinstance(item, str) and item.strip() for item in value)
|
|
233
|
+
return False
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _coverage(fields: list[str], present_map: dict[str, float]) -> float:
|
|
237
|
+
if not fields:
|
|
238
|
+
return 1.0
|
|
239
|
+
hits = sum(present_map.get(field, 0.0) for field in fields)
|
|
240
|
+
return hits / len(fields)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _mean(values: list[float]) -> float:
|
|
244
|
+
return round(sum(values) / len(values), 3) if values else 0.0
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _score_dimensions(artifact_class: str, present_map: dict[str, float], *, type_clarity: float) -> dict[str, float]:
    """Build the per-dimension score dict for one artifact.

    Six core dimensions are computed from field-group coverage, then a
    ``total_score`` (their unweighted mean) and ``ambiguity_remaining``
    (complement of required-field completeness) are appended.
    """
    required = KERNEL_REQUIREMENTS[artifact_class]
    core = {
        "artifact_type_clarity": round(type_clarity, 3),
        "objective_clarity": round(_coverage(OBJECTIVE_FIELDS[artifact_class], present_map), 3),
        "limits_clarity": round(_coverage(LIMIT_FIELDS[artifact_class], present_map), 3),
        "evaluation_clarity": round(_coverage(EVALUATION_FIELDS[artifact_class], present_map), 3),
        "handoff_readiness": round(_coverage(HANDOFF_FIELDS[artifact_class], present_map), 3),
        "class_specific_completeness": round(_coverage(required, present_map), 3),
    }
    scores = dict(core)
    scores["total_score"] = round(sum(core.values()) / 6.0, 3)
    scores["ambiguity_remaining"] = round(1.0 - core["class_specific_completeness"], 3)
    return scores
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _parse_freeform_fields(body: str) -> dict[str, bool]:
    """Scan markdown lines for ``Label: value`` pairs matching known aliases.

    Returns a map of kernel field name -> True for every field whose label
    alias appeared with a non-empty value.
    """
    label_pattern = re.compile(r"^[#>*\-\s]*([A-Za-z][A-Za-z \-_/]+):\s*(.+?)\s*$")
    found: dict[str, bool] = {}
    for raw_line in body.splitlines():
        if not raw_line.strip():
            continue
        match = label_pattern.match(raw_line)
        if match is None:
            continue
        label = _normalize_label(match.group(1))
        value = match.group(2).strip()
        if not value:
            continue
        # A label may alias several fields (e.g. "next", "scope"); credit all.
        for field, aliases in FREEFORM_LABEL_ALIASES.items():
            if label in aliases:
                found[field] = True
    return found
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _score_freeform(case: dict[str, Any]) -> dict[str, Any]:
    """Score one case's free-form markdown rendition.

    Each schema-required field found via label parsing earns the flat
    FREEFORM_FIELD_WEIGHT; an explicit artifact-type label earns
    FREEFORM_TYPE_WEIGHT toward type clarity.
    """
    artifact_class = case["artifact_class"]
    labels = _parse_freeform_fields(case["freeform_markdown"])
    required = KERNEL_REQUIREMENTS[artifact_class]
    present: dict[str, float] = {}
    for field in required:
        present[field] = FREEFORM_FIELD_WEIGHT if labels.get(field, False) else 0.0
    type_clarity = FREEFORM_TYPE_WEIGHT if labels.get("artifact_type", False) else 0.0
    return {
        "condition": "freeform",
        "artifact_class": artifact_class,
        "present_fields": sorted(f for f, s in present.items() if s > 0),
        "missing_fields": [f for f in required if present.get(f, 0.0) == 0.0],
        "field_scores": {f: round(s, 3) for f, s in present.items()},
        "dimensions": _score_dimensions(artifact_class, present, type_clarity=type_clarity),
    }
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _score_checklist(case: dict[str, Any]) -> dict[str, Any]:
    """Score one case's generic-checklist rendition.

    A required field earns the weight of the checklist slot that carries it
    (per CHECKLIST_FIELD_MAP / CHECKLIST_SOURCE_WEIGHTS) when that slot is
    populated; type clarity requires an exact artifact_type match.
    """
    artifact_class = case["artifact_class"]
    checklist = case["generic_checklist"]
    required = KERNEL_REQUIREMENTS[artifact_class]
    mapping = CHECKLIST_FIELD_MAP[artifact_class]
    present: dict[str, float] = {}
    for field in required:
        source = mapping.get(field, "")
        if source and _value_present(checklist.get(source)):
            present[field] = CHECKLIST_SOURCE_WEIGHTS.get(source, 0.5)
        else:
            present[field] = 0.0
    type_clarity = CHECKLIST_TYPE_WEIGHT if checklist.get("artifact_type") == artifact_class else 0.0
    return {
        "condition": "generic_checklist",
        "artifact_class": artifact_class,
        "present_fields": sorted(f for f, s in present.items() if s > 0),
        "missing_fields": [f for f in required if present.get(f, 0.0) == 0.0],
        "field_scores": {f: round(s, 3) for f, s in present.items()},
        "dimensions": _score_dimensions(artifact_class, present, type_clarity=type_clarity),
    }
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _score_kernel(case: dict[str, Any]) -> dict[str, Any]:
    """Validate the case's kernel artifact via the real CLI and score it.

    Writes the artifact into a throwaway repo root, runs
    ``orp kernel validate --json`` against it, and derives field presence
    (weight 1.0 per populated required field) plus type clarity from the
    artifact and the CLI's reported artifact_class.
    """
    artifact_class = case["artifact_class"]
    kernel_artifact = case["kernel_artifact"]
    with tempfile.TemporaryDirectory(prefix="orp-kernel-comparison.") as td:
        root = Path(td)
        target = root / "analysis" / f"{case['id']}.kernel.json"
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(kernel_artifact, indent=2) + "\n", encoding="utf-8")
        # check=False: a failed validation is still a scoreable outcome,
        # surfaced below via "valid" / "issues" / "exit_code".
        validate_ms, proc = _timed_orp(
            root,
            "kernel",
            "validate",
            str(target.relative_to(root)),
            "--artifact-class",
            artifact_class,
            "--json",
            check=False,
        )
        # NOTE(review): assumes the CLI emits a JSON object with
        # "artifact_result" and "ok" keys on stdout — confirm against bin/orp.js.
        payload = json.loads(proc.stdout)
        artifact_result = payload["artifact_result"]
        # Kernel fields count at full weight when populated.
        present = {
            field: (1.0 if _value_present(kernel_artifact.get(field)) else 0.0)
            for field in KERNEL_REQUIREMENTS[artifact_class]
        }
        dimensions = _score_dimensions(
            artifact_class,
            present,
            type_clarity=1.0 if artifact_result["artifact_class"] == artifact_class else 0.0,
        )
        return {
            "condition": "kernel",
            "artifact_class": artifact_class,
            "present_fields": sorted(field for field, score in present.items() if score > 0),
            # Missing fields come from the validator, not local parsing.
            "missing_fields": artifact_result.get("missing_fields", []),
            "field_scores": {field: round(score, 3) for field, score in present.items()},
            "dimensions": dimensions,
            "validate_ms": round(validate_ms, 3),
            "valid": bool(payload.get("ok")),
            "issues": artifact_result.get("issues", []),
            "exit_code": proc.returncode,
        }
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _score_case(case: dict[str, Any]) -> dict[str, Any]:
    """Score a single corpus case under all three conditions."""
    scored = {
        "freeform": _score_freeform(case),
        "generic_checklist": _score_checklist(case),
        "kernel": _score_kernel(case),
    }
    return {
        "id": case["id"],
        "domain": case["domain"],
        "artifact_class": case["artifact_class"],
        "prompt": case["prompt"],
        "conditions": scored,
    }
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _aggregate_condition(cases: list[dict[str, Any]], condition: str) -> dict[str, Any]:
    """Aggregate per-case scores for one condition into rows plus mean metrics."""
    dim_keys = [
        "artifact_type_clarity",
        "objective_clarity",
        "limits_clarity",
        "evaluation_clarity",
        "handoff_readiness",
        "class_specific_completeness",
    ]
    dims: dict[str, list[float]] = {key: [] for key in dim_keys}
    rows: list[dict[str, Any]] = []
    totals: list[float] = []
    completeness: list[float] = []
    ambiguity: list[float] = []
    for case in cases:
        scored = case["conditions"][condition]
        dimensions = scored["dimensions"]
        rows.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "total_score": dimensions["total_score"],
                "class_specific_completeness": dimensions["class_specific_completeness"],
                "ambiguity_remaining": dimensions["ambiguity_remaining"],
                "present_fields": scored["present_fields"],
                "missing_fields": scored["missing_fields"],
            }
        )
        totals.append(dimensions["total_score"])
        completeness.append(dimensions["class_specific_completeness"])
        ambiguity.append(dimensions["ambiguity_remaining"])
        for key in dim_keys:
            dims[key].append(dimensions[key])
    return {
        "condition": condition,
        "cases_total": len(rows),
        "rows": rows,
        "mean_total_score": _mean(totals),
        "mean_class_specific_completeness": _mean(completeness),
        "mean_ambiguity_remaining": _mean(ambiguity),
        "mean_dimension_scores": {key: _mean(scores) for key, scores in dims.items()},
    }
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _pairwise(cases: list[dict[str, Any]], left: str, right: str) -> dict[str, Any]:
    """Compare two conditions case-by-case on total score.

    A positive delta (left minus right) is a win for *left*; deltas are
    rounded to three decimals before classification so the win/tie/loss
    counts agree with the reported per-case deltas.
    """
    outcome_counts = {"win": 0, "tie": 0, "loss": 0}
    deltas: list[float] = []
    by_case: list[dict[str, Any]] = []
    for case in cases:
        left_score = case["conditions"][left]["dimensions"]["total_score"]
        right_score = case["conditions"][right]["dimensions"]["total_score"]
        delta = round(left_score - right_score, 3)
        deltas.append(delta)
        if delta > 0:
            outcome = "win"
        elif delta < 0:
            outcome = "loss"
        else:
            outcome = "tie"
        outcome_counts[outcome] += 1
        by_case.append(
            {
                "id": case["id"],
                "domain": case["domain"],
                "artifact_class": case["artifact_class"],
                "left_score": left_score,
                "right_score": right_score,
                "delta": delta,
                "outcome": outcome,
            }
        )
    return {
        "left": left,
        "right": right,
        "wins": outcome_counts["win"],
        "ties": outcome_counts["tie"],
        "losses": outcome_counts["loss"],
        "mean_total_score_delta": _mean(deltas),
        "by_case": by_case,
    }
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def _gather_metadata() -> dict[str, Any]:
    """Capture run provenance: package version, git state, runtime versions."""
    version = _read_json(REPO_ROOT / "package.json")["version"]
    git_commit = _run(["git", "rev-parse", "HEAD"]).stdout.strip()
    git_branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]).stdout.strip()
    node = _run(["node", "--version"]).stdout.strip()
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": git_commit,
        "repo_branch": git_branch,
        "package_version": version,
        "python_version": sys.version.split()[0],
        "node_version": node,
        "platform": platform.platform(),
    }
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def build_report() -> dict[str, Any]:
    """Score every corpus case under all conditions and assemble the report.

    Returns a JSON-serializable dict containing run metadata, corpus stats,
    per-condition aggregates, pairwise comparisons, pass/fail claims, and a
    summary block consumed by main() for the exit status.
    """
    cases = [_score_case(case) for case in _load_cases()]
    domains = sorted({case["domain"] for case in cases})
    classes = sorted({case["artifact_class"] for case in cases})

    conditions = {condition: _aggregate_condition(cases, condition) for condition in CONDITIONS}
    pairwise = {
        "kernel_vs_generic_checklist": _pairwise(cases, "kernel", "generic_checklist"),
        "kernel_vs_freeform": _pairwise(cases, "kernel", "freeform"),
        "generic_checklist_vs_freeform": _pairwise(cases, "generic_checklist", "freeform"),
    }

    # Each claim is an explicit, machine-checkable assertion; a dominance
    # claim passes only with a higher mean AND zero per-case losses.
    claims = [
        {
            "id": "matched_internal_corpus_exists",
            "claim": "ORP has a matched internal comparison corpus spanning multiple domains and all seven kernel artifact classes.",
            "status": "pass" if len(cases) >= 7 and len(domains) >= 5 and len(classes) >= 7 else "fail",
        },
        {
            "id": "kernel_outscores_generic_checklist_on_matched_corpus",
            "claim": "On the matched internal comparison corpus, kernel artifacts achieve higher mean structural scores than generic checklist artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_total_score"] > conditions["generic_checklist"]["mean_total_score"]
            and pairwise["kernel_vs_generic_checklist"]["losses"] == 0
            else "fail",
        },
        {
            "id": "kernel_outscores_freeform_on_matched_corpus",
            "claim": "On the matched internal comparison corpus, kernel artifacts achieve higher mean structural scores than free-form artifacts.",
            "status": "pass"
            if conditions["kernel"]["mean_total_score"] > conditions["freeform"]["mean_total_score"]
            and pairwise["kernel_vs_freeform"]["losses"] == 0
            else "fail",
        },
        {
            "id": "generic_checklist_improves_on_freeform_for_structure",
            "claim": "On the matched internal comparison corpus, a generic checklist condition improves structural scores over free-form artifacts.",
            "status": "pass"
            if conditions["generic_checklist"]["mean_total_score"] > conditions["freeform"]["mean_total_score"]
            and pairwise["generic_checklist_vs_freeform"]["losses"] == 0
            else "fail",
        },
        {
            "id": "kernel_preserves_full_required_coverage",
            "claim": "On the matched internal comparison corpus, kernel artifacts preserve full class-specific required-field coverage.",
            "status": "pass"
            if conditions["kernel"]["mean_class_specific_completeness"] == 1.0
            else "fail",
        },
    ]

    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_comparison_report",
        "metadata": _gather_metadata(),
        "corpus": {
            "source": str(COMPARISON_CORPUS.relative_to(REPO_ROOT)),
            "cases_total": len(cases),
            "domains_total": len(domains),
            "domains": domains,
            "artifact_classes_total": len(classes),
            "artifact_classes": classes,
        },
        "conditions": conditions,
        "pairwise": pairwise,
        "claims": claims,
        "summary": {
            "all_claims_pass": all(claim["status"] == "pass" for claim in claims),
            "kernel_mean_total_score": conditions["kernel"]["mean_total_score"],
            "generic_checklist_mean_total_score": conditions["generic_checklist"]["mean_total_score"],
            "freeform_mean_total_score": conditions["freeform"]["mean_total_score"],
        },
    }
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def main() -> int:
    """CLI entry point.

    Builds the comparison report, optionally writes it to ``--out``
    (resolved against the repo root when relative), always prints it to
    stdout, and returns 0 only when every claim passes.
    """
    parser = argparse.ArgumentParser(
        description="Run a matched internal comparison between free-form, generic checklist, and ORP kernel artifacts."
    )
    parser.add_argument("--out", default="", help="Optional JSON output path")
    options = parser.parse_args()

    report = build_report()
    serialized = json.dumps(report, indent=2) + "\n"
    if options.out:
        destination = Path(options.out)
        if not destination.is_absolute():
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(serialized, encoding="utf-8")
    print(serialized, end="")
    if report["summary"]["all_claims_pass"]:
        return 0
    return 1
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
# Loaded once at import time (after all helpers are defined): maps each
# artifact class to the field names the kernel schema requires for it.
KERNEL_REQUIREMENTS = _load_kernel_requirements()
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
if __name__ == "__main__":
    # SystemExit propagates main()'s status code to the shell.
    raise SystemExit(main())
|