open-research-protocol 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -2
- package/cli/orp.py +504 -27
- package/docs/AGENT_LOOP.md +2 -0
- package/docs/CANONICAL_CLI_BOUNDARY.md +20 -1
- package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +353 -0
- package/docs/ORP_REASONING_KERNEL_V0_1.md +499 -0
- package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +197 -0
- package/examples/README.md +2 -0
- package/examples/kernel/trace-widget.task.kernel.yml +18 -0
- package/examples/orp.reasoning-kernel.starter.yml +61 -0
- package/package.json +1 -1
- package/scripts/orp-kernel-benchmark.py +452 -0
- package/spec/v1/kernel.schema.json +286 -0
- package/spec/v1/orp.config.schema.json +59 -0
- package/spec/v1/packet.schema.json +97 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
version: "1"
|
|
2
|
+
|
|
3
|
+
project:
|
|
4
|
+
name: reasoning-kernel-starter
|
|
5
|
+
repo_root: .
|
|
6
|
+
canonical_paths:
|
|
7
|
+
code: src/
|
|
8
|
+
analysis: analysis/
|
|
9
|
+
|
|
10
|
+
lifecycle:
|
|
11
|
+
claim_status_map:
|
|
12
|
+
Draft: draft
|
|
13
|
+
In review: ready
|
|
14
|
+
Verified: reviewed
|
|
15
|
+
Blocked: blocked
|
|
16
|
+
Retracted: retracted
|
|
17
|
+
atom_status_map:
|
|
18
|
+
todo: draft
|
|
19
|
+
in_progress: ready
|
|
20
|
+
blocked: blocked
|
|
21
|
+
done: reviewed
|
|
22
|
+
|
|
23
|
+
gates:
|
|
24
|
+
- id: trace_widget_task_shape
|
|
25
|
+
description: Validate that the trace-widget request has been promoted into a solid task artifact.
|
|
26
|
+
phase: structure_kernel
|
|
27
|
+
command: echo ORP_KERNEL_OK
|
|
28
|
+
pass:
|
|
29
|
+
exit_codes: [0]
|
|
30
|
+
stdout_must_contain:
|
|
31
|
+
- ORP_KERNEL_OK
|
|
32
|
+
kernel:
|
|
33
|
+
mode: hard
|
|
34
|
+
artifacts:
|
|
35
|
+
- path: examples/kernel/trace-widget.task.kernel.yml
|
|
36
|
+
artifact_class: task
|
|
37
|
+
evidence:
|
|
38
|
+
status: process_only
|
|
39
|
+
note: The kernel artifact captures task structure for promotion, not evidence.
|
|
40
|
+
paths:
|
|
41
|
+
- examples/kernel/trace-widget.task.kernel.yml
|
|
42
|
+
on_fail: stop
|
|
43
|
+
|
|
44
|
+
- id: smoke
|
|
45
|
+
description: Basic starter smoke gate.
|
|
46
|
+
phase: verification
|
|
47
|
+
command: echo ORP_SMOKE
|
|
48
|
+
pass:
|
|
49
|
+
exit_codes: [0]
|
|
50
|
+
stdout_must_contain:
|
|
51
|
+
- ORP_SMOKE
|
|
52
|
+
on_fail: stop
|
|
53
|
+
|
|
54
|
+
profiles:
|
|
55
|
+
default:
|
|
56
|
+
description: Minimal kernel-aware starter profile.
|
|
57
|
+
mode: discovery
|
|
58
|
+
packet_kind: problem_scope
|
|
59
|
+
gate_ids:
|
|
60
|
+
- trace_widget_task_shape
|
|
61
|
+
- smoke
|
package/package.json
CHANGED
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import platform
|
|
8
|
+
import statistics
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
import tempfile
|
|
12
|
+
import time
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Directory two levels up from this file's resolved location (the parent of
# the directory that contains this script).
REPO_ROOT = Path(__file__).resolve().parents[1]

# Command prefix for the ORP CLI. The script path is relative; _run_orp
# launches it through _run, whose default working directory is REPO_ROOT.
CLI = ["node", "bin/orp.js"]

# Artifact classes exercised by the scaffold/validate roundtrip benchmark.
ARTIFACT_CLASSES = [
    "task", "decision", "hypothesis", "experiment",
    "checkpoint", "policy", "result",
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _run(
    args: list[str],
    *,
    cwd: Path = REPO_ROOT,
    check: bool = True,
) -> subprocess.CompletedProcess[str]:
    """Execute ``args`` as a subprocess and return the completed process.

    stdout and stderr are captured and decoded as text.

    Args:
        args: Command and arguments, passed to ``subprocess.run`` as a list.
        cwd: Working directory for the child process (defaults to REPO_ROOT).
        check: When true, a non-zero exit status is turned into an exception.

    Returns:
        The ``subprocess.CompletedProcess`` with captured text output.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero;
            the message carries the command line, stdout and stderr.
    """
    completed = subprocess.run(args, cwd=str(cwd), capture_output=True, text=True)
    failed = completed.returncode != 0
    if failed and check:
        command_line = " ".join(args)
        raise RuntimeError(
            f"command failed: {command_line}\nstdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
        )
    return completed
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _run_orp(repo_root: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Invoke the ORP CLI against ``repo_root`` with the given sub-command arguments.

    The command line is ``CLI + ["--repo-root", <repo_root>] + args``; ``check``
    is forwarded to ``_run`` and the process runs in ``_run``'s default
    working directory.
    """
    command = list(CLI)
    command.extend(["--repo-root", str(repo_root)])
    command.extend(args)
    return _run(command, check=check)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _timed_orp(repo_root: Path, *args: str, check: bool = True) -> tuple[float, subprocess.CompletedProcess[str]]:
    """Run an ORP CLI command and measure its wall-clock duration.

    Returns:
        A ``(elapsed_ms, process)`` pair, where ``elapsed_ms`` is the
        ``time.perf_counter`` delta around the ``_run_orp`` call, in
        milliseconds.
    """
    t0 = time.perf_counter()
    completed = _run_orp(repo_root, *args, check=check)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0
    return elapsed_ms, completed
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _write_json(path: Path, payload: dict[str, Any]) -> None:
    """Serialize ``payload`` to ``path`` as 2-space-indented JSON plus a trailing newline.

    Missing parent directories are created first; the file is written as UTF-8.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2)
    path.write_text(f"{rendered}\n", encoding="utf-8")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _stats(values: list[float]) -> dict[str, float]:
    """Summarize millisecond samples as mean, median, min and max.

    Each figure is rounded to three decimal places and returned under the
    keys ``mean_ms``, ``median_ms``, ``min_ms`` and ``max_ms`` (in that order).
    """
    reducers = {
        "mean_ms": statistics.mean,
        "median_ms": statistics.median,
        "min_ms": min,
        "max_ms": max,
    }
    return {label: round(reduce(values), 3) for label, reduce in reducers.items()}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _benchmark_init_starter(iterations: int) -> dict[str, Any]:
    """Time the starter bootstrap flow over ``iterations`` fresh repositories.

    Each iteration creates a temporary directory, runs ``git init`` in it and
    then times three ORP CLI calls: ``init``, ``kernel validate`` on
    ``analysis/orp.kernel.task.yml``, and ``gate run --profile default``.

    Returns:
        A dict with the iteration count, per-command timing statistics, the
        mean-latency targets, whether each target was met, and up to two
        sample run-record values taken from the gate run output.

    Raises:
        RuntimeError: If a command exits non-zero, or if its JSON output does
            not report success (``ok`` for init/validate, ``overall == "PASS"``
            for the gate run).
    """
    samples: dict[str, list[float]] = {"init": [], "validate": [], "gate_run": []}
    recorded_runs: list[str] = []

    for _ in range(iterations):
        with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-init.") as workdir:
            sandbox = Path(workdir)
            _run(["git", "init", str(sandbox)])

            init_elapsed, init_run = _timed_orp(sandbox, "init", "--json")
            init_report = json.loads(init_run.stdout)

            validate_elapsed, validate_run = _timed_orp(
                sandbox, "kernel", "validate", "analysis/orp.kernel.task.yml", "--json"
            )
            validate_report = json.loads(validate_run.stdout)

            gate_elapsed, gate_run = _timed_orp(sandbox, "gate", "run", "--profile", "default", "--json")
            gate_report = json.loads(gate_run.stdout)

            # Payload-level checks happen only after all three commands ran.
            if not init_report.get("ok"):
                raise RuntimeError("orp init benchmark did not report ok=true")
            if not validate_report.get("ok"):
                raise RuntimeError("starter kernel validate benchmark did not report ok=true")
            if gate_report.get("overall") != "PASS":
                raise RuntimeError("starter kernel gate benchmark did not pass")

            samples["init"].append(init_elapsed)
            samples["validate"].append(validate_elapsed)
            samples["gate_run"].append(gate_elapsed)
            recorded_runs.append(gate_report["run_record"])

    targets = {
        "init_mean_lt_ms": 350.0,
        "validate_mean_lt_ms": 200.0,
        "gate_mean_lt_ms": 300.0,
    }
    observed = {phase: _stats(times) for phase, times in samples.items()}
    # Maps each observed phase to the target key it is compared against.
    target_for_phase = {
        "init": "init_mean_lt_ms",
        "validate": "validate_mean_lt_ms",
        "gate_run": "gate_mean_lt_ms",
    }
    meets_targets = {
        phase: observed[phase]["mean_ms"] < targets[target_key]
        for phase, target_key in target_for_phase.items()
    }
    return {
        "iterations": iterations,
        "observed": observed,
        "targets": targets,
        "meets_targets": meets_targets,
        "sample_run_records": recorded_runs[:2],
    }
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _benchmark_artifact_roundtrip() -> dict[str, Any]:
    """Scaffold and validate one kernel artifact per class, timing both steps.

    For every entry of ``ARTIFACT_CLASSES`` a fresh temporary directory is
    used: ``kernel scaffold`` writes ``analysis/<class>.kernel.yml`` and
    ``kernel validate`` is then run on that file.

    Returns:
        A dict with the number of classes processed, one timing row per
        class, aggregate statistics, the mean-latency targets and whether
        each target was met.

    Raises:
        RuntimeError: If either command exits non-zero or its JSON output
            does not report ``ok``.
    """
    per_class: list[dict[str, Any]] = []
    scaffold_samples: list[float] = []
    validate_samples: list[float] = []

    for artifact_class in ARTIFACT_CLASSES:
        with tempfile.TemporaryDirectory(prefix=f"orp-kernel-bench-{artifact_class}.") as workdir:
            sandbox = Path(workdir)
            artifact_path = f"analysis/{artifact_class}.kernel.yml"
            scaffold_args = (
                "kernel",
                "scaffold",
                "--artifact-class",
                artifact_class,
                "--out",
                artifact_path,
                "--name",
                f"{artifact_class} benchmark",
                "--json",
            )
            scaffold_elapsed, scaffold_run = _timed_orp(sandbox, *scaffold_args)
            validate_elapsed, validate_run = _timed_orp(sandbox, "kernel", "validate", artifact_path, "--json")

            scaffold_report = json.loads(scaffold_run.stdout)
            validate_report = json.loads(validate_run.stdout)
            both_ok = scaffold_report.get("ok") and validate_report.get("ok")
            if not both_ok:
                raise RuntimeError(f"roundtrip benchmark failed for artifact_class={artifact_class}")

            scaffold_samples.append(scaffold_elapsed)
            validate_samples.append(validate_elapsed)
            per_class.append(
                {
                    "artifact_class": artifact_class,
                    "scaffold_ms": round(scaffold_elapsed, 3),
                    "validate_ms": round(validate_elapsed, 3),
                }
            )

    observed = {
        "scaffold": _stats(scaffold_samples),
        "validate": _stats(validate_samples),
    }
    targets = {
        "scaffold_mean_lt_ms": 200.0,
        "validate_mean_lt_ms": 200.0,
    }
    meets_targets = {
        step: observed[step]["mean_ms"] < targets[f"{step}_mean_lt_ms"]
        for step in ("scaffold", "validate")
    }
    return {
        "artifact_classes_total": len(per_class),
        "rows": per_class,
        "observed": observed,
        "targets": targets,
        "meets_targets": meets_targets,
    }
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _benchmark_gate_modes() -> dict[str, Any]:
    """Compare gate behavior for hard, soft and legacy (no kernel block) gates.

    A temporary repository receives a task artifact that carries only
    ``schema_version``, ``artifact_class``, ``object``, ``goal`` and
    ``boundary`` (the file is named ``invalid-task``), plus a config with
    three ``structure_kernel`` gates: one with ``kernel.mode == "hard"``, one
    with ``"soft"``, and one with no ``kernel`` section. Each profile is run
    through ``gate run`` and the first result of its run record is inspected.

    Returns:
        A dict with timing, exit code and outcome per mode, and a
        ``meets_expectations`` map: hard must exit 1 with ``FAIL``, soft must
        exit 0 with ``PASS`` while still reporting the artifact invalid, and
        legacy must exit 0 with ``PASS`` and no ``kernel_validation`` entry.

    Raises:
        RuntimeError: If the soft or legacy run exits non-zero (only the hard
            run is allowed to fail).
    """
    config_name = "orp.kernel.bench.json"
    # (profile name, profile description, kernel mode; None = no kernel block)
    variants = [
        ("hard", "hard kernel gate", "hard"),
        ("soft", "soft kernel gate", "soft"),
        ("legacy", "legacy structure kernel gate", None),
    ]
    names = [name for name, _, _ in variants]

    profiles: dict[str, Any] = {}
    gates: list[dict[str, Any]] = []
    for name, description, kernel_mode in variants:
        gate_id = f"kernel_{name}"
        profiles[name] = {
            "description": description,
            "mode": "test",
            "packet_kind": "problem_scope",
            "gate_ids": [gate_id],
        }
        gate: dict[str, Any] = {
            "id": gate_id,
            "phase": "structure_kernel",
            "command": "true",
            "pass": {"exit_codes": [0]},
        }
        if kernel_mode is not None:
            gate["kernel"] = {
                "mode": kernel_mode,
                "artifacts": [
                    {
                        "path": "analysis/invalid-task.kernel.json",
                        "artifact_class": "task",
                    }
                ],
            }
        gates.append(gate)

    with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-gates.") as workdir:
        sandbox = Path(workdir)
        _write_json(
            sandbox / "analysis" / "invalid-task.kernel.json",
            {
                "schema_version": "1.0.0",
                "artifact_class": "task",
                "object": "terminal trace widget",
                "goal": "surface lane state and drift",
                "boundary": "terminal-first workflow",
            },
        )
        _write_json(sandbox / config_name, {"profiles": profiles, "gates": gates})

        # Run all three profiles first, then parse, then read run records.
        elapsed: dict[str, float] = {}
        procs: dict[str, Any] = {}
        for name in names:
            elapsed[name], procs[name] = _timed_orp(
                sandbox,
                "--config",
                config_name,
                "gate",
                "run",
                "--profile",
                name,
                "--json",
                check=(name != "hard"),  # the hard run is expected to exit non-zero
            )

        payloads = {name: json.loads(procs[name].stdout) for name in names}

        first_results: dict[str, Any] = {}
        for name in names:
            record_file = sandbox / payloads[name]["run_record"]
            first_results[name] = json.loads(record_file.read_text(encoding="utf-8"))["results"][0]

        hard_validation = first_results["hard"]["kernel_validation"]
        hard_valid = hard_validation["valid"]
        hard_missing = hard_validation["artifacts"][0]["missing_fields"]
        soft_valid = first_results["soft"]["kernel_validation"]["valid"]
        legacy_has_validation = "kernel_validation" in first_results["legacy"]

        return {
            "hard_mode": {
                "ms": round(elapsed["hard"], 3),
                "exit_code": procs["hard"].returncode,
                "overall": payloads["hard"]["overall"],
                "kernel_valid": hard_valid,
                "missing_fields": hard_missing,
            },
            "soft_mode": {
                "ms": round(elapsed["soft"], 3),
                "exit_code": procs["soft"].returncode,
                "overall": payloads["soft"]["overall"],
                "kernel_valid": soft_valid,
            },
            "legacy_compatibility": {
                "ms": round(elapsed["legacy"], 3),
                "exit_code": procs["legacy"].returncode,
                "overall": payloads["legacy"]["overall"],
                "has_kernel_validation": legacy_has_validation,
            },
            "meets_expectations": {
                "hard_blocks_invalid_artifact": all(
                    (
                        procs["hard"].returncode == 1,
                        payloads["hard"]["overall"] == "FAIL",
                        hard_valid is False,
                    )
                ),
                "soft_allows_invalid_artifact_with_advisory": all(
                    (
                        procs["soft"].returncode == 0,
                        payloads["soft"]["overall"] == "PASS",
                        soft_valid is False,
                    )
                ),
                "legacy_structure_kernel_remains_compatible": all(
                    (
                        procs["legacy"].returncode == 0,
                        payloads["legacy"]["overall"] == "PASS",
                        not legacy_has_validation,
                    )
                ),
            },
        }
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _gather_metadata() -> dict[str, Any]:
    """Collect environment details that identify this benchmark run.

    The package version is read from ``package.json`` under REPO_ROOT; the
    Node version comes from ``node --version``. Git commit and branch are
    best-effort: when REPO_ROOT is not inside a git work tree with at least
    one commit (for example, an unpacked published package), ``git rev-parse``
    exits non-zero and the fields are reported as ``"unknown"`` instead of
    aborting the report after every benchmark has already been executed.

    Returns:
        A dict with ``generated_at_utc``, ``repo_commit``, ``repo_branch``,
        ``package_version``, ``python_version``, ``node_version`` and
        ``platform``.

    Raises:
        RuntimeError: If ``node --version`` exits non-zero.
    """

    def _git_or_unknown(*git_args: str) -> str:
        # check=False: a missing repository must not discard the whole report.
        proc = _run(["git", *git_args], check=False)
        value = proc.stdout.strip()
        return value if proc.returncode == 0 and value else "unknown"

    package_version = json.loads((REPO_ROOT / "package.json").read_text(encoding="utf-8"))["version"]
    commit = _git_or_unknown("rev-parse", "HEAD")
    branch = _git_or_unknown("rev-parse", "--abbrev-ref", "HEAD")
    node_version = _run(["node", "--version"]).stdout.strip()
    return {
        "generated_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "repo_commit": commit,
        "repo_branch": branch,
        "package_version": package_version,
        "python_version": sys.version.split()[0],
        "node_version": node_version,
        "platform": platform.platform(),
    }
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def build_report(iterations: int) -> dict[str, Any]:
    """Run every benchmark and assemble the kernel validation report.

    Args:
        iterations: Number of iterations for the starter bootstrap benchmark.

    Returns:
        A dict with ``schema_version``, ``kind``, run ``metadata``, the three
        raw ``benchmarks``, a list of ``claims`` (each with an id, statement,
        pass/fail status and evidence references) and a ``summary``.
    """
    init_benchmark = _benchmark_init_starter(iterations)
    roundtrip_benchmark = _benchmark_artifact_roundtrip()
    gate_mode_benchmark = _benchmark_gate_modes()

    def _verdict(ok: bool) -> str:
        return "pass" if ok else "fail"

    expectations = gate_mode_benchmark["meets_expectations"]
    class_count = roundtrip_benchmark["artifact_classes_total"]
    performance_ok = all(init_benchmark["meets_targets"].values()) and all(
        roundtrip_benchmark["meets_targets"].values()
    )

    bootstrap_claim = {
        "id": "starter_kernel_bootstrap",
        "claim": "orp init seeds a valid starter kernel artifact and a passing default structure_kernel gate.",
        # Unconditional: _benchmark_init_starter raises instead of returning
        # when init, validate or the gate run does not succeed.
        "status": "pass",
        "evidence": [
            "benchmarks.init_starter_kernel",
            "cli/orp.py",
            "tests/test_orp_init.py",
        ],
    }
    roundtrip_claim = {
        "id": "typed_artifact_roundtrip",
        "claim": "All seven v0.1 artifact classes can be scaffolded and validated through the CLI.",
        "status": _verdict(class_count == 7),
        "evidence": [
            "benchmarks.artifact_roundtrip",
            "spec/v1/kernel.schema.json",
            "tests/test_orp_kernel.py",
        ],
    }
    enforcement_claim = {
        "id": "promotion_enforcement_modes",
        "claim": "Hard mode blocks invalid promotable artifacts, while soft mode records advisory issues without blocking.",
        "status": _verdict(
            expectations["hard_blocks_invalid_artifact"]
            and expectations["soft_allows_invalid_artifact_with_advisory"]
        ),
        "evidence": [
            "benchmarks.gate_modes",
            "tests/test_orp_kernel.py",
        ],
    }
    legacy_claim = {
        "id": "legacy_structure_kernel_compatibility",
        "claim": "Existing structure_kernel gates without explicit kernel config remain compatible.",
        "status": _verdict(expectations["legacy_structure_kernel_remains_compatible"]),
        "evidence": [
            "benchmarks.gate_modes",
            "cli/orp.py",
        ],
    }
    ergonomics_claim = {
        "id": "local_cli_kernel_ergonomics",
        "claim": "One-shot kernel CLI operations remain within human-scale local ergonomics targets on the reference machine.",
        "status": _verdict(performance_ok),
        "evidence": [
            "benchmarks.init_starter_kernel",
            "benchmarks.artifact_roundtrip",
        ],
    }
    claims = [bootstrap_claim, roundtrip_claim, enforcement_claim, legacy_claim, ergonomics_claim]

    # Metadata (including the timestamp) is gathered after the benchmarks ran.
    metadata = _gather_metadata()
    summary = {
        "all_claims_pass": all(entry["status"] == "pass" for entry in claims),
        "artifact_classes_total": class_count,
        "all_performance_targets_met": performance_ok,
    }
    return {
        "schema_version": "1.0.0",
        "kind": "orp_reasoning_kernel_validation_report",
        "metadata": metadata,
        "benchmarks": {
            "init_starter_kernel": init_benchmark,
            "artifact_roundtrip": roundtrip_benchmark,
            "gate_modes": gate_mode_benchmark,
        },
        "claims": claims,
        "summary": summary,
    }
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def main() -> int:
    """Parse command-line options, run the benchmarks and emit the JSON report.

    The report is always printed to stdout. With ``--out`` it is also written
    to that path; a relative path is resolved against REPO_ROOT. ``--quick``
    forces a single bootstrap iteration, otherwise ``--iterations`` is used
    with a floor of 1.

    Returns:
        0 when every claim in the report passed, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Benchmark and validate ORP Reasoning Kernel v0.1")
    parser.add_argument("--out", default="", help="Optional JSON output path")
    parser.add_argument("--iterations", type=int, default=5, help="Iterations for bootstrap benchmark")
    parser.add_argument("--quick", action="store_true", help="Use a single bootstrap iteration for fast checks")
    options = parser.parse_args()

    if options.quick:
        iterations = 1
    else:
        iterations = max(1, options.iterations)

    report = build_report(iterations)
    rendered = f"{json.dumps(report, indent=2)}\n"

    if options.out:
        destination = Path(options.out)
        if not destination.is_absolute():
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")

    print(rendered, end="")
    if report["summary"]["all_claims_pass"]:
        return 0
    return 1
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|