open-research-protocol 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +10 -0
  2. package/cli/orp.py +668 -43
  3. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  5. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  6. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  7. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  8. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  9. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  10. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  11. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  12. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +471 -0
  13. package/docs/ORP_REASONING_KERNEL_V0_1.md +15 -0
  14. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  15. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +925 -0
  25. package/examples/README.md +2 -0
  26. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  27. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  28. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  29. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  30. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  31. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  32. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  33. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  34. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  35. package/package.json +4 -1
  36. package/scripts/orp-kernel-agent-pilot.py +673 -0
  37. package/scripts/orp-kernel-agent-replication.py +307 -0
  38. package/scripts/orp-kernel-benchmark.py +921 -0
  39. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  40. package/scripts/orp-kernel-ci-check.py +138 -0
  41. package/scripts/orp-kernel-comparison.py +592 -0
  42. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  43. package/scripts/orp-kernel-pickup.py +401 -0
  44. package/spec/v1/kernel-extension.schema.json +96 -0
  45. package/spec/v1/kernel-proposal.schema.json +115 -0
  46. package/spec/v1/kernel.schema.json +2 -1
@@ -0,0 +1,921 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import importlib.util
6
+ import json
7
+ from pathlib import Path
8
+ import platform
9
+ import statistics
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ import time
14
+ from typing import Any
15
+
16
+
17
# Directory one level above the folder that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Command prefix for the ORP CLI; the script path is relative, so it is
# resolved against the working directory of the spawned process.
CLI = ["node", "bin/orp.js"]
# Location of the example kernel fixtures validated by the corpus benchmark.
CORPUS_ROOT = REPO_ROOT.joinpath("examples", "kernel", "corpus")
# Artifact classes exercised by the scaffold/validate roundtrip benchmark.
ARTIFACT_CLASSES = "task decision hypothesis experiment checkpoint policy result".split()
29
# Baseline payload for each artifact class.  The requirement-enforcement and
# mutation-stress benchmarks derive their invalid variants from these entries.
# Every payload begins with the same two header keys (``schema_version`` and
# ``artifact_class``) followed by the class-specific fields listed below.
VALID_REQUIREMENT_FIXTURES: dict[str, dict[str, Any]] = {
    class_name: {
        "schema_version": "1.0.0",
        "artifact_class": class_name,
        **class_fields,
    }
    for class_name, class_fields in {
        "task": {
            "object": "terminal trace widget",
            "goal": "surface lane drift",
            "boundary": "terminal-first lane visibility",
            "constraints": ["low friction"],
            "success_criteria": ["operator spots drift quickly"],
        },
        "decision": {
            "question": "what should the home screen emphasize first?",
            "chosen_path": "linked projects first",
            "rejected_alternatives": ["idea board default"],
            "rationale": "active work should be foregrounded",
            "consequences": ["idea browsing becomes secondary navigation"],
        },
        "hypothesis": {
            "claim": "drift summaries reduce missed stalled lanes",
            "boundary": "terminal-first multi-lane workflows",
            "assumptions": ["operators consult summaries while working"],
            "test_path": "compare stalled-lane detection with and without summaries",
            "falsifiers": ["no measurable pickup improvement"],
        },
        "experiment": {
            "objective": "measure whether kernel tasks improve handoff pickup",
            "method": "run matched handoff trials",
            "inputs": ["task prompts", "reviewers"],
            "outputs": ["pickup scores", "clarification counts"],
            "evidence_expectations": ["ratings", "artifact corpus"],
            "interpretation_limits": ["small internal sample"],
        },
        "checkpoint": {
            "completed_unit": "restored canonical runner routing",
            "current_state": "linked project and primary session are synchronized",
            "risks": ["inactive machines may still need a sync"],
            "next_handoff_target": "rerun runner sync on active machines",
            "artifact_refs": [".git/orp/link/project.json", "orp/HANDOFF.md"],
        },
        "policy": {
            "scope": "hosted runner job pickup",
            "rule": "route only to linked projects with routeable local sessions",
            "rationale": "prevent unroutable job claims",
            "invariants": ["claimed jobs must have a real local execution target"],
            "enforcement_surface": "runner sync poll and work lifecycle",
        },
        "result": {
            "claim": "ORP ships a real reasoning kernel with enforceable promotion semantics",
            "evidence_paths": [
                "docs/ORP_REASONING_KERNEL_V0_1.md",
                "docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md",
            ],
            "status": "shipped in ORP CLI",
            "interpretation_limits": ["comparative superiority is not yet proven"],
            "next_follow_up": "run comparative artifact and handoff studies",
        },
    }.items()
}
98
+
99
+
100
def _run(
    args: list[str],
    *,
    cwd: Path = REPO_ROOT,
    check: bool = True,
) -> subprocess.CompletedProcess[str]:
    """Execute ``args`` as a subprocess and hand back the completed process.

    stdout and stderr are captured as text.  ``cwd`` is the working directory
    of the child (``REPO_ROOT`` unless overridden).

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero; the
            message carries the command line and both captured streams.
    """
    completed = subprocess.run(args, cwd=str(cwd), capture_output=True, text=True)
    if completed.returncode == 0 or not check:
        return completed
    command_line = " ".join(args)
    raise RuntimeError(
        f"command failed: {command_line}\nstdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
    )
117
+
118
+
119
def _run_orp(repo_root: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Invoke the ORP CLI with ``--repo-root repo_root`` followed by ``args``.

    The command is ``CLI`` + ``--repo-root <repo_root>`` + ``args``; it is
    executed through ``_run`` with that helper's default working directory,
    and ``check`` is forwarded unchanged.
    """
    command = list(CLI)
    command += ["--repo-root", str(repo_root)]
    command += args
    return _run(command, check=check)
121
+
122
+
123
def _timed_orp(repo_root: Path, *args: str, check: bool = True) -> tuple[float, subprocess.CompletedProcess[str]]:
    """Run an ORP CLI command and measure its wall-clock duration.

    Returns:
        ``(elapsed_ms, completed_process)`` where ``elapsed_ms`` is the
        ``time.perf_counter`` delta around the call, in milliseconds.
    """
    t0 = time.perf_counter()
    completed = _run_orp(repo_root, *args, check=check)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0
    return elapsed_ms, completed
127
+
128
+
129
+ def _write_json(path: Path, payload: dict[str, Any]) -> None:
130
+ path.parent.mkdir(parents=True, exist_ok=True)
131
+ path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
132
+
133
+
134
+ def _stats(values: list[float]) -> dict[str, float]:
135
+ return {
136
+ "mean_ms": round(statistics.mean(values), 3),
137
+ "median_ms": round(statistics.median(values), 3),
138
+ "min_ms": round(min(values), 3),
139
+ "max_ms": round(max(values), 3),
140
+ }
141
+
142
+
143
def _load_cli_module() -> Any:
    """Import ``cli/orp.py`` from the repository as a standalone module object.

    The file is loaded by path under the module name
    ``orp_cli_kernel_benchmark`` and executed; the resulting module is
    returned to the caller.

    Raises:
        RuntimeError: If no import spec or loader can be created for the file.
    """
    cli_source = REPO_ROOT.joinpath("cli", "orp.py")
    module_spec = importlib.util.spec_from_file_location("orp_cli_kernel_benchmark", cli_source)
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise RuntimeError(f"failed to load CLI module from {cli_source}")
    loaded = importlib.util.module_from_spec(module_spec)
    loader.exec_module(loaded)
    return loaded
151
+
152
+
153
def _load_kernel_schema_requirements() -> dict[str, list[str]]:
    """Read the per-class required fields out of ``spec/v1/kernel.schema.json``.

    Walks the schema's top-level ``allOf`` clauses.  A clause contributes an
    entry when its ``if.properties.artifact_class.const`` is a string and its
    ``then.required`` is a list; only the string members of that list are
    kept.

    Returns:
        Mapping of artifact class name to its list of required field names.
    """
    schema_file = REPO_ROOT.joinpath("spec", "v1", "kernel.schema.json")
    schema = json.loads(schema_file.read_text(encoding="utf-8"))

    requirements: dict[str, list[str]] = {}
    for clause in schema.get("allOf", []):
        if not isinstance(clause, dict):
            continue
        # Descend if -> properties -> artifact_class, defaulting to {} at each hop.
        selector = clause.get("if", {})
        for hop in ("properties", "artifact_class"):
            selector = selector.get(hop, {})
        class_name = selector.get("const")
        required_fields = clause.get("then", {}).get("required")
        if not isinstance(class_name, str) or not isinstance(required_fields, list):
            continue
        requirements[class_name] = [str(name) for name in required_fields if isinstance(name, str)]
    return requirements
170
+
171
+
172
+ def _benchmark_init_starter(iterations: int) -> dict[str, Any]:
173
+ init_times: list[float] = []
174
+ validate_times: list[float] = []
175
+ gate_times: list[float] = []
176
+ run_records: list[str] = []
177
+
178
+ for _ in range(iterations):
179
+ with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-init.") as td:
180
+ root = Path(td)
181
+ _run(["git", "init", str(root)])
182
+ init_ms, init_proc = _timed_orp(root, "init", "--json")
183
+ init_payload = json.loads(init_proc.stdout)
184
+ validate_ms, validate_proc = _timed_orp(
185
+ root, "kernel", "validate", "analysis/orp.kernel.task.yml", "--json"
186
+ )
187
+ validate_payload = json.loads(validate_proc.stdout)
188
+ gate_ms, gate_proc = _timed_orp(root, "gate", "run", "--profile", "default", "--json")
189
+ gate_payload = json.loads(gate_proc.stdout)
190
+
191
+ if not init_payload.get("ok"):
192
+ raise RuntimeError("orp init benchmark did not report ok=true")
193
+ if not validate_payload.get("ok"):
194
+ raise RuntimeError("starter kernel validate benchmark did not report ok=true")
195
+ if gate_payload.get("overall") != "PASS":
196
+ raise RuntimeError("starter kernel gate benchmark did not pass")
197
+
198
+ init_times.append(init_ms)
199
+ validate_times.append(validate_ms)
200
+ gate_times.append(gate_ms)
201
+ run_records.append(gate_payload["run_record"])
202
+
203
+ targets = {
204
+ "init_mean_lt_ms": 350.0,
205
+ "validate_mean_lt_ms": 200.0,
206
+ "gate_mean_lt_ms": 325.0,
207
+ }
208
+ observed = {
209
+ "init": _stats(init_times),
210
+ "validate": _stats(validate_times),
211
+ "gate_run": _stats(gate_times),
212
+ }
213
+ return {
214
+ "iterations": iterations,
215
+ "observed": observed,
216
+ "targets": targets,
217
+ "meets_targets": {
218
+ "init": observed["init"]["mean_ms"] < targets["init_mean_lt_ms"],
219
+ "validate": observed["validate"]["mean_ms"] < targets["validate_mean_lt_ms"],
220
+ "gate_run": observed["gate_run"]["mean_ms"] < targets["gate_mean_lt_ms"],
221
+ },
222
+ "sample_run_records": run_records[:2],
223
+ }
224
+
225
+
226
def _benchmark_artifact_roundtrip() -> dict[str, Any]:
    """Scaffold and then validate one artifact per class, timing both steps.

    For every entry in ``ARTIFACT_CLASSES`` a fresh temporary directory is
    used as the repo root; ``kernel scaffold`` writes
    ``analysis/<class>.kernel.yml`` and ``kernel validate`` checks it.

    Returns:
        A dict with the number of classes covered, one timing row per class,
        aggregate statistics, the mean-latency targets and a "met target" map.

    Raises:
        RuntimeError: If either command fails or does not report ``ok``.
    """
    samples: list[tuple[str, float, float]] = []

    for artifact_class in ARTIFACT_CLASSES:
        with tempfile.TemporaryDirectory(prefix=f"orp-kernel-bench-{artifact_class}.") as scratch:
            workdir = Path(scratch)
            artifact_path = f"analysis/{artifact_class}.kernel.yml"
            scaffold_args = (
                "kernel",
                "scaffold",
                "--artifact-class",
                artifact_class,
                "--out",
                artifact_path,
                "--name",
                f"{artifact_class} benchmark",
                "--json",
            )
            scaffold_ms, scaffold_proc = _timed_orp(workdir, *scaffold_args)
            validate_ms, validate_proc = _timed_orp(workdir, "kernel", "validate", artifact_path, "--json")
            scaffold_report = json.loads(scaffold_proc.stdout)
            validate_report = json.loads(validate_proc.stdout)
            if not (scaffold_report.get("ok") and validate_report.get("ok")):
                raise RuntimeError(f"roundtrip benchmark failed for artifact_class={artifact_class}")
            samples.append((artifact_class, scaffold_ms, validate_ms))

    rows = [
        {
            "artifact_class": name,
            "scaffold_ms": round(scaffold_ms, 3),
            "validate_ms": round(validate_ms, 3),
        }
        for name, scaffold_ms, validate_ms in samples
    ]
    observed = {
        "scaffold": _stats([scaffold_ms for _, scaffold_ms, _ in samples]),
        "validate": _stats([validate_ms for _, _, validate_ms in samples]),
    }
    targets = {
        "scaffold_mean_lt_ms": 200.0,
        "validate_mean_lt_ms": 200.0,
    }
    meets_targets = {
        step: observed[step]["mean_ms"] < targets[f"{step}_mean_lt_ms"]
        for step in ("scaffold", "validate")
    }
    return {
        "artifact_classes_total": len(rows),
        "rows": rows,
        "observed": observed,
        "targets": targets,
        "meets_targets": meets_targets,
    }
280
+
281
+
282
def _benchmark_gate_modes() -> dict[str, Any]:
    """Run hard, soft and legacy ``structure_kernel`` gates against an invalid task.

    A temporary repo root receives a task artifact that carries only
    ``object``, ``goal`` and ``boundary`` plus a config file
    (``orp.kernel.bench.json``) declaring three profiles: ``hard`` and ``soft``
    point at that artifact with the matching kernel mode, ``legacy`` has no
    ``kernel`` section at all.  Each profile is run via ``gate run`` and the
    first entry of the written run record is inspected.

    Returns:
        Per-profile timing, exit code and outcome details, plus a
        ``meets_expectations`` map: hard must exit 1 with ``FAIL``, soft must
        exit 0 with ``PASS`` (both with ``kernel_validation.valid`` false),
        and legacy must exit 0 with ``PASS`` and no ``kernel_validation`` key.
    """
    config_name = "orp.kernel.bench.json"

    def profile(description: str, gate_id: str) -> dict[str, Any]:
        # All three profiles differ only in description and gate id.
        return {
            "description": description,
            "mode": "test",
            "packet_kind": "problem_scope",
            "gate_ids": [gate_id],
        }

    def gate(gate_id: str, kernel_mode: str | None = None) -> dict[str, Any]:
        # A gate without kernel_mode models the legacy (kernel-less) shape.
        spec: dict[str, Any] = {
            "id": gate_id,
            "phase": "structure_kernel",
            "command": "true",
            "pass": {"exit_codes": [0]},
        }
        if kernel_mode is not None:
            spec["kernel"] = {
                "mode": kernel_mode,
                "artifacts": [
                    {
                        "path": "analysis/invalid-task.kernel.json",
                        "artifact_class": "task",
                    }
                ],
            }
        return spec

    with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-gates.") as scratch:
        workdir = Path(scratch)

        def run_profile(name: str, *, check: bool) -> tuple[float, subprocess.CompletedProcess[str]]:
            return _timed_orp(
                workdir,
                "--config",
                config_name,
                "gate",
                "run",
                "--profile",
                name,
                "--json",
                check=check,
            )

        def first_result(gate_payload: dict[str, Any]) -> dict[str, Any]:
            record_text = (workdir / gate_payload["run_record"]).read_text(encoding="utf-8")
            return json.loads(record_text)["results"][0]

        _write_json(
            workdir / "analysis" / "invalid-task.kernel.json",
            {
                "schema_version": "1.0.0",
                "artifact_class": "task",
                "object": "terminal trace widget",
                "goal": "surface lane state and drift",
                "boundary": "terminal-first workflow",
            },
        )
        _write_json(
            workdir / config_name,
            {
                "profiles": {
                    "hard": profile("hard kernel gate", "kernel_hard"),
                    "soft": profile("soft kernel gate", "kernel_soft"),
                    "legacy": profile("legacy structure kernel gate", "kernel_legacy"),
                },
                "gates": [
                    gate("kernel_hard", "hard"),
                    gate("kernel_soft", "soft"),
                    gate("kernel_legacy"),
                ],
            },
        )

        # The hard gate is expected to exit non-zero, so only it runs unchecked.
        hard_ms, hard_proc = run_profile("hard", check=False)
        soft_ms, soft_proc = run_profile("soft", check=True)
        legacy_ms, legacy_proc = run_profile("legacy", check=True)

        hard_payload = json.loads(hard_proc.stdout)
        soft_payload = json.loads(soft_proc.stdout)
        legacy_payload = json.loads(legacy_proc.stdout)

        hard_result = first_result(hard_payload)
        soft_result = first_result(soft_payload)
        legacy_result = first_result(legacy_payload)

        hard_mode = {
            "ms": round(hard_ms, 3),
            "exit_code": hard_proc.returncode,
            "overall": hard_payload["overall"],
            "kernel_valid": hard_result["kernel_validation"]["valid"],
            "missing_fields": hard_result["kernel_validation"]["artifacts"][0]["missing_fields"],
        }
        soft_mode = {
            "ms": round(soft_ms, 3),
            "exit_code": soft_proc.returncode,
            "overall": soft_payload["overall"],
            "kernel_valid": soft_result["kernel_validation"]["valid"],
        }
        legacy_compatibility = {
            "ms": round(legacy_ms, 3),
            "exit_code": legacy_proc.returncode,
            "overall": legacy_payload["overall"],
            "has_kernel_validation": "kernel_validation" in legacy_result,
        }
        meets_expectations = {
            "hard_blocks_invalid_artifact": hard_mode["exit_code"] == 1
            and hard_mode["overall"] == "FAIL"
            and hard_mode["kernel_valid"] is False,
            "soft_allows_invalid_artifact_with_advisory": soft_mode["exit_code"] == 0
            and soft_mode["overall"] == "PASS"
            and soft_mode["kernel_valid"] is False,
            "legacy_structure_kernel_remains_compatible": legacy_compatibility["exit_code"] == 0
            and legacy_compatibility["overall"] == "PASS"
            and not legacy_compatibility["has_kernel_validation"],
        }
        return {
            "hard_mode": hard_mode,
            "soft_mode": soft_mode,
            "legacy_compatibility": legacy_compatibility,
            "meets_expectations": meets_expectations,
        }
431
+
432
+
433
def _benchmark_schema_alignment() -> dict[str, Any]:
    """Compare the CLI's kernel tables against the published kernel schema.

    Two things are compared: the per-class required fields
    (``KERNEL_ARTIFACT_CLASS_REQUIREMENTS`` on the CLI module versus the
    schema's conditional clauses) and the set of allowed field names
    (``KERNEL_ALLOWED_FIELDS`` versus the schema's top-level ``properties``).
    A CLI attribute that is absent is treated as empty.

    Returns:
        Both requirement maps, the two field counts, and a
        ``meets_expectations`` map with ``requirements_match`` and
        ``fields_match``.
    """
    cli = _load_cli_module()
    from_schema = _load_kernel_schema_requirements()
    from_cli = dict(getattr(cli, "KERNEL_ARTIFACT_CLASS_REQUIREMENTS", {}))

    schema_text = (REPO_ROOT / "spec" / "v1" / "kernel.schema.json").read_text(encoding="utf-8")
    declared_properties = json.loads(schema_text).get("properties", {})
    schema_fields = set(declared_properties.keys())
    cli_fields = set(getattr(cli, "KERNEL_ALLOWED_FIELDS", set()))

    alignment = {
        "requirements_match": from_schema == from_cli,
        "fields_match": schema_fields == cli_fields,
    }
    return {
        "schema_requirements": from_schema,
        "cli_requirements": from_cli,
        "schema_fields_total": len(schema_fields),
        "cli_fields_total": len(cli_fields),
        "meets_expectations": alignment,
    }
449
+
450
+
451
def _benchmark_cross_domain_corpus() -> dict[str, Any]:
    """Validate every fixture under ``CORPUS_ROOT`` and time each validation.

    Fixtures are the ``.yml``/``.yaml``/``.json`` files found recursively, in
    sorted order.  Each one is copied to ``analysis/<file name>`` inside a
    single temporary repo root and validated there.  The fixture's domain is
    its first path component below the corpus root, or ``"unknown"`` for a
    file sitting directly in the root.

    Returns:
        Fixture, domain and artifact-class counts, one row per fixture,
        timing statistics, the targets and a "met target" map.

    Raises:
        RuntimeError: If the corpus root is missing or empty, or a fixture
            fails validation.
    """
    if not CORPUS_ROOT.exists():
        raise RuntimeError(f"kernel corpus root is missing: {CORPUS_ROOT}")

    fixture_suffixes = {".yml", ".yaml", ".json"}
    fixtures = sorted(
        candidate
        for candidate in CORPUS_ROOT.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in fixture_suffixes
    )
    if not fixtures:
        raise RuntimeError(f"kernel corpus root has no fixtures: {CORPUS_ROOT}")

    rows: list[dict[str, Any]] = []
    timings: list[float] = []
    with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-corpus.") as scratch:
        workdir = Path(scratch)
        for fixture in fixtures:
            relative = fixture.relative_to(CORPUS_ROOT)
            domain = "unknown" if len(relative.parts) <= 1 else relative.parts[0]
            staged = workdir / "analysis" / relative.name
            staged.parent.mkdir(parents=True, exist_ok=True)
            staged.write_text(fixture.read_text(encoding="utf-8"), encoding="utf-8")
            elapsed_ms, proc = _timed_orp(
                workdir, "kernel", "validate", str(staged.relative_to(workdir)), "--json"
            )
            report = json.loads(proc.stdout)
            if not report.get("ok"):
                raise RuntimeError(f"corpus benchmark failed for fixture={relative}")
            artifact_class = report["artifact_result"]["artifact_class"]
            timings.append(elapsed_ms)
            rows.append(
                {
                    "fixture": relative.as_posix(),
                    "domain": domain,
                    "artifact_class": artifact_class,
                    "validate_ms": round(elapsed_ms, 3),
                }
            )

    domains = {row["domain"] for row in rows}
    classes = {row["artifact_class"] for row in rows}
    observed = {"validate": _stats(timings)}
    targets = {
        "domains_min": 5,
        "fixtures_min": 7,
        "validate_mean_lt_ms": 200.0,
    }
    meets_targets = {
        "domains": len(domains) >= targets["domains_min"],
        "fixtures": len(rows) >= targets["fixtures_min"],
        "validate": observed["validate"]["mean_ms"] < targets["validate_mean_lt_ms"],
    }
    return {
        "fixtures_total": len(rows),
        "domains_total": len(domains),
        "artifact_classes_total": len(classes),
        "rows": rows,
        "observed": observed,
        "targets": targets,
        "meets_targets": meets_targets,
    }
508
+
509
+
510
def _benchmark_requirement_enforcement() -> dict[str, Any]:
    """Check that removing any schema-required field is reported by the validator.

    For each class in ``VALID_REQUIREMENT_FIXTURES`` and each field the schema
    requires for that class, a copy of the fixture without that field is
    written to ``analysis/<class>.<field>.invalid.kernel.json`` and validated
    with an explicit ``--artifact-class``.  A case counts as detected when the
    removed field appears in the reported ``missing_fields``.

    Returns:
        The case count, one row per case, timing statistics, the targets and
        a "met target" map.  ``all_cases_detected`` additionally requires
        every case to exit with code 1 and report ``valid`` as ``False``.
    """
    requirements = _load_kernel_schema_requirements()
    rows: list[dict[str, Any]] = []
    timings: list[float] = []
    detected = 0

    with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-requirements.") as scratch:
        workdir = Path(scratch)
        for artifact_class, fixture in VALID_REQUIREMENT_FIXTURES.items():
            for removed_field in requirements[artifact_class]:
                stripped = {key: value for key, value in fixture.items() if key != removed_field}
                target = workdir / "analysis" / f"{artifact_class}.{removed_field}.invalid.kernel.json"
                _write_json(target, stripped)
                # check=False: a failing exit status is the expected outcome here.
                elapsed_ms, proc = _timed_orp(
                    workdir,
                    "kernel",
                    "validate",
                    str(target.relative_to(workdir)),
                    "--artifact-class",
                    artifact_class,
                    "--json",
                    check=False,
                )
                report = json.loads(proc.stdout)
                timings.append(elapsed_ms)
                artifact_result = report["artifact_result"]
                detected += int(removed_field in artifact_result.get("missing_fields", []))
                rows.append(
                    {
                        "artifact_class": artifact_class,
                        "removed_field": removed_field,
                        "exit_code": proc.returncode,
                        "valid": artifact_result.get("valid", report.get("ok", False)),
                        "missing_fields": artifact_result.get("missing_fields", []),
                        "validate_ms": round(elapsed_ms, 3),
                    }
                )

    observed = {"validate": _stats(timings)}
    targets = {
        "all_cases_detected": sum(len(fields) for fields in requirements.values()),
        "validate_mean_lt_ms": 200.0,
    }
    every_case_rejected = (
        detected == targets["all_cases_detected"]
        and all(row["exit_code"] == 1 for row in rows)
        and all(row["valid"] is False for row in rows)
    )
    return {
        "cases_total": len(rows),
        "rows": rows,
        "observed": observed,
        "targets": targets,
        "meets_targets": {
            "all_cases_detected": every_case_rejected,
            "validate": observed["validate"]["mean_ms"] < targets["validate_mean_lt_ms"],
        },
    }
566
+
567
+
568
def _benchmark_representation_invariance() -> dict[str, Any]:
    """Validate the same task artifact written as YAML and as JSON.

    Both files are placed under ``analysis/`` in one temporary repo root and
    validated separately.  The two ``artifact_result`` objects are compared
    with their ``path`` entry left out, since the file names differ.

    Returns:
        Both timings, both raw results and a ``meets_expectations`` map with
        ``both_valid`` and ``equivalent_results``.
    """
    yaml_lines = (
        'schema_version: "1.0.0"',
        "artifact_class: task",
        "object: terminal trace widget",
        "goal: surface lane drift",
        "boundary:",
        " - terminal-first lane visibility",
        "constraints:",
        " - low friction",
        "success_criteria:",
        " - operator spots drift quickly",
    )
    yaml_text = "".join(f"{line}\n" for line in yaml_lines)
    json_document = {
        "schema_version": "1.0.0",
        "artifact_class": "task",
        "object": "terminal trace widget",
        "goal": "surface lane drift",
        "boundary": ["terminal-first lane visibility"],
        "constraints": ["low friction"],
        "success_criteria": ["operator spots drift quickly"],
    }

    def without_path(result: dict[str, Any]) -> dict[str, Any]:
        return {key: value for key, value in result.items() if key != "path"}

    with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-invariance.") as scratch:
        workdir = Path(scratch)
        analysis_dir = workdir / "analysis"
        yaml_file = analysis_dir / "task.kernel.yml"
        json_file = analysis_dir / "task.kernel.json"
        analysis_dir.mkdir(parents=True, exist_ok=True)
        yaml_file.write_text(yaml_text, encoding="utf-8")
        _write_json(json_file, json_document)

        yaml_ms, yaml_proc = _timed_orp(
            workdir, "kernel", "validate", str(yaml_file.relative_to(workdir)), "--json"
        )
        json_ms, json_proc = _timed_orp(
            workdir, "kernel", "validate", str(json_file.relative_to(workdir)), "--json"
        )
        yaml_report = json.loads(yaml_proc.stdout)
        json_report = json.loads(json_proc.stdout)
        yaml_result = yaml_report["artifact_result"]
        json_result = json_report["artifact_result"]

        expectations = {
            "both_valid": yaml_report["ok"] and json_report["ok"],
            "equivalent_results": without_path(yaml_result) == without_path(json_result),
        }
        return {
            "yaml_ms": round(yaml_ms, 3),
            "json_ms": round(json_ms, 3),
            "yaml_result": yaml_result,
            "json_result": json_result,
            "meets_expectations": expectations,
        }
618
+
619
+
620
def _benchmark_mutation_stress() -> dict[str, Any]:
    """Validate deliberately broken artifacts and check the reported issues.

    Each case starts from the fixture of its artifact class in
    ``VALID_REQUIREMENT_FIXTURES``, applies a small set of field overrides and
    validates the result with an explicit ``--artifact-class``.  A case is
    detected when the command exits with code 1 and at least one reported
    issue contains the case's expected text fragment.

    Returns:
        The case count, one row per case, timing statistics, the targets and
        a "met target" map.
    """
    # (case id, artifact class, field overrides, fragment expected in an issue)
    mutation_table: list[tuple[str, str, dict[str, Any], str]] = [
        (
            "unexpected_field",
            "task",
            {"mystery_field": "should not be allowed"},
            "unexpected field",
        ),
        (
            "whitespace_only_text",
            "task",
            {"object": " "},
            "field `object` must be a non-empty string",
        ),
        (
            "wrong_text_list_type",
            "task",
            {"constraints": {"bad": True}},
            "field `constraints` must be a non-empty string or a non-empty list",
        ),
        (
            "non_string_list_item",
            "result",
            {"evidence_paths": ["docs/ORP_REASONING_KERNEL_V0_1.md", 42]},
            "field `evidence_paths` must be a non-empty list of non-empty strings",
        ),
        (
            "unsupported_artifact_class",
            "task",
            {"artifact_class": "memo"},
            "unsupported artifact_class",
        ),
        (
            "wrong_schema_version",
            "task",
            {"schema_version": "9.9.9"},
            "field `schema_version` must equal `1.0.0`",
        ),
        (
            "empty_list",
            "task",
            {"boundary": []},
            "missing required fields: boundary",
        ),
    ]
    cases = [
        {
            "id": case_id,
            "artifact_class": artifact_class,
            "payload": {**VALID_REQUIREMENT_FIXTURES[artifact_class], **overrides},
            "expected_fragment": fragment,
        }
        for case_id, artifact_class, overrides, fragment in mutation_table
    ]

    rows: list[dict[str, Any]] = []
    timings: list[float] = []
    with tempfile.TemporaryDirectory(prefix="orp-kernel-bench-mutations.") as scratch:
        workdir = Path(scratch)
        for case in cases:
            target = workdir / "analysis" / f"{case['id']}.kernel.json"
            _write_json(target, case["payload"])
            # check=False: rejection (a non-zero exit) is what each case expects.
            elapsed_ms, proc = _timed_orp(
                workdir,
                "kernel",
                "validate",
                str(target.relative_to(workdir)),
                "--artifact-class",
                case["artifact_class"],
                "--json",
                check=False,
            )
            report = json.loads(proc.stdout)
            issues = report["artifact_result"]["issues"]
            timings.append(elapsed_ms)
            fragment = case["expected_fragment"]
            rows.append(
                {
                    "id": case["id"],
                    "exit_code": proc.returncode,
                    "issues": issues,
                    "validate_ms": round(elapsed_ms, 3),
                    "matched_expected_issue": any(fragment in issue for issue in issues),
                }
            )

    observed = {"validate": _stats(timings)}
    targets = {
        "cases_total": len(cases),
        "validate_mean_lt_ms": 200.0,
    }
    every_case_detected = all(
        row["exit_code"] == 1 and row["matched_expected_issue"] for row in rows
    )
    return {
        "cases_total": len(rows),
        "rows": rows,
        "observed": observed,
        "targets": targets,
        "meets_targets": {
            "all_cases_detected": every_case_detected,
            "validate": observed["validate"]["mean_ms"] < targets["validate_mean_lt_ms"],
        },
    }
732
+
733
+
734
def _gather_metadata() -> dict[str, Any]:
    """Collect provenance details for the benchmark report.

    Reads the package version from ``package.json`` in the repo root, asks
    git for the current commit and branch, asks node for its version, and
    adds a UTC timestamp, the running Python version and the platform string.
    """

    def capture(*command: str) -> str:
        # Stripped stdout of a command that must succeed.
        return _run(list(command)).stdout.strip()

    manifest = json.loads((REPO_ROOT / "package.json").read_text(encoding="utf-8"))
    package_version = manifest["version"]
    commit = capture("git", "rev-parse", "HEAD")
    branch = capture("git", "rev-parse", "--abbrev-ref", "HEAD")
    node_version = capture("node", "--version")

    metadata: dict[str, Any] = {}
    metadata["generated_at_utc"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    metadata["repo_commit"] = commit
    metadata["repo_branch"] = branch
    metadata["package_version"] = package_version
    metadata["python_version"] = sys.version.split()[0]
    metadata["node_version"] = node_version
    metadata["platform"] = platform.platform()
    return metadata
748
+
749
+
750
+ def build_report(iterations: int) -> dict[str, Any]:
751
+ init_benchmark = _benchmark_init_starter(iterations)
752
+ roundtrip_benchmark = _benchmark_artifact_roundtrip()
753
+ gate_mode_benchmark = _benchmark_gate_modes()
754
+ schema_alignment = _benchmark_schema_alignment()
755
+ corpus_benchmark = _benchmark_cross_domain_corpus()
756
+ requirement_benchmark = _benchmark_requirement_enforcement()
757
+ representation_invariance = _benchmark_representation_invariance()
758
+ mutation_stress = _benchmark_mutation_stress()
759
+
760
+ claims = [
761
+ {
762
+ "id": "schema_validator_alignment",
763
+ "claim": "The CLI kernel requirements and allowed fields stay aligned with the published kernel schema.",
764
+ "status": "pass" if all(schema_alignment["meets_expectations"].values()) else "fail",
765
+ "evidence": [
766
+ "benchmarks.schema_alignment",
767
+ "spec/v1/kernel.schema.json",
768
+ "cli/orp.py",
769
+ ],
770
+ },
771
+ {
772
+ "id": "starter_kernel_bootstrap",
773
+ "claim": "orp init seeds a valid starter kernel artifact and a passing default structure_kernel gate.",
774
+ "status": "pass",
775
+ "evidence": [
776
+ "benchmarks.init_starter_kernel",
777
+ "cli/orp.py",
778
+ "tests/test_orp_init.py",
779
+ ],
780
+ },
781
+ {
782
+ "id": "typed_artifact_roundtrip",
783
+ "claim": "All seven v0.1 artifact classes can be scaffolded and validated through the CLI.",
784
+ "status": "pass" if roundtrip_benchmark["artifact_classes_total"] == 7 else "fail",
785
+ "evidence": [
786
+ "benchmarks.artifact_roundtrip",
787
+ "spec/v1/kernel.schema.json",
788
+ "tests/test_orp_kernel.py",
789
+ ],
790
+ },
791
+ {
792
+ "id": "promotion_enforcement_modes",
793
+ "claim": "Hard mode blocks invalid promotable artifacts, while soft mode records advisory issues without blocking.",
794
+ "status": "pass"
795
+ if gate_mode_benchmark["meets_expectations"]["hard_blocks_invalid_artifact"]
796
+ and gate_mode_benchmark["meets_expectations"]["soft_allows_invalid_artifact_with_advisory"]
797
+ else "fail",
798
+ "evidence": [
799
+ "benchmarks.gate_modes",
800
+ "tests/test_orp_kernel.py",
801
+ ],
802
+ },
803
+ {
804
+ "id": "legacy_structure_kernel_compatibility",
805
+ "claim": "Existing structure_kernel gates without explicit kernel config remain compatible.",
806
+ "status": "pass"
807
+ if gate_mode_benchmark["meets_expectations"]["legacy_structure_kernel_remains_compatible"]
808
+ else "fail",
809
+ "evidence": [
810
+ "benchmarks.gate_modes",
811
+ "cli/orp.py",
812
+ ],
813
+ },
814
+ {
815
+ "id": "local_cli_kernel_ergonomics",
816
+ "claim": "One-shot kernel CLI operations remain within human-scale local ergonomics targets on the reference machine.",
817
+ "status": "pass"
818
+ if all(init_benchmark["meets_targets"].values())
819
+ and all(roundtrip_benchmark["meets_targets"].values())
820
+ else "fail",
821
+ "evidence": [
822
+ "benchmarks.init_starter_kernel",
823
+ "benchmarks.artifact_roundtrip",
824
+ ],
825
+ },
826
+ {
827
+ "id": "cross_domain_corpus_fit",
828
+ "claim": "The current v0.1 kernel class set fits a small cross-domain reference corpus cleanly.",
829
+ "status": "pass"
830
+ if all(corpus_benchmark["meets_targets"].values())
831
+ and corpus_benchmark["artifact_classes_total"] >= 7
832
+ else "fail",
833
+ "evidence": [
834
+ "benchmarks.cross_domain_corpus",
835
+ "examples/kernel/corpus",
836
+ ],
837
+ },
838
+ {
839
+ "id": "class_specific_requirement_enforcement",
840
+ "claim": "Each kernel artifact class rejects a candidate artifact when a required field is removed.",
841
+ "status": "pass"
842
+ if all(requirement_benchmark["meets_targets"].values())
843
+ else "fail",
844
+ "evidence": [
845
+ "benchmarks.requirement_enforcement",
846
+ "spec/v1/kernel.schema.json",
847
+ ],
848
+ },
849
+ {
850
+ "id": "representation_invariance",
851
+ "claim": "Equivalent YAML and JSON kernel artifacts validate to the same semantic result.",
852
+ "status": "pass"
853
+ if all(representation_invariance["meets_expectations"].values())
854
+ else "fail",
855
+ "evidence": [
856
+ "benchmarks.representation_invariance",
857
+ ],
858
+ },
859
+ {
860
+ "id": "adversarial_mutation_detection",
861
+ "claim": "The validator rejects adversarial near-miss artifacts such as unknown fields, wrong types, whitespace-only text, and bad schema metadata.",
862
+ "status": "pass"
863
+ if all(mutation_stress["meets_targets"].values())
864
+ else "fail",
865
+ "evidence": [
866
+ "benchmarks.mutation_stress",
867
+ "spec/v1/kernel.schema.json",
868
+ ],
869
+ },
870
+ ]
871
+
872
+ return {
873
+ "schema_version": "1.0.0",
874
+ "kind": "orp_reasoning_kernel_validation_report",
875
+ "metadata": _gather_metadata(),
876
+ "benchmarks": {
877
+ "init_starter_kernel": init_benchmark,
878
+ "artifact_roundtrip": roundtrip_benchmark,
879
+ "gate_modes": gate_mode_benchmark,
880
+ "schema_alignment": schema_alignment,
881
+ "cross_domain_corpus": corpus_benchmark,
882
+ "requirement_enforcement": requirement_benchmark,
883
+ "representation_invariance": representation_invariance,
884
+ "mutation_stress": mutation_stress,
885
+ },
886
+ "claims": claims,
887
+ "summary": {
888
+ "all_claims_pass": all(row["status"] == "pass" for row in claims),
889
+ "artifact_classes_total": roundtrip_benchmark["artifact_classes_total"],
890
+ "cross_domain_corpus_domains_total": corpus_benchmark["domains_total"],
891
+ "all_performance_targets_met": all(init_benchmark["meets_targets"].values())
892
+ and all(roundtrip_benchmark["meets_targets"].values())
893
+ and corpus_benchmark["meets_targets"]["validate"]
894
+ and requirement_benchmark["meets_targets"]["validate"]
895
+ and mutation_stress["meets_targets"]["validate"],
896
+ },
897
+ }
898
+
899
+
900
def main() -> int:
    """Run the kernel validation benchmarks and emit the JSON report.

    Command-line options:
        --out         Optional path for the JSON report. A relative path is
                      resolved against ``REPO_ROOT``; missing parent
                      directories are created.
        --iterations  Iteration count passed to ``build_report`` (values
                      below 1 are raised to 1).
        --quick       Force a single iteration, ignoring ``--iterations``.

    The serialized report is printed to stdout.

    Returns:
        0 when ``report["summary"]["all_claims_pass"]`` is truthy, else 1,
        so the value can be used directly as a process exit code.
    """
    cli = argparse.ArgumentParser(description="Benchmark and validate ORP Reasoning Kernel v0.1")
    cli.add_argument("--out", default="", help="Optional JSON output path")
    cli.add_argument("--iterations", type=int, default=5, help="Iterations for bootstrap benchmark")
    cli.add_argument("--quick", action="store_true", help="Use a single bootstrap iteration for fast checks")
    options = cli.parse_args()

    # --quick wins over --iterations; otherwise clamp to at least one run.
    if options.quick:
        run_count = 1
    else:
        run_count = max(1, options.iterations)

    report = build_report(run_count)
    rendered = json.dumps(report, indent=2) + "\n"

    if options.out:
        destination = Path(options.out)
        if not destination.is_absolute():
            # Relative output paths are anchored at the repository root,
            # not at the caller's current working directory.
            destination = REPO_ROOT / destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")

    print(rendered, end="")

    if report["summary"]["all_claims_pass"]:
        return 0
    return 1
918
+
919
+
920
if __name__ == "__main__":
    # Script entry point: propagate main()'s return value as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)