open-research-protocol 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +9 -0
  2. package/cli/orp.py +668 -43
  3. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  5. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  6. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  7. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  8. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  9. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  10. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  11. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  12. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  13. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  14. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  15. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  25. package/examples/README.md +2 -0
  26. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  27. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  28. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  29. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  30. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  31. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  32. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  33. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  34. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  35. package/package.json +4 -1
  36. package/scripts/orp-kernel-agent-pilot.py +673 -0
  37. package/scripts/orp-kernel-agent-replication.py +307 -0
  38. package/scripts/orp-kernel-benchmark.py +471 -2
  39. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  40. package/scripts/orp-kernel-ci-check.py +138 -0
  41. package/scripts/orp-kernel-comparison.py +592 -0
  42. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  43. package/scripts/orp-kernel-pickup.py +401 -0
  44. package/spec/v1/kernel-extension.schema.json +96 -0
  45. package/spec/v1/kernel-proposal.schema.json +115 -0
  46. package/spec/v1/kernel.schema.json +2 -1
@@ -0,0 +1,688 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "kind": "orp_reasoning_kernel_comparison_report",
4
+ "metadata": {
5
+ "generated_at_utc": "2026-03-23T06:06:17Z",
6
+ "repo_commit": "c2f7f2a52744a00fb719d37de583da1f4ae615bd",
7
+ "repo_branch": "main",
8
+ "package_version": "0.4.7",
9
+ "python_version": "3.9.6",
10
+ "node_version": "v24.10.0",
11
+ "platform": "macOS-26.3-arm64-arm-64bit"
12
+ },
13
+ "corpus": {
14
+ "source": "examples/kernel/comparison/comparison-corpus.json",
15
+ "cases_total": 7,
16
+ "domains_total": 5,
17
+ "domains": [
18
+ "operations",
19
+ "product",
20
+ "research",
21
+ "software",
22
+ "writing"
23
+ ],
24
+ "artifact_classes_total": 7,
25
+ "artifact_classes": [
26
+ "checkpoint",
27
+ "decision",
28
+ "experiment",
29
+ "hypothesis",
30
+ "policy",
31
+ "result",
32
+ "task"
33
+ ]
34
+ },
35
+ "conditions": {
36
+ "freeform": {
37
+ "condition": "freeform",
38
+ "cases_total": 7,
39
+ "rows": [
40
+ {
41
+ "id": "software_trace_widget",
42
+ "domain": "software",
43
+ "artifact_class": "task",
44
+ "total_score": 0.208,
45
+ "class_specific_completeness": 0.27,
46
+ "ambiguity_remaining": 0.73,
47
+ "present_fields": [
48
+ "constraints",
49
+ "goal",
50
+ "object"
51
+ ],
52
+ "missing_fields": [
53
+ "boundary",
54
+ "success_criteria"
55
+ ]
56
+ },
57
+ {
58
+ "id": "product_project_home",
59
+ "domain": "product",
60
+ "artifact_class": "decision",
61
+ "total_score": 0.323,
62
+ "class_specific_completeness": 0.36,
63
+ "ambiguity_remaining": 0.64,
64
+ "present_fields": [
65
+ "chosen_path",
66
+ "consequences",
67
+ "question",
68
+ "rationale"
69
+ ],
70
+ "missing_fields": [
71
+ "rejected_alternatives"
72
+ ]
73
+ },
74
+ {
75
+ "id": "research_drift_hypothesis",
76
+ "domain": "research",
77
+ "artifact_class": "hypothesis",
78
+ "total_score": 0.323,
79
+ "class_specific_completeness": 0.36,
80
+ "ambiguity_remaining": 0.64,
81
+ "present_fields": [
82
+ "assumptions",
83
+ "boundary",
84
+ "claim",
85
+ "test_path"
86
+ ],
87
+ "missing_fields": [
88
+ "falsifiers"
89
+ ]
90
+ },
91
+ {
92
+ "id": "research_handoff_experiment",
93
+ "domain": "research",
94
+ "artifact_class": "experiment",
95
+ "total_score": 0.2,
96
+ "class_specific_completeness": 0.225,
97
+ "ambiguity_remaining": 0.775,
98
+ "present_fields": [
99
+ "interpretation_limits",
100
+ "method",
101
+ "objective"
102
+ ],
103
+ "missing_fields": [
104
+ "inputs",
105
+ "outputs",
106
+ "evidence_expectations"
107
+ ]
108
+ },
109
+ {
110
+ "id": "operations_habanero_checkpoint",
111
+ "domain": "operations",
112
+ "artifact_class": "checkpoint",
113
+ "total_score": 0.285,
114
+ "class_specific_completeness": 0.36,
115
+ "ambiguity_remaining": 0.64,
116
+ "present_fields": [
117
+ "completed_unit",
118
+ "current_state",
119
+ "next_handoff_target",
120
+ "risks"
121
+ ],
122
+ "missing_fields": [
123
+ "artifact_refs"
124
+ ]
125
+ },
126
+ {
127
+ "id": "operations_runner_policy",
128
+ "domain": "operations",
129
+ "artifact_class": "policy",
130
+ "total_score": 0.298,
131
+ "class_specific_completeness": 0.36,
132
+ "ambiguity_remaining": 0.64,
133
+ "present_fields": [
134
+ "invariants",
135
+ "rationale",
136
+ "rule",
137
+ "scope"
138
+ ],
139
+ "missing_fields": [
140
+ "enforcement_surface"
141
+ ]
142
+ },
143
+ {
144
+ "id": "writing_kernel_launch_result",
145
+ "domain": "writing",
146
+ "artifact_class": "result",
147
+ "total_score": 0.285,
148
+ "class_specific_completeness": 0.36,
149
+ "ambiguity_remaining": 0.64,
150
+ "present_fields": [
151
+ "claim",
152
+ "evidence_paths",
153
+ "next_follow_up",
154
+ "status"
155
+ ],
156
+ "missing_fields": [
157
+ "interpretation_limits"
158
+ ]
159
+ }
160
+ ],
161
+ "mean_total_score": 0.275,
162
+ "mean_class_specific_completeness": 0.328,
163
+ "mean_ambiguity_remaining": 0.672,
164
+ "mean_dimension_scores": {
165
+ "artifact_type_clarity": 0.0,
166
+ "objective_clarity": 0.45,
167
+ "limits_clarity": 0.289,
168
+ "evaluation_clarity": 0.193,
169
+ "handoff_readiness": 0.386,
170
+ "class_specific_completeness": 0.328
171
+ }
172
+ },
173
+ "generic_checklist": {
174
+ "condition": "generic_checklist",
175
+ "cases_total": 7,
176
+ "rows": [
177
+ {
178
+ "id": "software_trace_widget",
179
+ "domain": "software",
180
+ "artifact_class": "task",
181
+ "total_score": 0.697,
182
+ "class_specific_completeness": 0.68,
183
+ "ambiguity_remaining": 0.32,
184
+ "present_fields": [
185
+ "boundary",
186
+ "constraints",
187
+ "goal",
188
+ "object",
189
+ "success_criteria"
190
+ ],
191
+ "missing_fields": []
192
+ },
193
+ {
194
+ "id": "product_project_home",
195
+ "domain": "product",
196
+ "artifact_class": "decision",
197
+ "total_score": 0.627,
198
+ "class_specific_completeness": 0.58,
199
+ "ambiguity_remaining": 0.42,
200
+ "present_fields": [
201
+ "chosen_path",
202
+ "consequences",
203
+ "question",
204
+ "rationale",
205
+ "rejected_alternatives"
206
+ ],
207
+ "missing_fields": []
208
+ },
209
+ {
210
+ "id": "research_drift_hypothesis",
211
+ "domain": "research",
212
+ "artifact_class": "hypothesis",
213
+ "total_score": 0.675,
214
+ "class_specific_completeness": 0.64,
215
+ "ambiguity_remaining": 0.36,
216
+ "present_fields": [
217
+ "assumptions",
218
+ "boundary",
219
+ "claim",
220
+ "falsifiers",
221
+ "test_path"
222
+ ],
223
+ "missing_fields": []
224
+ },
225
+ {
226
+ "id": "research_handoff_experiment",
227
+ "domain": "research",
228
+ "artifact_class": "experiment",
229
+ "total_score": 0.711,
230
+ "class_specific_completeness": 0.692,
231
+ "ambiguity_remaining": 0.308,
232
+ "present_fields": [
233
+ "evidence_expectations",
234
+ "inputs",
235
+ "interpretation_limits",
236
+ "method",
237
+ "objective",
238
+ "outputs"
239
+ ],
240
+ "missing_fields": []
241
+ },
242
+ {
243
+ "id": "operations_habanero_checkpoint",
244
+ "domain": "operations",
245
+ "artifact_class": "checkpoint",
246
+ "total_score": 0.679,
247
+ "class_specific_completeness": 0.65,
248
+ "ambiguity_remaining": 0.35,
249
+ "present_fields": [
250
+ "artifact_refs",
251
+ "completed_unit",
252
+ "current_state",
253
+ "next_handoff_target",
254
+ "risks"
255
+ ],
256
+ "missing_fields": []
257
+ },
258
+ {
259
+ "id": "operations_runner_policy",
260
+ "domain": "operations",
261
+ "artifact_class": "policy",
262
+ "total_score": 0.713,
263
+ "class_specific_completeness": 0.67,
264
+ "ambiguity_remaining": 0.33,
265
+ "present_fields": [
266
+ "enforcement_surface",
267
+ "invariants",
268
+ "rationale",
269
+ "rule",
270
+ "scope"
271
+ ],
272
+ "missing_fields": []
273
+ },
274
+ {
275
+ "id": "writing_kernel_launch_result",
276
+ "domain": "writing",
277
+ "artifact_class": "result",
278
+ "total_score": 0.708,
279
+ "class_specific_completeness": 0.69,
280
+ "ambiguity_remaining": 0.31,
281
+ "present_fields": [
282
+ "claim",
283
+ "evidence_paths",
284
+ "interpretation_limits",
285
+ "next_follow_up",
286
+ "status"
287
+ ],
288
+ "missing_fields": []
289
+ }
290
+ ],
291
+ "mean_total_score": 0.687,
292
+ "mean_class_specific_completeness": 0.657,
293
+ "mean_ambiguity_remaining": 0.343,
294
+ "mean_dimension_scores": {
295
+ "artifact_type_clarity": 0.85,
296
+ "objective_clarity": 0.596,
297
+ "limits_clarity": 0.693,
298
+ "evaluation_clarity": 0.671,
299
+ "handoff_readiness": 0.655,
300
+ "class_specific_completeness": 0.657
301
+ }
302
+ },
303
+ "kernel": {
304
+ "condition": "kernel",
305
+ "cases_total": 7,
306
+ "rows": [
307
+ {
308
+ "id": "software_trace_widget",
309
+ "domain": "software",
310
+ "artifact_class": "task",
311
+ "total_score": 1.0,
312
+ "class_specific_completeness": 1.0,
313
+ "ambiguity_remaining": 0.0,
314
+ "present_fields": [
315
+ "boundary",
316
+ "constraints",
317
+ "goal",
318
+ "object",
319
+ "success_criteria"
320
+ ],
321
+ "missing_fields": []
322
+ },
323
+ {
324
+ "id": "product_project_home",
325
+ "domain": "product",
326
+ "artifact_class": "decision",
327
+ "total_score": 1.0,
328
+ "class_specific_completeness": 1.0,
329
+ "ambiguity_remaining": 0.0,
330
+ "present_fields": [
331
+ "chosen_path",
332
+ "consequences",
333
+ "question",
334
+ "rationale",
335
+ "rejected_alternatives"
336
+ ],
337
+ "missing_fields": []
338
+ },
339
+ {
340
+ "id": "research_drift_hypothesis",
341
+ "domain": "research",
342
+ "artifact_class": "hypothesis",
343
+ "total_score": 1.0,
344
+ "class_specific_completeness": 1.0,
345
+ "ambiguity_remaining": 0.0,
346
+ "present_fields": [
347
+ "assumptions",
348
+ "boundary",
349
+ "claim",
350
+ "falsifiers",
351
+ "test_path"
352
+ ],
353
+ "missing_fields": []
354
+ },
355
+ {
356
+ "id": "research_handoff_experiment",
357
+ "domain": "research",
358
+ "artifact_class": "experiment",
359
+ "total_score": 1.0,
360
+ "class_specific_completeness": 1.0,
361
+ "ambiguity_remaining": 0.0,
362
+ "present_fields": [
363
+ "evidence_expectations",
364
+ "inputs",
365
+ "interpretation_limits",
366
+ "method",
367
+ "objective",
368
+ "outputs"
369
+ ],
370
+ "missing_fields": []
371
+ },
372
+ {
373
+ "id": "operations_habanero_checkpoint",
374
+ "domain": "operations",
375
+ "artifact_class": "checkpoint",
376
+ "total_score": 1.0,
377
+ "class_specific_completeness": 1.0,
378
+ "ambiguity_remaining": 0.0,
379
+ "present_fields": [
380
+ "artifact_refs",
381
+ "completed_unit",
382
+ "current_state",
383
+ "next_handoff_target",
384
+ "risks"
385
+ ],
386
+ "missing_fields": []
387
+ },
388
+ {
389
+ "id": "operations_runner_policy",
390
+ "domain": "operations",
391
+ "artifact_class": "policy",
392
+ "total_score": 1.0,
393
+ "class_specific_completeness": 1.0,
394
+ "ambiguity_remaining": 0.0,
395
+ "present_fields": [
396
+ "enforcement_surface",
397
+ "invariants",
398
+ "rationale",
399
+ "rule",
400
+ "scope"
401
+ ],
402
+ "missing_fields": []
403
+ },
404
+ {
405
+ "id": "writing_kernel_launch_result",
406
+ "domain": "writing",
407
+ "artifact_class": "result",
408
+ "total_score": 1.0,
409
+ "class_specific_completeness": 1.0,
410
+ "ambiguity_remaining": 0.0,
411
+ "present_fields": [
412
+ "claim",
413
+ "evidence_paths",
414
+ "interpretation_limits",
415
+ "next_follow_up",
416
+ "status"
417
+ ],
418
+ "missing_fields": []
419
+ }
420
+ ],
421
+ "mean_total_score": 1.0,
422
+ "mean_class_specific_completeness": 1.0,
423
+ "mean_ambiguity_remaining": 0.0,
424
+ "mean_dimension_scores": {
425
+ "artifact_type_clarity": 1.0,
426
+ "objective_clarity": 1.0,
427
+ "limits_clarity": 1.0,
428
+ "evaluation_clarity": 1.0,
429
+ "handoff_readiness": 1.0,
430
+ "class_specific_completeness": 1.0
431
+ }
432
+ }
433
+ },
434
+ "pairwise": {
435
+ "kernel_vs_generic_checklist": {
436
+ "left": "kernel",
437
+ "right": "generic_checklist",
438
+ "wins": 7,
439
+ "ties": 0,
440
+ "losses": 0,
441
+ "mean_total_score_delta": 0.313,
442
+ "by_case": [
443
+ {
444
+ "id": "software_trace_widget",
445
+ "domain": "software",
446
+ "artifact_class": "task",
447
+ "left_score": 1.0,
448
+ "right_score": 0.697,
449
+ "delta": 0.303,
450
+ "outcome": "win"
451
+ },
452
+ {
453
+ "id": "product_project_home",
454
+ "domain": "product",
455
+ "artifact_class": "decision",
456
+ "left_score": 1.0,
457
+ "right_score": 0.627,
458
+ "delta": 0.373,
459
+ "outcome": "win"
460
+ },
461
+ {
462
+ "id": "research_drift_hypothesis",
463
+ "domain": "research",
464
+ "artifact_class": "hypothesis",
465
+ "left_score": 1.0,
466
+ "right_score": 0.675,
467
+ "delta": 0.325,
468
+ "outcome": "win"
469
+ },
470
+ {
471
+ "id": "research_handoff_experiment",
472
+ "domain": "research",
473
+ "artifact_class": "experiment",
474
+ "left_score": 1.0,
475
+ "right_score": 0.711,
476
+ "delta": 0.289,
477
+ "outcome": "win"
478
+ },
479
+ {
480
+ "id": "operations_habanero_checkpoint",
481
+ "domain": "operations",
482
+ "artifact_class": "checkpoint",
483
+ "left_score": 1.0,
484
+ "right_score": 0.679,
485
+ "delta": 0.321,
486
+ "outcome": "win"
487
+ },
488
+ {
489
+ "id": "operations_runner_policy",
490
+ "domain": "operations",
491
+ "artifact_class": "policy",
492
+ "left_score": 1.0,
493
+ "right_score": 0.713,
494
+ "delta": 0.287,
495
+ "outcome": "win"
496
+ },
497
+ {
498
+ "id": "writing_kernel_launch_result",
499
+ "domain": "writing",
500
+ "artifact_class": "result",
501
+ "left_score": 1.0,
502
+ "right_score": 0.708,
503
+ "delta": 0.292,
504
+ "outcome": "win"
505
+ }
506
+ ]
507
+ },
508
+ "kernel_vs_freeform": {
509
+ "left": "kernel",
510
+ "right": "freeform",
511
+ "wins": 7,
512
+ "ties": 0,
513
+ "losses": 0,
514
+ "mean_total_score_delta": 0.725,
515
+ "by_case": [
516
+ {
517
+ "id": "software_trace_widget",
518
+ "domain": "software",
519
+ "artifact_class": "task",
520
+ "left_score": 1.0,
521
+ "right_score": 0.208,
522
+ "delta": 0.792,
523
+ "outcome": "win"
524
+ },
525
+ {
526
+ "id": "product_project_home",
527
+ "domain": "product",
528
+ "artifact_class": "decision",
529
+ "left_score": 1.0,
530
+ "right_score": 0.323,
531
+ "delta": 0.677,
532
+ "outcome": "win"
533
+ },
534
+ {
535
+ "id": "research_drift_hypothesis",
536
+ "domain": "research",
537
+ "artifact_class": "hypothesis",
538
+ "left_score": 1.0,
539
+ "right_score": 0.323,
540
+ "delta": 0.677,
541
+ "outcome": "win"
542
+ },
543
+ {
544
+ "id": "research_handoff_experiment",
545
+ "domain": "research",
546
+ "artifact_class": "experiment",
547
+ "left_score": 1.0,
548
+ "right_score": 0.2,
549
+ "delta": 0.8,
550
+ "outcome": "win"
551
+ },
552
+ {
553
+ "id": "operations_habanero_checkpoint",
554
+ "domain": "operations",
555
+ "artifact_class": "checkpoint",
556
+ "left_score": 1.0,
557
+ "right_score": 0.285,
558
+ "delta": 0.715,
559
+ "outcome": "win"
560
+ },
561
+ {
562
+ "id": "operations_runner_policy",
563
+ "domain": "operations",
564
+ "artifact_class": "policy",
565
+ "left_score": 1.0,
566
+ "right_score": 0.298,
567
+ "delta": 0.702,
568
+ "outcome": "win"
569
+ },
570
+ {
571
+ "id": "writing_kernel_launch_result",
572
+ "domain": "writing",
573
+ "artifact_class": "result",
574
+ "left_score": 1.0,
575
+ "right_score": 0.285,
576
+ "delta": 0.715,
577
+ "outcome": "win"
578
+ }
579
+ ]
580
+ },
581
+ "generic_checklist_vs_freeform": {
582
+ "left": "generic_checklist",
583
+ "right": "freeform",
584
+ "wins": 7,
585
+ "ties": 0,
586
+ "losses": 0,
587
+ "mean_total_score_delta": 0.413,
588
+ "by_case": [
589
+ {
590
+ "id": "software_trace_widget",
591
+ "domain": "software",
592
+ "artifact_class": "task",
593
+ "left_score": 0.697,
594
+ "right_score": 0.208,
595
+ "delta": 0.489,
596
+ "outcome": "win"
597
+ },
598
+ {
599
+ "id": "product_project_home",
600
+ "domain": "product",
601
+ "artifact_class": "decision",
602
+ "left_score": 0.627,
603
+ "right_score": 0.323,
604
+ "delta": 0.304,
605
+ "outcome": "win"
606
+ },
607
+ {
608
+ "id": "research_drift_hypothesis",
609
+ "domain": "research",
610
+ "artifact_class": "hypothesis",
611
+ "left_score": 0.675,
612
+ "right_score": 0.323,
613
+ "delta": 0.352,
614
+ "outcome": "win"
615
+ },
616
+ {
617
+ "id": "research_handoff_experiment",
618
+ "domain": "research",
619
+ "artifact_class": "experiment",
620
+ "left_score": 0.711,
621
+ "right_score": 0.2,
622
+ "delta": 0.511,
623
+ "outcome": "win"
624
+ },
625
+ {
626
+ "id": "operations_habanero_checkpoint",
627
+ "domain": "operations",
628
+ "artifact_class": "checkpoint",
629
+ "left_score": 0.679,
630
+ "right_score": 0.285,
631
+ "delta": 0.394,
632
+ "outcome": "win"
633
+ },
634
+ {
635
+ "id": "operations_runner_policy",
636
+ "domain": "operations",
637
+ "artifact_class": "policy",
638
+ "left_score": 0.713,
639
+ "right_score": 0.298,
640
+ "delta": 0.415,
641
+ "outcome": "win"
642
+ },
643
+ {
644
+ "id": "writing_kernel_launch_result",
645
+ "domain": "writing",
646
+ "artifact_class": "result",
647
+ "left_score": 0.708,
648
+ "right_score": 0.285,
649
+ "delta": 0.423,
650
+ "outcome": "win"
651
+ }
652
+ ]
653
+ }
654
+ },
655
+ "claims": [
656
+ {
657
+ "id": "matched_internal_corpus_exists",
658
+ "claim": "ORP has a matched internal comparison corpus spanning multiple domains and all seven kernel artifact classes.",
659
+ "status": "pass"
660
+ },
661
+ {
662
+ "id": "kernel_outscores_generic_checklist_on_matched_corpus",
663
+ "claim": "On the matched internal comparison corpus, kernel artifacts achieve higher mean structural scores than generic checklist artifacts.",
664
+ "status": "pass"
665
+ },
666
+ {
667
+ "id": "kernel_outscores_freeform_on_matched_corpus",
668
+ "claim": "On the matched internal comparison corpus, kernel artifacts achieve higher mean structural scores than free-form artifacts.",
669
+ "status": "pass"
670
+ },
671
+ {
672
+ "id": "generic_checklist_improves_on_freeform_for_structure",
673
+ "claim": "On the matched internal comparison corpus, a generic checklist condition improves structural scores over free-form artifacts.",
674
+ "status": "pass"
675
+ },
676
+ {
677
+ "id": "kernel_preserves_full_required_coverage",
678
+ "claim": "On the matched internal comparison corpus, kernel artifacts preserve full class-specific required-field coverage.",
679
+ "status": "pass"
680
+ }
681
+ ],
682
+ "summary": {
683
+ "all_claims_pass": true,
684
+ "kernel_mean_total_score": 1.0,
685
+ "generic_checklist_mean_total_score": 0.687,
686
+ "freeform_mean_total_score": 0.275
687
+ }
688
+ }