open-research-protocol 0.4.7 → 0.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/cli/orp.py +1158 -43
- package/docs/AGENT_LOOP.md +3 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
- package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
- package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
- package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
- package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
- package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
- package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
- package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
- package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
- package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
- package/docs/ORP_YOUTUBE_INSPECT.md +97 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
- package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
- package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
- package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
- package/examples/README.md +2 -0
- package/examples/kernel/comparison/comparison-corpus.json +337 -0
- package/examples/kernel/comparison/next-task-continuation.json +55 -0
- package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
- package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
- package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
- package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
- package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
- package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
- package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
- package/llms.txt +3 -0
- package/package.json +4 -1
- package/scripts/orp-kernel-agent-pilot.py +673 -0
- package/scripts/orp-kernel-agent-replication.py +307 -0
- package/scripts/orp-kernel-benchmark.py +471 -2
- package/scripts/orp-kernel-canonical-continuation.py +381 -0
- package/scripts/orp-kernel-ci-check.py +138 -0
- package/scripts/orp-kernel-comparison.py +592 -0
- package/scripts/orp-kernel-continuation-pilot.py +384 -0
- package/scripts/orp-kernel-pickup.py +401 -0
- package/spec/v1/kernel-extension.schema.json +96 -0
- package/spec/v1/kernel-proposal.schema.json +115 -0
- package/spec/v1/kernel.schema.json +2 -1
- package/spec/v1/youtube-source.schema.json +151 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema_version": "1.0.0",
|
|
3
|
+
"kind": "orp_reasoning_kernel_canonical_continuation_report",
|
|
4
|
+
"metadata": {
|
|
5
|
+
"generated_at_utc": "2026-03-23T08:59:16Z",
|
|
6
|
+
"repo_commit": "c2f7f2a52744a00fb719d37de583da1f4ae615bd",
|
|
7
|
+
"repo_branch": "main",
|
|
8
|
+
"package_version": "0.4.7",
|
|
9
|
+
"python_version": "3.9.6",
|
|
10
|
+
"codex_version": "codex-cli 0.116.0",
|
|
11
|
+
"platform": "macOS-26.3-arm64-arm-64bit",
|
|
12
|
+
"model": "default"
|
|
13
|
+
},
|
|
14
|
+
"corpus": {
|
|
15
|
+
"cases_total": 1,
|
|
16
|
+
"domains": [
|
|
17
|
+
"software"
|
|
18
|
+
],
|
|
19
|
+
"artifact_classes": [
|
|
20
|
+
"task"
|
|
21
|
+
]
|
|
22
|
+
},
|
|
23
|
+
"conditions": {
|
|
24
|
+
"freeform": {
|
|
25
|
+
"condition": "freeform",
|
|
26
|
+
"cases_total": 1,
|
|
27
|
+
"rows": [
|
|
28
|
+
{
|
|
29
|
+
"id": "software_trace_widget",
|
|
30
|
+
"domain": "software",
|
|
31
|
+
"artifact_class": "task",
|
|
32
|
+
"total_score": 0.633,
|
|
33
|
+
"alignment_score": 0.4,
|
|
34
|
+
"invention_rate": 0.5,
|
|
35
|
+
"missing_list_match": 1.0,
|
|
36
|
+
"answers": {
|
|
37
|
+
"object": "terminal trace widget for lane monitoring",
|
|
38
|
+
"goal": "let operators tell quickly when a lane is drifting",
|
|
39
|
+
"boundary": "sketch the event inputs and compact panel layout",
|
|
40
|
+
"constraints": "stay terminal-first and low friction",
|
|
41
|
+
"success_criteria": null
|
|
42
|
+
},
|
|
43
|
+
"field_similarity": {
|
|
44
|
+
"object": 1.0,
|
|
45
|
+
"goal": 0.333,
|
|
46
|
+
"boundary": 0.0,
|
|
47
|
+
"constraints": 0.5,
|
|
48
|
+
"success_criteria": 0.0
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
],
|
|
52
|
+
"mean_total_score": 0.633,
|
|
53
|
+
"mean_alignment_score": 0.4,
|
|
54
|
+
"mean_invention_rate": 0.5,
|
|
55
|
+
"mean_missing_list_match": 1.0,
|
|
56
|
+
"mean_confidence": 0.87,
|
|
57
|
+
"mean_elapsed_ms": 12235.633
|
|
58
|
+
},
|
|
59
|
+
"generic_checklist": {
|
|
60
|
+
"condition": "generic_checklist",
|
|
61
|
+
"cases_total": 1,
|
|
62
|
+
"rows": [
|
|
63
|
+
{
|
|
64
|
+
"id": "software_trace_widget",
|
|
65
|
+
"domain": "software",
|
|
66
|
+
"artifact_class": "task",
|
|
67
|
+
"total_score": 0.733,
|
|
68
|
+
"alignment_score": 0.6,
|
|
69
|
+
"invention_rate": 0.4,
|
|
70
|
+
"missing_list_match": 1.0,
|
|
71
|
+
"answers": {
|
|
72
|
+
"object": "Define the event schema and first rendering pass for the terminal trace widget.",
|
|
73
|
+
"goal": "Build the terminal trace widget for lane monitoring.",
|
|
74
|
+
"boundary": "Terminal-first lane visibility in active ORP sessions.",
|
|
75
|
+
"constraints": "Low friction; no GUI dependency.",
|
|
76
|
+
"success_criteria": "An operator can identify a drifting lane quickly."
|
|
77
|
+
},
|
|
78
|
+
"field_similarity": {
|
|
79
|
+
"object": 1.0,
|
|
80
|
+
"goal": 0.167,
|
|
81
|
+
"boundary": 1.0,
|
|
82
|
+
"constraints": 1.0,
|
|
83
|
+
"success_criteria": 0.333
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
],
|
|
87
|
+
"mean_total_score": 0.733,
|
|
88
|
+
"mean_alignment_score": 0.6,
|
|
89
|
+
"mean_invention_rate": 0.4,
|
|
90
|
+
"mean_missing_list_match": 1.0,
|
|
91
|
+
"mean_confidence": 0.93,
|
|
92
|
+
"mean_elapsed_ms": 39052.112
|
|
93
|
+
},
|
|
94
|
+
"kernel": {
|
|
95
|
+
"condition": "kernel",
|
|
96
|
+
"cases_total": 1,
|
|
97
|
+
"rows": [
|
|
98
|
+
{
|
|
99
|
+
"id": "software_trace_widget",
|
|
100
|
+
"domain": "software",
|
|
101
|
+
"artifact_class": "task",
|
|
102
|
+
"total_score": 1.0,
|
|
103
|
+
"alignment_score": 1.0,
|
|
104
|
+
"invention_rate": 0.0,
|
|
105
|
+
"missing_list_match": 1.0,
|
|
106
|
+
"answers": {
|
|
107
|
+
"object": "terminal trace widget",
|
|
108
|
+
"goal": "surface lane drift and state clearly for operators",
|
|
109
|
+
"boundary": "terminal-first lane visibility; active ORP sessions only",
|
|
110
|
+
"constraints": "low friction; no GUI dependency",
|
|
111
|
+
"success_criteria": "an operator can identify a drifting lane within 10 seconds; the widget does not overload the terminal surface"
|
|
112
|
+
},
|
|
113
|
+
"field_similarity": {
|
|
114
|
+
"object": 1.0,
|
|
115
|
+
"goal": 1.0,
|
|
116
|
+
"boundary": 1.0,
|
|
117
|
+
"constraints": 1.0,
|
|
118
|
+
"success_criteria": 0.778
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
],
|
|
122
|
+
"mean_total_score": 1.0,
|
|
123
|
+
"mean_alignment_score": 1.0,
|
|
124
|
+
"mean_invention_rate": 0.0,
|
|
125
|
+
"mean_missing_list_match": 1.0,
|
|
126
|
+
"mean_confidence": 0.99,
|
|
127
|
+
"mean_elapsed_ms": 10485.518
|
|
128
|
+
}
|
|
129
|
+
},
|
|
130
|
+
"pairwise": {
|
|
131
|
+
"kernel_vs_generic_checklist": {
|
|
132
|
+
"left": "kernel",
|
|
133
|
+
"right": "generic_checklist",
|
|
134
|
+
"wins": 1,
|
|
135
|
+
"ties": 0,
|
|
136
|
+
"losses": 0,
|
|
137
|
+
"mean_total_score_delta": 0.267
|
|
138
|
+
},
|
|
139
|
+
"kernel_vs_freeform": {
|
|
140
|
+
"left": "kernel",
|
|
141
|
+
"right": "freeform",
|
|
142
|
+
"wins": 1,
|
|
143
|
+
"ties": 0,
|
|
144
|
+
"losses": 0,
|
|
145
|
+
"mean_total_score_delta": 0.367
|
|
146
|
+
}
|
|
147
|
+
},
|
|
148
|
+
"claims": [
|
|
149
|
+
{
|
|
150
|
+
"id": "kernel_outscores_generic_checklist_on_canonical_task_continuation",
|
|
151
|
+
"claim": "On the matched live canonical-task continuation benchmark, kernel artifacts produce task artifacts that meet or exceed generic checklist quality without a higher invention rate.",
|
|
152
|
+
"status": "pass"
|
|
153
|
+
},
|
|
154
|
+
{
|
|
155
|
+
"id": "kernel_outscores_freeform_on_canonical_task_continuation",
|
|
156
|
+
"claim": "On the matched live canonical-task continuation benchmark, kernel artifacts produce stronger next-task artifacts than free-form artifacts.",
|
|
157
|
+
"status": "pass"
|
|
158
|
+
},
|
|
159
|
+
{
|
|
160
|
+
"id": "kernel_minimizes_invention_on_canonical_task_continuation",
|
|
161
|
+
"claim": "On the matched live canonical-task continuation benchmark, kernel artifacts minimize unsupported task-field invention.",
|
|
162
|
+
"status": "pass"
|
|
163
|
+
}
|
|
164
|
+
],
|
|
165
|
+
"summary": {
|
|
166
|
+
"all_claims_pass": true,
|
|
167
|
+
"kernel_mean_total_score": 1.0,
|
|
168
|
+
"generic_checklist_mean_total_score": 0.733,
|
|
169
|
+
"freeform_mean_total_score": 0.633,
|
|
170
|
+
"kernel_mean_invention_rate": 0.0,
|
|
171
|
+
"generic_checklist_mean_invention_rate": 0.4,
|
|
172
|
+
"freeform_mean_invention_rate": 0.5
|
|
173
|
+
}
|
|
174
|
+
}
|