open-research-protocol 0.4.7 → 0.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/cli/orp.py +1158 -43
- package/docs/AGENT_LOOP.md +3 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
- package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
- package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
- package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
- package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
- package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
- package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
- package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
- package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
- package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
- package/docs/ORP_YOUTUBE_INSPECT.md +97 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
- package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
- package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
- package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
- package/examples/README.md +2 -0
- package/examples/kernel/comparison/comparison-corpus.json +337 -0
- package/examples/kernel/comparison/next-task-continuation.json +55 -0
- package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
- package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
- package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
- package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
- package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
- package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
- package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
- package/llms.txt +3 -0
- package/package.json +4 -1
- package/scripts/orp-kernel-agent-pilot.py +673 -0
- package/scripts/orp-kernel-agent-replication.py +307 -0
- package/scripts/orp-kernel-benchmark.py +471 -2
- package/scripts/orp-kernel-canonical-continuation.py +381 -0
- package/scripts/orp-kernel-ci-check.py +138 -0
- package/scripts/orp-kernel-comparison.py +592 -0
- package/scripts/orp-kernel-continuation-pilot.py +384 -0
- package/scripts/orp-kernel-pickup.py +401 -0
- package/spec/v1/kernel-extension.schema.json +96 -0
- package/spec/v1/kernel-proposal.schema.json +115 -0
- package/spec/v1/kernel.schema.json +2 -1
- package/spec/v1/youtube-source.schema.json +151 -0
|
@@ -0,0 +1,594 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema_version": "1.0.0",
|
|
3
|
+
"kind": "orp_reasoning_kernel_pickup_report",
|
|
4
|
+
"metadata": {
|
|
5
|
+
"generated_at_utc": "2026-03-23T06:15:21Z",
|
|
6
|
+
"repo_commit": "c2f7f2a52744a00fb719d37de583da1f4ae615bd",
|
|
7
|
+
"repo_branch": "main",
|
|
8
|
+
"package_version": "0.4.7",
|
|
9
|
+
"python_version": "3.9.6",
|
|
10
|
+
"node_version": "v24.10.0",
|
|
11
|
+
"platform": "macOS-26.3-arm64-arm-64bit"
|
|
12
|
+
},
|
|
13
|
+
"corpus": {
|
|
14
|
+
"source": "examples/kernel/comparison/comparison-corpus.json",
|
|
15
|
+
"cases_total": 7
|
|
16
|
+
},
|
|
17
|
+
"conditions": {
|
|
18
|
+
"freeform": {
|
|
19
|
+
"condition": "freeform",
|
|
20
|
+
"cases_total": 7,
|
|
21
|
+
"rows": [
|
|
22
|
+
{
|
|
23
|
+
"id": "software_trace_widget",
|
|
24
|
+
"domain": "software",
|
|
25
|
+
"artifact_class": "task",
|
|
26
|
+
"pickup_score": 0.333,
|
|
27
|
+
"ambiguity_remaining": 0.667,
|
|
28
|
+
"answered_targets": 2,
|
|
29
|
+
"pickup_targets_total": 3,
|
|
30
|
+
"answers": {
|
|
31
|
+
"object": "terminal trace widget for lane monitoring.",
|
|
32
|
+
"constraints": "stay terminal-first and low friction.",
|
|
33
|
+
"success_criteria": null
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"id": "product_project_home",
|
|
38
|
+
"domain": "product",
|
|
39
|
+
"artifact_class": "decision",
|
|
40
|
+
"pickup_score": 0.5,
|
|
41
|
+
"ambiguity_remaining": 0.5,
|
|
42
|
+
"answered_targets": 3,
|
|
43
|
+
"pickup_targets_total": 3,
|
|
44
|
+
"answers": {
|
|
45
|
+
"question": "should the web app lead with linked projects or the old idea board?",
|
|
46
|
+
"chosen_path": "lead with linked projects first.",
|
|
47
|
+
"consequences": "the old idea board becomes secondary navigation."
|
|
48
|
+
}
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"id": "research_drift_hypothesis",
|
|
52
|
+
"domain": "research",
|
|
53
|
+
"artifact_class": "hypothesis",
|
|
54
|
+
"pickup_score": 0.5,
|
|
55
|
+
"ambiguity_remaining": 0.5,
|
|
56
|
+
"answered_targets": 3,
|
|
57
|
+
"pickup_targets_total": 3,
|
|
58
|
+
"answers": {
|
|
59
|
+
"claim": "short drift summaries will help operators notice stalled lanes faster.",
|
|
60
|
+
"boundary": "terminal-first multi-lane work.",
|
|
61
|
+
"test_path": "compare stalled-lane pickup with and without summaries."
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
"id": "research_handoff_experiment",
|
|
66
|
+
"domain": "research",
|
|
67
|
+
"artifact_class": "experiment",
|
|
68
|
+
"pickup_score": 0.5,
|
|
69
|
+
"ambiguity_remaining": 0.5,
|
|
70
|
+
"answered_targets": 3,
|
|
71
|
+
"pickup_targets_total": 3,
|
|
72
|
+
"answers": {
|
|
73
|
+
"objective": "compare free-form tasks, checklist tasks, and kernel tasks during handoff pickup.",
|
|
74
|
+
"method": "give matched task artifacts to a second operator and time correct interpretation.",
|
|
75
|
+
"outputs": "collect scores and clarification counts."
|
|
76
|
+
}
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"id": "operations_habanero_checkpoint",
|
|
80
|
+
"domain": "operations",
|
|
81
|
+
"artifact_class": "checkpoint",
|
|
82
|
+
"pickup_score": 0.5,
|
|
83
|
+
"ambiguity_remaining": 0.5,
|
|
84
|
+
"answered_targets": 3,
|
|
85
|
+
"pickup_targets_total": 3,
|
|
86
|
+
"answers": {
|
|
87
|
+
"current_state": "the repo is bound and the primary session is routable again.",
|
|
88
|
+
"risks": "other machines may still need a sync.",
|
|
89
|
+
"next_handoff_target": "rerun runner sync on active machines."
|
|
90
|
+
}
|
|
91
|
+
},
|
|
92
|
+
{
|
|
93
|
+
"id": "operations_runner_policy",
|
|
94
|
+
"domain": "operations",
|
|
95
|
+
"artifact_class": "policy",
|
|
96
|
+
"pickup_score": 0.333,
|
|
97
|
+
"ambiguity_remaining": 0.667,
|
|
98
|
+
"answered_targets": 2,
|
|
99
|
+
"pickup_targets_total": 3,
|
|
100
|
+
"answers": {
|
|
101
|
+
"scope": "hosted runner pickup.",
|
|
102
|
+
"rule": "only claim hosted jobs for linked projects that have a routeable local session.",
|
|
103
|
+
"enforcement_surface": null
|
|
104
|
+
}
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"id": "writing_kernel_launch_result",
|
|
108
|
+
"domain": "writing",
|
|
109
|
+
"artifact_class": "result",
|
|
110
|
+
"pickup_score": 0.5,
|
|
111
|
+
"ambiguity_remaining": 0.5,
|
|
112
|
+
"answered_targets": 3,
|
|
113
|
+
"pickup_targets_total": 3,
|
|
114
|
+
"answers": {
|
|
115
|
+
"claim": "ORP shipped the first reasoning kernel release.",
|
|
116
|
+
"status": "shipped in the CLI.",
|
|
117
|
+
"next_follow_up": "run comparative studies against free-form artifacts and checklist artifacts."
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
],
|
|
121
|
+
"mean_pickup_score": 0.452,
|
|
122
|
+
"mean_ambiguity_remaining": 0.548,
|
|
123
|
+
"mean_answered_target_rate": 0.905
|
|
124
|
+
},
|
|
125
|
+
"generic_checklist": {
|
|
126
|
+
"condition": "generic_checklist",
|
|
127
|
+
"cases_total": 7,
|
|
128
|
+
"rows": [
|
|
129
|
+
{
|
|
130
|
+
"id": "software_trace_widget",
|
|
131
|
+
"domain": "software",
|
|
132
|
+
"artifact_class": "task",
|
|
133
|
+
"pickup_score": 0.75,
|
|
134
|
+
"ambiguity_remaining": 0.25,
|
|
135
|
+
"answered_targets": 3,
|
|
136
|
+
"pickup_targets_total": 3,
|
|
137
|
+
"answers": {
|
|
138
|
+
"object": "Build the terminal trace widget for lane monitoring.",
|
|
139
|
+
"constraints": "low friction; no GUI dependency",
|
|
140
|
+
"success_criteria": "An operator can identify a drifting lane quickly."
|
|
141
|
+
}
|
|
142
|
+
},
|
|
143
|
+
{
|
|
144
|
+
"id": "product_project_home",
|
|
145
|
+
"domain": "product",
|
|
146
|
+
"artifact_class": "decision",
|
|
147
|
+
"pickup_score": 0.717,
|
|
148
|
+
"ambiguity_remaining": 0.283,
|
|
149
|
+
"answered_targets": 3,
|
|
150
|
+
"pickup_targets_total": 3,
|
|
151
|
+
"answers": {
|
|
152
|
+
"question": "Choose the linked-project home instead of the old idea-board-first home.",
|
|
153
|
+
"chosen_path": "Lead with linked projects and move broader idea browsing into Pensieve.",
|
|
154
|
+
"consequences": "Idea discovery becomes less prominent on the home screen."
|
|
155
|
+
}
|
|
156
|
+
},
|
|
157
|
+
{
|
|
158
|
+
"id": "research_drift_hypothesis",
|
|
159
|
+
"domain": "research",
|
|
160
|
+
"artifact_class": "hypothesis",
|
|
161
|
+
"pickup_score": 0.75,
|
|
162
|
+
"ambiguity_remaining": 0.25,
|
|
163
|
+
"answered_targets": 3,
|
|
164
|
+
"pickup_targets_total": 3,
|
|
165
|
+
"answers": {
|
|
166
|
+
"claim": "Drift summaries will improve stalled-lane pickup speed.",
|
|
167
|
+
"boundary": "Terminal-first multi-lane workflows.",
|
|
168
|
+
"test_path": "Measure stalled-lane detection time."
|
|
169
|
+
}
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"id": "research_handoff_experiment",
|
|
173
|
+
"domain": "research",
|
|
174
|
+
"artifact_class": "experiment",
|
|
175
|
+
"pickup_score": 0.717,
|
|
176
|
+
"ambiguity_remaining": 0.283,
|
|
177
|
+
"answered_targets": 3,
|
|
178
|
+
"pickup_targets_total": 3,
|
|
179
|
+
"answers": {
|
|
180
|
+
"objective": "Compare handoff pickup across free-form, checklist, and kernel task artifacts.",
|
|
181
|
+
"method": "Give a second operator one artifact at a time and record time to correct interpretation.",
|
|
182
|
+
"outputs": "Capture time to correct interpretation.; Capture clarification counts."
|
|
183
|
+
}
|
|
184
|
+
},
|
|
185
|
+
{
|
|
186
|
+
"id": "operations_habanero_checkpoint",
|
|
187
|
+
"domain": "operations",
|
|
188
|
+
"artifact_class": "checkpoint",
|
|
189
|
+
"pickup_score": 0.75,
|
|
190
|
+
"ambiguity_remaining": 0.25,
|
|
191
|
+
"answered_targets": 3,
|
|
192
|
+
"pickup_targets_total": 3,
|
|
193
|
+
"answers": {
|
|
194
|
+
"current_state": "The local repo, primary session, and hosted world are aligned again.",
|
|
195
|
+
"risks": "Other active machines may still carry stale routing state.",
|
|
196
|
+
"next_handoff_target": "Rerun runner sync on active machines and verify live pickup."
|
|
197
|
+
}
|
|
198
|
+
},
|
|
199
|
+
{
|
|
200
|
+
"id": "operations_runner_policy",
|
|
201
|
+
"domain": "operations",
|
|
202
|
+
"artifact_class": "policy",
|
|
203
|
+
"pickup_score": 0.75,
|
|
204
|
+
"ambiguity_remaining": 0.25,
|
|
205
|
+
"answered_targets": 3,
|
|
206
|
+
"pickup_targets_total": 3,
|
|
207
|
+
"answers": {
|
|
208
|
+
"scope": "Hosted runner job pickup.",
|
|
209
|
+
"rule": "Only claim hosted jobs for linked projects that have a routeable local session.",
|
|
210
|
+
"enforcement_surface": "Runner pickup rejects unroutable jobs."
|
|
211
|
+
}
|
|
212
|
+
},
|
|
213
|
+
{
|
|
214
|
+
"id": "writing_kernel_launch_result",
|
|
215
|
+
"domain": "writing",
|
|
216
|
+
"artifact_class": "result",
|
|
217
|
+
"pickup_score": 0.767,
|
|
218
|
+
"ambiguity_remaining": 0.233,
|
|
219
|
+
"answered_targets": 3,
|
|
220
|
+
"pickup_targets_total": 3,
|
|
221
|
+
"answers": {
|
|
222
|
+
"claim": "ORP shipped the first reasoning kernel release in the CLI.",
|
|
223
|
+
"status": "Release shipped; validation docs published",
|
|
224
|
+
"next_follow_up": "Run comparative artifact and handoff studies next."
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
],
|
|
228
|
+
"mean_pickup_score": 0.743,
|
|
229
|
+
"mean_ambiguity_remaining": 0.257,
|
|
230
|
+
"mean_answered_target_rate": 1.0
|
|
231
|
+
},
|
|
232
|
+
"kernel": {
|
|
233
|
+
"condition": "kernel",
|
|
234
|
+
"cases_total": 7,
|
|
235
|
+
"rows": [
|
|
236
|
+
{
|
|
237
|
+
"id": "software_trace_widget",
|
|
238
|
+
"domain": "software",
|
|
239
|
+
"artifact_class": "task",
|
|
240
|
+
"pickup_score": 1.0,
|
|
241
|
+
"ambiguity_remaining": 0.0,
|
|
242
|
+
"answered_targets": 3,
|
|
243
|
+
"pickup_targets_total": 3,
|
|
244
|
+
"answers": {
|
|
245
|
+
"object": "terminal trace widget",
|
|
246
|
+
"constraints": "low friction; no GUI dependency",
|
|
247
|
+
"success_criteria": "an operator can identify a drifting lane within 10 seconds; the widget does not overload the terminal surface"
|
|
248
|
+
}
|
|
249
|
+
},
|
|
250
|
+
{
|
|
251
|
+
"id": "product_project_home",
|
|
252
|
+
"domain": "product",
|
|
253
|
+
"artifact_class": "decision",
|
|
254
|
+
"pickup_score": 1.0,
|
|
255
|
+
"ambiguity_remaining": 0.0,
|
|
256
|
+
"answered_targets": 3,
|
|
257
|
+
"pickup_targets_total": 3,
|
|
258
|
+
"answers": {
|
|
259
|
+
"question": "Should the web app home foreground linked projects or the old idea board?",
|
|
260
|
+
"chosen_path": "Foreground linked projects and move broad idea browsing into Pensieve.",
|
|
261
|
+
"consequences": "linked projects become the primary home object; idea browsing becomes one click deeper"
|
|
262
|
+
}
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
"id": "research_drift_hypothesis",
|
|
266
|
+
"domain": "research",
|
|
267
|
+
"artifact_class": "hypothesis",
|
|
268
|
+
"pickup_score": 1.0,
|
|
269
|
+
"ambiguity_remaining": 0.0,
|
|
270
|
+
"answered_targets": 3,
|
|
271
|
+
"pickup_targets_total": 3,
|
|
272
|
+
"answers": {
|
|
273
|
+
"claim": "Short drift summaries reduce the time needed to identify stalled lanes.",
|
|
274
|
+
"boundary": "terminal-first multi-lane workflows; operators already monitoring active lanes",
|
|
275
|
+
"test_path": "Run matched stalled-lane pickup trials with and without summaries and compare detection time."
|
|
276
|
+
}
|
|
277
|
+
},
|
|
278
|
+
{
|
|
279
|
+
"id": "research_handoff_experiment",
|
|
280
|
+
"domain": "research",
|
|
281
|
+
"artifact_class": "experiment",
|
|
282
|
+
"pickup_score": 1.0,
|
|
283
|
+
"ambiguity_remaining": 0.0,
|
|
284
|
+
"answered_targets": 3,
|
|
285
|
+
"pickup_targets_total": 3,
|
|
286
|
+
"answers": {
|
|
287
|
+
"objective": "Measure whether kernel task artifacts improve handoff pickup quality over free-form and generic checklist alternatives.",
|
|
288
|
+
"method": "Run matched handoff trials where a second operator receives one artifact at a time and explains the task, constraints, and next action.",
|
|
289
|
+
"outputs": "pickup scores; clarification counts; time to correct interpretation"
|
|
290
|
+
}
|
|
291
|
+
},
|
|
292
|
+
{
|
|
293
|
+
"id": "operations_habanero_checkpoint",
|
|
294
|
+
"domain": "operations",
|
|
295
|
+
"artifact_class": "checkpoint",
|
|
296
|
+
"pickup_score": 1.0,
|
|
297
|
+
"ambiguity_remaining": 0.0,
|
|
298
|
+
"answered_targets": 3,
|
|
299
|
+
"pickup_targets_total": 3,
|
|
300
|
+
"answers": {
|
|
301
|
+
"current_state": "The local project link, primary session, and hosted world are synchronized and routable again.",
|
|
302
|
+
"risks": "inactive machines may still hold stale routing state; older queued jobs may need a fresh sync before pickup",
|
|
303
|
+
"next_handoff_target": "Rerun runner sync on active machines and verify one fresh hosted job pickup."
|
|
304
|
+
}
|
|
305
|
+
},
|
|
306
|
+
{
|
|
307
|
+
"id": "operations_runner_policy",
|
|
308
|
+
"domain": "operations",
|
|
309
|
+
"artifact_class": "policy",
|
|
310
|
+
"pickup_score": 1.0,
|
|
311
|
+
"ambiguity_remaining": 0.0,
|
|
312
|
+
"answered_targets": 3,
|
|
313
|
+
"pickup_targets_total": 3,
|
|
314
|
+
"answers": {
|
|
315
|
+
"scope": "Hosted runner job pickup and claim behavior.",
|
|
316
|
+
"rule": "Only claim a hosted job when the linked project has a routeable local session on the current machine.",
|
|
317
|
+
"enforcement_surface": "runner sync, poll, and work lifecycle"
|
|
318
|
+
}
|
|
319
|
+
},
|
|
320
|
+
{
|
|
321
|
+
"id": "writing_kernel_launch_result",
|
|
322
|
+
"domain": "writing",
|
|
323
|
+
"artifact_class": "result",
|
|
324
|
+
"pickup_score": 1.0,
|
|
325
|
+
"ambiguity_remaining": 0.0,
|
|
326
|
+
"answered_targets": 3,
|
|
327
|
+
"pickup_targets_total": 3,
|
|
328
|
+
"answers": {
|
|
329
|
+
"claim": "ORP shipped the first reasoning kernel release as a real CLI protocol surface.",
|
|
330
|
+
"status": "shipped in the ORP CLI and published to npm",
|
|
331
|
+
"next_follow_up": "Run the comparative artifact, pickup, and corpus-fit studies."
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
],
|
|
335
|
+
"mean_pickup_score": 1.0,
|
|
336
|
+
"mean_ambiguity_remaining": 0.0,
|
|
337
|
+
"mean_answered_target_rate": 1.0
|
|
338
|
+
}
|
|
339
|
+
},
|
|
340
|
+
"pairwise": {
|
|
341
|
+
"kernel_vs_generic_checklist": {
|
|
342
|
+
"left": "kernel",
|
|
343
|
+
"right": "generic_checklist",
|
|
344
|
+
"wins": 7,
|
|
345
|
+
"ties": 0,
|
|
346
|
+
"losses": 0,
|
|
347
|
+
"mean_pickup_score_delta": 0.257,
|
|
348
|
+
"by_case": [
|
|
349
|
+
{
|
|
350
|
+
"id": "software_trace_widget",
|
|
351
|
+
"domain": "software",
|
|
352
|
+
"artifact_class": "task",
|
|
353
|
+
"left_score": 1.0,
|
|
354
|
+
"right_score": 0.75,
|
|
355
|
+
"delta": 0.25,
|
|
356
|
+
"outcome": "win"
|
|
357
|
+
},
|
|
358
|
+
{
|
|
359
|
+
"id": "product_project_home",
|
|
360
|
+
"domain": "product",
|
|
361
|
+
"artifact_class": "decision",
|
|
362
|
+
"left_score": 1.0,
|
|
363
|
+
"right_score": 0.717,
|
|
364
|
+
"delta": 0.283,
|
|
365
|
+
"outcome": "win"
|
|
366
|
+
},
|
|
367
|
+
{
|
|
368
|
+
"id": "research_drift_hypothesis",
|
|
369
|
+
"domain": "research",
|
|
370
|
+
"artifact_class": "hypothesis",
|
|
371
|
+
"left_score": 1.0,
|
|
372
|
+
"right_score": 0.75,
|
|
373
|
+
"delta": 0.25,
|
|
374
|
+
"outcome": "win"
|
|
375
|
+
},
|
|
376
|
+
{
|
|
377
|
+
"id": "research_handoff_experiment",
|
|
378
|
+
"domain": "research",
|
|
379
|
+
"artifact_class": "experiment",
|
|
380
|
+
"left_score": 1.0,
|
|
381
|
+
"right_score": 0.717,
|
|
382
|
+
"delta": 0.283,
|
|
383
|
+
"outcome": "win"
|
|
384
|
+
},
|
|
385
|
+
{
|
|
386
|
+
"id": "operations_habanero_checkpoint",
|
|
387
|
+
"domain": "operations",
|
|
388
|
+
"artifact_class": "checkpoint",
|
|
389
|
+
"left_score": 1.0,
|
|
390
|
+
"right_score": 0.75,
|
|
391
|
+
"delta": 0.25,
|
|
392
|
+
"outcome": "win"
|
|
393
|
+
},
|
|
394
|
+
{
|
|
395
|
+
"id": "operations_runner_policy",
|
|
396
|
+
"domain": "operations",
|
|
397
|
+
"artifact_class": "policy",
|
|
398
|
+
"left_score": 1.0,
|
|
399
|
+
"right_score": 0.75,
|
|
400
|
+
"delta": 0.25,
|
|
401
|
+
"outcome": "win"
|
|
402
|
+
},
|
|
403
|
+
{
|
|
404
|
+
"id": "writing_kernel_launch_result",
|
|
405
|
+
"domain": "writing",
|
|
406
|
+
"artifact_class": "result",
|
|
407
|
+
"left_score": 1.0,
|
|
408
|
+
"right_score": 0.767,
|
|
409
|
+
"delta": 0.233,
|
|
410
|
+
"outcome": "win"
|
|
411
|
+
}
|
|
412
|
+
]
|
|
413
|
+
},
|
|
414
|
+
"kernel_vs_freeform": {
|
|
415
|
+
"left": "kernel",
|
|
416
|
+
"right": "freeform",
|
|
417
|
+
"wins": 7,
|
|
418
|
+
"ties": 0,
|
|
419
|
+
"losses": 0,
|
|
420
|
+
"mean_pickup_score_delta": 0.548,
|
|
421
|
+
"by_case": [
|
|
422
|
+
{
|
|
423
|
+
"id": "software_trace_widget",
|
|
424
|
+
"domain": "software",
|
|
425
|
+
"artifact_class": "task",
|
|
426
|
+
"left_score": 1.0,
|
|
427
|
+
"right_score": 0.333,
|
|
428
|
+
"delta": 0.667,
|
|
429
|
+
"outcome": "win"
|
|
430
|
+
},
|
|
431
|
+
{
|
|
432
|
+
"id": "product_project_home",
|
|
433
|
+
"domain": "product",
|
|
434
|
+
"artifact_class": "decision",
|
|
435
|
+
"left_score": 1.0,
|
|
436
|
+
"right_score": 0.5,
|
|
437
|
+
"delta": 0.5,
|
|
438
|
+
"outcome": "win"
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"id": "research_drift_hypothesis",
|
|
442
|
+
"domain": "research",
|
|
443
|
+
"artifact_class": "hypothesis",
|
|
444
|
+
"left_score": 1.0,
|
|
445
|
+
"right_score": 0.5,
|
|
446
|
+
"delta": 0.5,
|
|
447
|
+
"outcome": "win"
|
|
448
|
+
},
|
|
449
|
+
{
|
|
450
|
+
"id": "research_handoff_experiment",
|
|
451
|
+
"domain": "research",
|
|
452
|
+
"artifact_class": "experiment",
|
|
453
|
+
"left_score": 1.0,
|
|
454
|
+
"right_score": 0.5,
|
|
455
|
+
"delta": 0.5,
|
|
456
|
+
"outcome": "win"
|
|
457
|
+
},
|
|
458
|
+
{
|
|
459
|
+
"id": "operations_habanero_checkpoint",
|
|
460
|
+
"domain": "operations",
|
|
461
|
+
"artifact_class": "checkpoint",
|
|
462
|
+
"left_score": 1.0,
|
|
463
|
+
"right_score": 0.5,
|
|
464
|
+
"delta": 0.5,
|
|
465
|
+
"outcome": "win"
|
|
466
|
+
},
|
|
467
|
+
{
|
|
468
|
+
"id": "operations_runner_policy",
|
|
469
|
+
"domain": "operations",
|
|
470
|
+
"artifact_class": "policy",
|
|
471
|
+
"left_score": 1.0,
|
|
472
|
+
"right_score": 0.333,
|
|
473
|
+
"delta": 0.667,
|
|
474
|
+
"outcome": "win"
|
|
475
|
+
},
|
|
476
|
+
{
|
|
477
|
+
"id": "writing_kernel_launch_result",
|
|
478
|
+
"domain": "writing",
|
|
479
|
+
"artifact_class": "result",
|
|
480
|
+
"left_score": 1.0,
|
|
481
|
+
"right_score": 0.5,
|
|
482
|
+
"delta": 0.5,
|
|
483
|
+
"outcome": "win"
|
|
484
|
+
}
|
|
485
|
+
]
|
|
486
|
+
},
|
|
487
|
+
"generic_checklist_vs_freeform": {
|
|
488
|
+
"left": "generic_checklist",
|
|
489
|
+
"right": "freeform",
|
|
490
|
+
"wins": 7,
|
|
491
|
+
"ties": 0,
|
|
492
|
+
"losses": 0,
|
|
493
|
+
"mean_pickup_score_delta": 0.291,
|
|
494
|
+
"by_case": [
|
|
495
|
+
{
|
|
496
|
+
"id": "software_trace_widget",
|
|
497
|
+
"domain": "software",
|
|
498
|
+
"artifact_class": "task",
|
|
499
|
+
"left_score": 0.75,
|
|
500
|
+
"right_score": 0.333,
|
|
501
|
+
"delta": 0.417,
|
|
502
|
+
"outcome": "win"
|
|
503
|
+
},
|
|
504
|
+
{
|
|
505
|
+
"id": "product_project_home",
|
|
506
|
+
"domain": "product",
|
|
507
|
+
"artifact_class": "decision",
|
|
508
|
+
"left_score": 0.717,
|
|
509
|
+
"right_score": 0.5,
|
|
510
|
+
"delta": 0.217,
|
|
511
|
+
"outcome": "win"
|
|
512
|
+
},
|
|
513
|
+
{
|
|
514
|
+
"id": "research_drift_hypothesis",
|
|
515
|
+
"domain": "research",
|
|
516
|
+
"artifact_class": "hypothesis",
|
|
517
|
+
"left_score": 0.75,
|
|
518
|
+
"right_score": 0.5,
|
|
519
|
+
"delta": 0.25,
|
|
520
|
+
"outcome": "win"
|
|
521
|
+
},
|
|
522
|
+
{
|
|
523
|
+
"id": "research_handoff_experiment",
|
|
524
|
+
"domain": "research",
|
|
525
|
+
"artifact_class": "experiment",
|
|
526
|
+
"left_score": 0.717,
|
|
527
|
+
"right_score": 0.5,
|
|
528
|
+
"delta": 0.217,
|
|
529
|
+
"outcome": "win"
|
|
530
|
+
},
|
|
531
|
+
{
|
|
532
|
+
"id": "operations_habanero_checkpoint",
|
|
533
|
+
"domain": "operations",
|
|
534
|
+
"artifact_class": "checkpoint",
|
|
535
|
+
"left_score": 0.75,
|
|
536
|
+
"right_score": 0.5,
|
|
537
|
+
"delta": 0.25,
|
|
538
|
+
"outcome": "win"
|
|
539
|
+
},
|
|
540
|
+
{
|
|
541
|
+
"id": "operations_runner_policy",
|
|
542
|
+
"domain": "operations",
|
|
543
|
+
"artifact_class": "policy",
|
|
544
|
+
"left_score": 0.75,
|
|
545
|
+
"right_score": 0.333,
|
|
546
|
+
"delta": 0.417,
|
|
547
|
+
"outcome": "win"
|
|
548
|
+
},
|
|
549
|
+
{
|
|
550
|
+
"id": "writing_kernel_launch_result",
|
|
551
|
+
"domain": "writing",
|
|
552
|
+
"artifact_class": "result",
|
|
553
|
+
"left_score": 0.767,
|
|
554
|
+
"right_score": 0.5,
|
|
555
|
+
"delta": 0.267,
|
|
556
|
+
"outcome": "win"
|
|
557
|
+
}
|
|
558
|
+
]
|
|
559
|
+
}
|
|
560
|
+
},
|
|
561
|
+
"claims": [
|
|
562
|
+
{
|
|
563
|
+
"id": "matched_pickup_corpus_exists",
|
|
564
|
+
"claim": "ORP has a matched internal pickup corpus spanning all seven kernel artifact classes.",
|
|
565
|
+
"status": "pass"
|
|
566
|
+
},
|
|
567
|
+
{
|
|
568
|
+
"id": "kernel_outscores_generic_checklist_on_pickup_proxy",
|
|
569
|
+
"claim": "On the matched internal pickup proxy, kernel artifacts preserve more explicit pickup-ready information than generic checklist artifacts.",
|
|
570
|
+
"status": "pass"
|
|
571
|
+
},
|
|
572
|
+
{
|
|
573
|
+
"id": "kernel_outscores_freeform_on_pickup_proxy",
|
|
574
|
+
"claim": "On the matched internal pickup proxy, kernel artifacts preserve more explicit pickup-ready information than free-form artifacts.",
|
|
575
|
+
"status": "pass"
|
|
576
|
+
},
|
|
577
|
+
{
|
|
578
|
+
"id": "generic_checklist_improves_on_freeform_on_pickup_proxy",
|
|
579
|
+
"claim": "On the matched internal pickup proxy, a generic checklist preserves more explicit pickup-ready information than free-form artifacts.",
|
|
580
|
+
"status": "pass"
|
|
581
|
+
},
|
|
582
|
+
{
|
|
583
|
+
"id": "kernel_preserves_full_pickup_targets",
|
|
584
|
+
"claim": "On the matched internal pickup proxy, kernel artifacts keep all pickup targets explicitly answerable.",
|
|
585
|
+
"status": "pass"
|
|
586
|
+
}
|
|
587
|
+
],
|
|
588
|
+
"summary": {
|
|
589
|
+
"all_claims_pass": true,
|
|
590
|
+
"kernel_mean_pickup_score": 1.0,
|
|
591
|
+
"generic_checklist_mean_pickup_score": 0.743,
|
|
592
|
+
"freeform_mean_pickup_score": 0.452
|
|
593
|
+
}
|
|
594
|
+
}
|