open-research-protocol 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/cli/orp.py +668 -43
- package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
- package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
- package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
- package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
- package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
- package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
- package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
- package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
- package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
- package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
- package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
- package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
- package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
- package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
- package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
- package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
- package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
- package/examples/README.md +2 -0
- package/examples/kernel/comparison/comparison-corpus.json +337 -0
- package/examples/kernel/comparison/next-task-continuation.json +55 -0
- package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
- package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
- package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
- package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
- package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
- package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
- package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
- package/package.json +4 -1
- package/scripts/orp-kernel-agent-pilot.py +673 -0
- package/scripts/orp-kernel-agent-replication.py +307 -0
- package/scripts/orp-kernel-benchmark.py +471 -2
- package/scripts/orp-kernel-canonical-continuation.py +381 -0
- package/scripts/orp-kernel-ci-check.py +138 -0
- package/scripts/orp-kernel-comparison.py +592 -0
- package/scripts/orp-kernel-continuation-pilot.py +384 -0
- package/scripts/orp-kernel-pickup.py +401 -0
- package/spec/v1/kernel-extension.schema.json +96 -0
- package/spec/v1/kernel-proposal.schema.json +115 -0
- package/spec/v1/kernel.schema.json +2 -1
|
@@ -0,0 +1,1927 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema_version": "1.0.0",
|
|
3
|
+
"kind": "orp_reasoning_kernel_agent_replication_report",
|
|
4
|
+
"metadata": {
|
|
5
|
+
"generated_at_utc": "2026-03-23T08:40:58Z",
|
|
6
|
+
"repo_commit": "c2f7f2a52744a00fb719d37de583da1f4ae615bd",
|
|
7
|
+
"repo_branch": "main",
|
|
8
|
+
"package_version": "0.4.7",
|
|
9
|
+
"python_version": "3.9.6",
|
|
10
|
+
"codex_version": "codex-cli 0.116.0",
|
|
11
|
+
"platform": "macOS-26.3-arm64-arm-64bit",
|
|
12
|
+
"model": "default",
|
|
13
|
+
"repeats": 2
|
|
14
|
+
},
|
|
15
|
+
"runs": [
|
|
16
|
+
{
|
|
17
|
+
"run_index": 1,
|
|
18
|
+
"summary": {
|
|
19
|
+
"all_claims_pass": true,
|
|
20
|
+
"kernel_mean_pickup_score": 1.0,
|
|
21
|
+
"generic_checklist_mean_pickup_score": 0.781,
|
|
22
|
+
"freeform_mean_pickup_score": 0.724,
|
|
23
|
+
"kernel_mean_invention_rate": 0.0,
|
|
24
|
+
"generic_checklist_mean_invention_rate": 0.0,
|
|
25
|
+
"freeform_mean_invention_rate": 0.0
|
|
26
|
+
},
|
|
27
|
+
"conditions": {
|
|
28
|
+
"freeform": {
|
|
29
|
+
"condition": "freeform",
|
|
30
|
+
"cases_total": 7,
|
|
31
|
+
"rows": [
|
|
32
|
+
{
|
|
33
|
+
"id": "software_trace_widget",
|
|
34
|
+
"domain": "software",
|
|
35
|
+
"artifact_class": "task",
|
|
36
|
+
"pickup_score": 0.6,
|
|
37
|
+
"ambiguity_remaining": 0.4,
|
|
38
|
+
"answered_targets": 3,
|
|
39
|
+
"pickup_targets_total": 5,
|
|
40
|
+
"expected_present_fields": [
|
|
41
|
+
"constraints",
|
|
42
|
+
"goal",
|
|
43
|
+
"object"
|
|
44
|
+
],
|
|
45
|
+
"answers": {
|
|
46
|
+
"object": "terminal trace widget for lane monitoring.",
|
|
47
|
+
"goal": "let operators tell quickly when a lane is drifting.",
|
|
48
|
+
"boundary": null,
|
|
49
|
+
"constraints": "stay terminal-first and low friction.",
|
|
50
|
+
"success_criteria": null
|
|
51
|
+
},
|
|
52
|
+
"invented_fields": [],
|
|
53
|
+
"invention_rate": 0.0,
|
|
54
|
+
"artifact_type_guess": "brief product note",
|
|
55
|
+
"confidence": 0.97,
|
|
56
|
+
"ambiguities_count": 3,
|
|
57
|
+
"elapsed_ms": 13484.248,
|
|
58
|
+
"tokens_used": null,
|
|
59
|
+
"session_id": ""
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"id": "product_project_home",
|
|
63
|
+
"domain": "product",
|
|
64
|
+
"artifact_class": "decision",
|
|
65
|
+
"pickup_score": 0.8,
|
|
66
|
+
"ambiguity_remaining": 0.2,
|
|
67
|
+
"answered_targets": 4,
|
|
68
|
+
"pickup_targets_total": 5,
|
|
69
|
+
"expected_present_fields": [
|
|
70
|
+
"chosen_path",
|
|
71
|
+
"consequences",
|
|
72
|
+
"question",
|
|
73
|
+
"rationale"
|
|
74
|
+
],
|
|
75
|
+
"answers": {
|
|
76
|
+
"question": "should the web app lead with linked projects or the old idea board?",
|
|
77
|
+
"chosen_path": "lead with linked projects first.",
|
|
78
|
+
"rejected_alternatives": null,
|
|
79
|
+
"rationale": "active work should be foregrounded and idea browsing can move into Pensieve.",
|
|
80
|
+
"consequences": "the old idea board becomes secondary navigation."
|
|
81
|
+
},
|
|
82
|
+
"invented_fields": [],
|
|
83
|
+
"invention_rate": 0.0,
|
|
84
|
+
"artifact_type_guess": "product decision note",
|
|
85
|
+
"confidence": 0.96,
|
|
86
|
+
"ambiguities_count": 3,
|
|
87
|
+
"elapsed_ms": 15078.676,
|
|
88
|
+
"tokens_used": null,
|
|
89
|
+
"session_id": ""
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"id": "research_drift_hypothesis",
|
|
93
|
+
"domain": "research",
|
|
94
|
+
"artifact_class": "hypothesis",
|
|
95
|
+
"pickup_score": 0.8,
|
|
96
|
+
"ambiguity_remaining": 0.2,
|
|
97
|
+
"answered_targets": 4,
|
|
98
|
+
"pickup_targets_total": 5,
|
|
99
|
+
"expected_present_fields": [
|
|
100
|
+
"assumptions",
|
|
101
|
+
"boundary",
|
|
102
|
+
"claim",
|
|
103
|
+
"test_path"
|
|
104
|
+
],
|
|
105
|
+
"answers": {
|
|
106
|
+
"claim": "short drift summaries will help operators notice stalled lanes faster.",
|
|
107
|
+
"boundary": "terminal-first multi-lane work.",
|
|
108
|
+
"assumptions": "operators glance at summaries while they work.",
|
|
109
|
+
"test_path": "compare stalled-lane pickup with and without summaries.",
|
|
110
|
+
"falsifiers": null
|
|
111
|
+
},
|
|
112
|
+
"invented_fields": [],
|
|
113
|
+
"invention_rate": 0.0,
|
|
114
|
+
"artifact_type_guess": "hypothesis summary",
|
|
115
|
+
"confidence": 0.94,
|
|
116
|
+
"ambiguities_count": 3,
|
|
117
|
+
"elapsed_ms": 16733.104,
|
|
118
|
+
"tokens_used": null,
|
|
119
|
+
"session_id": ""
|
|
120
|
+
},
|
|
121
|
+
{
|
|
122
|
+
"id": "research_handoff_experiment",
|
|
123
|
+
"domain": "research",
|
|
124
|
+
"artifact_class": "experiment",
|
|
125
|
+
"pickup_score": 0.667,
|
|
126
|
+
"ambiguity_remaining": 0.333,
|
|
127
|
+
"answered_targets": 4,
|
|
128
|
+
"pickup_targets_total": 6,
|
|
129
|
+
"expected_present_fields": [
|
|
130
|
+
"evidence_expectations",
|
|
131
|
+
"interpretation_limits",
|
|
132
|
+
"method",
|
|
133
|
+
"objective",
|
|
134
|
+
"outputs"
|
|
135
|
+
],
|
|
136
|
+
"answers": {
|
|
137
|
+
"objective": "compare free-form tasks, checklist tasks, and kernel tasks during handoff pickup.",
|
|
138
|
+
"method": "give matched task artifacts to a second operator and time correct interpretation.",
|
|
139
|
+
"inputs": null,
|
|
140
|
+
"outputs": null,
|
|
141
|
+
"evidence_expectations": "collect scores and clarification counts.",
|
|
142
|
+
"interpretation_limits": "internal sample only."
|
|
143
|
+
},
|
|
144
|
+
"invented_fields": [],
|
|
145
|
+
"invention_rate": 0.0,
|
|
146
|
+
"artifact_type_guess": "experiment handoff note",
|
|
147
|
+
"confidence": 0.95,
|
|
148
|
+
"ambiguities_count": 3,
|
|
149
|
+
"elapsed_ms": 13288.149,
|
|
150
|
+
"tokens_used": null,
|
|
151
|
+
"session_id": ""
|
|
152
|
+
},
|
|
153
|
+
{
|
|
154
|
+
"id": "operations_habanero_checkpoint",
|
|
155
|
+
"domain": "operations",
|
|
156
|
+
"artifact_class": "checkpoint",
|
|
157
|
+
"pickup_score": 0.8,
|
|
158
|
+
"ambiguity_remaining": 0.2,
|
|
159
|
+
"answered_targets": 4,
|
|
160
|
+
"pickup_targets_total": 5,
|
|
161
|
+
"expected_present_fields": [
|
|
162
|
+
"completed_unit",
|
|
163
|
+
"current_state",
|
|
164
|
+
"next_handoff_target",
|
|
165
|
+
"risks"
|
|
166
|
+
],
|
|
167
|
+
"answers": {
|
|
168
|
+
"completed_unit": "restored linked-project routing for Habanero.",
|
|
169
|
+
"current_state": "the repo is bound and the primary session is routable again.",
|
|
170
|
+
"risks": "other machines may still need a sync.",
|
|
171
|
+
"next_handoff_target": "rerun runner sync on active machines.",
|
|
172
|
+
"artifact_refs": null
|
|
173
|
+
},
|
|
174
|
+
"invented_fields": [],
|
|
175
|
+
"invention_rate": 0.0,
|
|
176
|
+
"artifact_type_guess": "checkpoint note",
|
|
177
|
+
"confidence": 0.97,
|
|
178
|
+
"ambiguities_count": 2,
|
|
179
|
+
"elapsed_ms": 30009.015,
|
|
180
|
+
"tokens_used": null,
|
|
181
|
+
"session_id": ""
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
"id": "operations_runner_policy",
|
|
185
|
+
"domain": "operations",
|
|
186
|
+
"artifact_class": "policy",
|
|
187
|
+
"pickup_score": 0.8,
|
|
188
|
+
"ambiguity_remaining": 0.2,
|
|
189
|
+
"answered_targets": 4,
|
|
190
|
+
"pickup_targets_total": 5,
|
|
191
|
+
"expected_present_fields": [
|
|
192
|
+
"invariants",
|
|
193
|
+
"rationale",
|
|
194
|
+
"rule",
|
|
195
|
+
"scope"
|
|
196
|
+
],
|
|
197
|
+
"answers": {
|
|
198
|
+
"scope": "hosted runner pickup",
|
|
199
|
+
"rule": "only claim hosted jobs for linked projects that have a routeable local session",
|
|
200
|
+
"rationale": "avoid claiming work with nowhere real to execute",
|
|
201
|
+
"invariants": "a claimed job must resolve to an actual local session",
|
|
202
|
+
"enforcement_surface": null
|
|
203
|
+
},
|
|
204
|
+
"invented_fields": [],
|
|
205
|
+
"invention_rate": 0.0,
|
|
206
|
+
"artifact_type_guess": "policy",
|
|
207
|
+
"confidence": 0.95,
|
|
208
|
+
"ambiguities_count": 3,
|
|
209
|
+
"elapsed_ms": 15977.877,
|
|
210
|
+
"tokens_used": null,
|
|
211
|
+
"session_id": ""
|
|
212
|
+
},
|
|
213
|
+
{
|
|
214
|
+
"id": "writing_kernel_launch_result",
|
|
215
|
+
"domain": "writing",
|
|
216
|
+
"artifact_class": "result",
|
|
217
|
+
"pickup_score": 0.6,
|
|
218
|
+
"ambiguity_remaining": 0.4,
|
|
219
|
+
"answered_targets": 3,
|
|
220
|
+
"pickup_targets_total": 5,
|
|
221
|
+
"expected_present_fields": [
|
|
222
|
+
"claim",
|
|
223
|
+
"evidence_paths",
|
|
224
|
+
"next_follow_up",
|
|
225
|
+
"status"
|
|
226
|
+
],
|
|
227
|
+
"answers": {
|
|
228
|
+
"claim": "ORP shipped the first reasoning kernel release.",
|
|
229
|
+
"evidence_paths": null,
|
|
230
|
+
"status": "shipped in the CLI.",
|
|
231
|
+
"interpretation_limits": null,
|
|
232
|
+
"next_follow_up": "run comparative studies against free-form artifacts and checklist artifacts."
|
|
233
|
+
},
|
|
234
|
+
"invented_fields": [],
|
|
235
|
+
"invention_rate": 0.0,
|
|
236
|
+
"artifact_type_guess": "kernel launch result summary",
|
|
237
|
+
"confidence": 0.94,
|
|
238
|
+
"ambiguities_count": 2,
|
|
239
|
+
"elapsed_ms": 22163.621,
|
|
240
|
+
"tokens_used": null,
|
|
241
|
+
"session_id": ""
|
|
242
|
+
}
|
|
243
|
+
],
|
|
244
|
+
"mean_pickup_score": 0.724,
|
|
245
|
+
"mean_ambiguity_remaining": 0.276,
|
|
246
|
+
"mean_answered_target_rate": 0.724,
|
|
247
|
+
"mean_confidence": 0.954,
|
|
248
|
+
"mean_ambiguities_count": 2.714,
|
|
249
|
+
"mean_invention_rate": 0.0,
|
|
250
|
+
"mean_elapsed_ms": 18104.956,
|
|
251
|
+
"mean_tokens_used": null
|
|
252
|
+
},
|
|
253
|
+
"generic_checklist": {
|
|
254
|
+
"condition": "generic_checklist",
|
|
255
|
+
"cases_total": 7,
|
|
256
|
+
"rows": [
|
|
257
|
+
{
|
|
258
|
+
"id": "software_trace_widget",
|
|
259
|
+
"domain": "software",
|
|
260
|
+
"artifact_class": "task",
|
|
261
|
+
"pickup_score": 0.8,
|
|
262
|
+
"ambiguity_remaining": 0.2,
|
|
263
|
+
"answered_targets": 4,
|
|
264
|
+
"pickup_targets_total": 5,
|
|
265
|
+
"expected_present_fields": [
|
|
266
|
+
"boundary",
|
|
267
|
+
"constraints",
|
|
268
|
+
"goal",
|
|
269
|
+
"object",
|
|
270
|
+
"success_criteria"
|
|
271
|
+
],
|
|
272
|
+
"answers": {
|
|
273
|
+
"object": null,
|
|
274
|
+
"goal": "Build the terminal trace widget for lane monitoring.",
|
|
275
|
+
"boundary": "Terminal-first lane visibility in active ORP sessions.",
|
|
276
|
+
"constraints": "low friction; no GUI dependency",
|
|
277
|
+
"success_criteria": "An operator can identify a drifting lane quickly."
|
|
278
|
+
},
|
|
279
|
+
"invented_fields": [],
|
|
280
|
+
"invention_rate": 0.0,
|
|
281
|
+
"artifact_type_guess": "task",
|
|
282
|
+
"confidence": 0.86,
|
|
283
|
+
"ambiguities_count": 2,
|
|
284
|
+
"elapsed_ms": 50571.657,
|
|
285
|
+
"tokens_used": null,
|
|
286
|
+
"session_id": ""
|
|
287
|
+
},
|
|
288
|
+
{
|
|
289
|
+
"id": "product_project_home",
|
|
290
|
+
"domain": "product",
|
|
291
|
+
"artifact_class": "decision",
|
|
292
|
+
"pickup_score": 0.6,
|
|
293
|
+
"ambiguity_remaining": 0.4,
|
|
294
|
+
"answered_targets": 3,
|
|
295
|
+
"pickup_targets_total": 5,
|
|
296
|
+
"expected_present_fields": [
|
|
297
|
+
"chosen_path",
|
|
298
|
+
"consequences",
|
|
299
|
+
"question",
|
|
300
|
+
"rationale",
|
|
301
|
+
"rejected_alternatives"
|
|
302
|
+
],
|
|
303
|
+
"answers": {
|
|
304
|
+
"question": null,
|
|
305
|
+
"chosen_path": "Lead with linked projects and move broader idea browsing into Pensieve.",
|
|
306
|
+
"rejected_alternatives": "keep the idea board as the front door.",
|
|
307
|
+
"rationale": null,
|
|
308
|
+
"consequences": "Idea discovery becomes less prominent on the home screen."
|
|
309
|
+
},
|
|
310
|
+
"invented_fields": [],
|
|
311
|
+
"invention_rate": 0.0,
|
|
312
|
+
"artifact_type_guess": "decision",
|
|
313
|
+
"confidence": 0.93,
|
|
314
|
+
"ambiguities_count": 3,
|
|
315
|
+
"elapsed_ms": 29773.129,
|
|
316
|
+
"tokens_used": null,
|
|
317
|
+
"session_id": ""
|
|
318
|
+
},
|
|
319
|
+
{
|
|
320
|
+
"id": "research_drift_hypothesis",
|
|
321
|
+
"domain": "research",
|
|
322
|
+
"artifact_class": "hypothesis",
|
|
323
|
+
"pickup_score": 0.6,
|
|
324
|
+
"ambiguity_remaining": 0.4,
|
|
325
|
+
"answered_targets": 3,
|
|
326
|
+
"pickup_targets_total": 5,
|
|
327
|
+
"expected_present_fields": [
|
|
328
|
+
"assumptions",
|
|
329
|
+
"boundary",
|
|
330
|
+
"claim",
|
|
331
|
+
"falsifiers",
|
|
332
|
+
"test_path"
|
|
333
|
+
],
|
|
334
|
+
"answers": {
|
|
335
|
+
"claim": "Drift summaries will improve stalled-lane pickup speed.",
|
|
336
|
+
"boundary": "Terminal-first multi-lane workflows.",
|
|
337
|
+
"assumptions": "Assumes operators actually consult the summary lane.",
|
|
338
|
+
"test_path": null,
|
|
339
|
+
"falsifiers": null
|
|
340
|
+
},
|
|
341
|
+
"invented_fields": [],
|
|
342
|
+
"invention_rate": 0.0,
|
|
343
|
+
"artifact_type_guess": "hypothesis",
|
|
344
|
+
"confidence": 0.93,
|
|
345
|
+
"ambiguities_count": 3,
|
|
346
|
+
"elapsed_ms": 28154.147,
|
|
347
|
+
"tokens_used": null,
|
|
348
|
+
"session_id": ""
|
|
349
|
+
},
|
|
350
|
+
{
|
|
351
|
+
"id": "research_handoff_experiment",
|
|
352
|
+
"domain": "research",
|
|
353
|
+
"artifact_class": "experiment",
|
|
354
|
+
"pickup_score": 0.667,
|
|
355
|
+
"ambiguity_remaining": 0.333,
|
|
356
|
+
"answered_targets": 4,
|
|
357
|
+
"pickup_targets_total": 6,
|
|
358
|
+
"expected_present_fields": [
|
|
359
|
+
"evidence_expectations",
|
|
360
|
+
"inputs",
|
|
361
|
+
"interpretation_limits",
|
|
362
|
+
"method",
|
|
363
|
+
"objective",
|
|
364
|
+
"outputs"
|
|
365
|
+
],
|
|
366
|
+
"answers": {
|
|
367
|
+
"objective": "Compare handoff pickup across free-form, checklist, and kernel task artifacts.",
|
|
368
|
+
"method": "Give a second operator one artifact at a time and record time to correct interpretation.",
|
|
369
|
+
"inputs": null,
|
|
370
|
+
"outputs": null,
|
|
371
|
+
"evidence_expectations": "[\"pickup timings\",\"clarification counts\"]",
|
|
372
|
+
"interpretation_limits": "Small sample may limit interpretation."
|
|
373
|
+
},
|
|
374
|
+
"invented_fields": [],
|
|
375
|
+
"invention_rate": 0.0,
|
|
376
|
+
"artifact_type_guess": "experiment",
|
|
377
|
+
"confidence": 0.88,
|
|
378
|
+
"ambiguities_count": 3,
|
|
379
|
+
"elapsed_ms": 57054.329,
|
|
380
|
+
"tokens_used": null,
|
|
381
|
+
"session_id": ""
|
|
382
|
+
},
|
|
383
|
+
{
|
|
384
|
+
"id": "operations_habanero_checkpoint",
|
|
385
|
+
"domain": "operations",
|
|
386
|
+
"artifact_class": "checkpoint",
|
|
387
|
+
"pickup_score": 1.0,
|
|
388
|
+
"ambiguity_remaining": 0.0,
|
|
389
|
+
"answered_targets": 5,
|
|
390
|
+
"pickup_targets_total": 5,
|
|
391
|
+
"expected_present_fields": [
|
|
392
|
+
"artifact_refs",
|
|
393
|
+
"completed_unit",
|
|
394
|
+
"current_state",
|
|
395
|
+
"next_handoff_target",
|
|
396
|
+
"risks"
|
|
397
|
+
],
|
|
398
|
+
"answers": {
|
|
399
|
+
"completed_unit": "Restored the Habanero linked-project routing path.",
|
|
400
|
+
"current_state": "The local repo, primary session, and hosted world are aligned again.",
|
|
401
|
+
"risks": "[\"Other active machines may still carry stale routing state.\"]",
|
|
402
|
+
"next_handoff_target": "Rerun runner sync on active machines and verify live pickup.",
|
|
403
|
+
"artifact_refs": "[\".git/orp/link/project.json\",\"runner sync output\"]"
|
|
404
|
+
},
|
|
405
|
+
"invented_fields": [],
|
|
406
|
+
"invention_rate": 0.0,
|
|
407
|
+
"artifact_type_guess": "checkpoint",
|
|
408
|
+
"confidence": 0.86,
|
|
409
|
+
"ambiguities_count": 4,
|
|
410
|
+
"elapsed_ms": 70680.529,
|
|
411
|
+
"tokens_used": null,
|
|
412
|
+
"session_id": ""
|
|
413
|
+
},
|
|
414
|
+
{
|
|
415
|
+
"id": "operations_runner_policy",
|
|
416
|
+
"domain": "operations",
|
|
417
|
+
"artifact_class": "policy",
|
|
418
|
+
"pickup_score": 1.0,
|
|
419
|
+
"ambiguity_remaining": 0.0,
|
|
420
|
+
"answered_targets": 5,
|
|
421
|
+
"pickup_targets_total": 5,
|
|
422
|
+
"expected_present_fields": [
|
|
423
|
+
"enforcement_surface",
|
|
424
|
+
"invariants",
|
|
425
|
+
"rationale",
|
|
426
|
+
"rule",
|
|
427
|
+
"scope"
|
|
428
|
+
],
|
|
429
|
+
"answers": {
|
|
430
|
+
"scope": "Hosted runner job pickup.",
|
|
431
|
+
"rule": "Only claim hosted jobs for linked projects that have a routeable local session.",
|
|
432
|
+
"rationale": "The rule exists to prevent dead-end job claims.",
|
|
433
|
+
"invariants": "do not claim unroutable jobs",
|
|
434
|
+
"enforcement_surface": "Runner pickup rejects unroutable jobs."
|
|
435
|
+
},
|
|
436
|
+
"invented_fields": [],
|
|
437
|
+
"invention_rate": 0.0,
|
|
438
|
+
"artifact_type_guess": "policy",
|
|
439
|
+
"confidence": 0.79,
|
|
440
|
+
"ambiguities_count": 4,
|
|
441
|
+
"elapsed_ms": 70217.317,
|
|
442
|
+
"tokens_used": null,
|
|
443
|
+
"session_id": ""
|
|
444
|
+
},
|
|
445
|
+
{
|
|
446
|
+
"id": "writing_kernel_launch_result",
|
|
447
|
+
"domain": "writing",
|
|
448
|
+
"artifact_class": "result",
|
|
449
|
+
"pickup_score": 0.8,
|
|
450
|
+
"ambiguity_remaining": 0.2,
|
|
451
|
+
"answered_targets": 4,
|
|
452
|
+
"pickup_targets_total": 5,
|
|
453
|
+
"expected_present_fields": [
|
|
454
|
+
"claim",
|
|
455
|
+
"evidence_paths",
|
|
456
|
+
"interpretation_limits",
|
|
457
|
+
"next_follow_up",
|
|
458
|
+
"status"
|
|
459
|
+
],
|
|
460
|
+
"answers": {
|
|
461
|
+
"claim": "ORP shipped the first reasoning kernel release in the CLI.",
|
|
462
|
+
"evidence_paths": "[\"docs/ORP_REASONING_KERNEL_V0_1.md\",\"docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md\"]",
|
|
463
|
+
"status": null,
|
|
464
|
+
"interpretation_limits": "comparative superiority is still unproven",
|
|
465
|
+
"next_follow_up": "Run comparative artifact and handoff studies next."
|
|
466
|
+
},
|
|
467
|
+
"invented_fields": [],
|
|
468
|
+
"invention_rate": 0.0,
|
|
469
|
+
"artifact_type_guess": "result",
|
|
470
|
+
"confidence": 0.9,
|
|
471
|
+
"ambiguities_count": 3,
|
|
472
|
+
"elapsed_ms": 45041.995,
|
|
473
|
+
"tokens_used": null,
|
|
474
|
+
"session_id": ""
|
|
475
|
+
}
|
|
476
|
+
],
|
|
477
|
+
"mean_pickup_score": 0.781,
|
|
478
|
+
"mean_ambiguity_remaining": 0.219,
|
|
479
|
+
"mean_answered_target_rate": 0.781,
|
|
480
|
+
"mean_confidence": 0.879,
|
|
481
|
+
"mean_ambiguities_count": 3.143,
|
|
482
|
+
"mean_invention_rate": 0.0,
|
|
483
|
+
"mean_elapsed_ms": 50213.3,
|
|
484
|
+
"mean_tokens_used": null
|
|
485
|
+
},
|
|
486
|
+
"kernel": {
|
|
487
|
+
"condition": "kernel",
|
|
488
|
+
"cases_total": 7,
|
|
489
|
+
"rows": [
|
|
490
|
+
{
|
|
491
|
+
"id": "software_trace_widget",
|
|
492
|
+
"domain": "software",
|
|
493
|
+
"artifact_class": "task",
|
|
494
|
+
"pickup_score": 1.0,
|
|
495
|
+
"ambiguity_remaining": 0.0,
|
|
496
|
+
"answered_targets": 5,
|
|
497
|
+
"pickup_targets_total": 5,
|
|
498
|
+
"expected_present_fields": [
|
|
499
|
+
"boundary",
|
|
500
|
+
"constraints",
|
|
501
|
+
"goal",
|
|
502
|
+
"object",
|
|
503
|
+
"success_criteria"
|
|
504
|
+
],
|
|
505
|
+
"answers": {
|
|
506
|
+
"object": "terminal trace widget",
|
|
507
|
+
"goal": "surface lane drift and state clearly for operators",
|
|
508
|
+
"boundary": "[\"terminal-first lane visibility\",\"active ORP sessions only\"]",
|
|
509
|
+
"constraints": "[\"low friction\",\"no GUI dependency\"]",
|
|
510
|
+
"success_criteria": "[\"an operator can identify a drifting lane within 10 seconds\",\"the widget does not overload the terminal surface\"]"
|
|
511
|
+
},
|
|
512
|
+
"invented_fields": [],
|
|
513
|
+
"invention_rate": 0.0,
|
|
514
|
+
"artifact_type_guess": "task",
|
|
515
|
+
"confidence": 0.99,
|
|
516
|
+
"ambiguities_count": 0,
|
|
517
|
+
"elapsed_ms": 33120.879,
|
|
518
|
+
"tokens_used": null,
|
|
519
|
+
"session_id": ""
|
|
520
|
+
},
|
|
521
|
+
{
|
|
522
|
+
"id": "product_project_home",
|
|
523
|
+
"domain": "product",
|
|
524
|
+
"artifact_class": "decision",
|
|
525
|
+
"pickup_score": 1.0,
|
|
526
|
+
"ambiguity_remaining": 0.0,
|
|
527
|
+
"answered_targets": 5,
|
|
528
|
+
"pickup_targets_total": 5,
|
|
529
|
+
"expected_present_fields": [
|
|
530
|
+
"chosen_path",
|
|
531
|
+
"consequences",
|
|
532
|
+
"question",
|
|
533
|
+
"rationale",
|
|
534
|
+
"rejected_alternatives"
|
|
535
|
+
],
|
|
536
|
+
"answers": {
|
|
537
|
+
"question": "Should the web app home foreground linked projects or the old idea board?",
|
|
538
|
+
"chosen_path": "Foreground linked projects and move broad idea browsing into Pensieve.",
|
|
539
|
+
"rejected_alternatives": "[\"keep the old idea board as the default home\",\"split the home evenly between ideas and projects\"]",
|
|
540
|
+
"rationale": "Active work should be reachable immediately, while the idea library can stay available as secondary navigation.",
|
|
541
|
+
"consequences": "[\"linked projects become the primary home object\",\"idea browsing becomes one click deeper\"]"
|
|
542
|
+
},
|
|
543
|
+
"invented_fields": [],
|
|
544
|
+
"invention_rate": 0.0,
|
|
545
|
+
"artifact_type_guess": "decision",
|
|
546
|
+
"confidence": 0.99,
|
|
547
|
+
"ambiguities_count": 0,
|
|
548
|
+
"elapsed_ms": 25177.396,
|
|
549
|
+
"tokens_used": null,
|
|
550
|
+
"session_id": ""
|
|
551
|
+
},
|
|
552
|
+
{
|
|
553
|
+
"id": "research_drift_hypothesis",
|
|
554
|
+
"domain": "research",
|
|
555
|
+
"artifact_class": "hypothesis",
|
|
556
|
+
"pickup_score": 1.0,
|
|
557
|
+
"ambiguity_remaining": 0.0,
|
|
558
|
+
"answered_targets": 5,
|
|
559
|
+
"pickup_targets_total": 5,
|
|
560
|
+
"expected_present_fields": [
|
|
561
|
+
"assumptions",
|
|
562
|
+
"boundary",
|
|
563
|
+
"claim",
|
|
564
|
+
"falsifiers",
|
|
565
|
+
"test_path"
|
|
566
|
+
],
|
|
567
|
+
"answers": {
|
|
568
|
+
"claim": "Short drift summaries reduce the time needed to identify stalled lanes.",
|
|
569
|
+
"boundary": "[\"terminal-first multi-lane workflows\",\"operators already monitoring active lanes\"]",
|
|
570
|
+
"assumptions": "[\"operators glance at summaries while they work\",\"summaries do not introduce excessive noise\"]",
|
|
571
|
+
"test_path": "Run matched stalled-lane pickup trials with and without summaries and compare detection time.",
|
|
572
|
+
"falsifiers": "[\"pickup time does not improve materially\",\"operators ignore the summaries\"]"
|
|
573
|
+
},
|
|
574
|
+
"invented_fields": [],
|
|
575
|
+
"invention_rate": 0.0,
|
|
576
|
+
"artifact_type_guess": "hypothesis",
|
|
577
|
+
"confidence": 0.99,
|
|
578
|
+
"ambiguities_count": 0,
|
|
579
|
+
"elapsed_ms": 14567.044,
|
|
580
|
+
"tokens_used": null,
|
|
581
|
+
"session_id": ""
|
|
582
|
+
},
|
|
583
|
+
{
|
|
584
|
+
"id": "research_handoff_experiment",
|
|
585
|
+
"domain": "research",
|
|
586
|
+
"artifact_class": "experiment",
|
|
587
|
+
"pickup_score": 1.0,
|
|
588
|
+
"ambiguity_remaining": 0.0,
|
|
589
|
+
"answered_targets": 6,
|
|
590
|
+
"pickup_targets_total": 6,
|
|
591
|
+
"expected_present_fields": [
|
|
592
|
+
"evidence_expectations",
|
|
593
|
+
"inputs",
|
|
594
|
+
"interpretation_limits",
|
|
595
|
+
"method",
|
|
596
|
+
"objective",
|
|
597
|
+
"outputs"
|
|
598
|
+
],
|
|
599
|
+
"answers": {
|
|
600
|
+
"objective": "Measure whether kernel task artifacts improve handoff pickup quality over free-form and generic checklist alternatives.",
|
|
601
|
+
"method": "Run matched handoff trials where a second operator receives one artifact at a time and explains the task, constraints, and next action.",
|
|
602
|
+
"inputs": "[\"matched prompt set\",\"second-operator reviewers\",\"three artifact conditions\"]",
|
|
603
|
+
"outputs": "[\"pickup scores\",\"clarification counts\",\"time to correct interpretation\"]",
|
|
604
|
+
"evidence_expectations": "[\"score sheets\",\"timing logs\",\"artifact corpus\"]",
|
|
605
|
+
"interpretation_limits": "[\"internal sample size is small\",\"the pilot measures structural pickup, not full downstream outcomes\"]"
|
|
606
|
+
},
|
|
607
|
+
"invented_fields": [],
|
|
608
|
+
"invention_rate": 0.0,
|
|
609
|
+
"artifact_type_guess": "experiment",
|
|
610
|
+
"confidence": 0.99,
|
|
611
|
+
"ambiguities_count": 0,
|
|
612
|
+
"elapsed_ms": 29925.466,
|
|
613
|
+
"tokens_used": null,
|
|
614
|
+
"session_id": ""
|
|
615
|
+
},
|
|
616
|
+
{
|
|
617
|
+
"id": "operations_habanero_checkpoint",
|
|
618
|
+
"domain": "operations",
|
|
619
|
+
"artifact_class": "checkpoint",
|
|
620
|
+
"pickup_score": 1.0,
|
|
621
|
+
"ambiguity_remaining": 0.0,
|
|
622
|
+
"answered_targets": 5,
|
|
623
|
+
"pickup_targets_total": 5,
|
|
624
|
+
"expected_present_fields": [
|
|
625
|
+
"artifact_refs",
|
|
626
|
+
"completed_unit",
|
|
627
|
+
"current_state",
|
|
628
|
+
"next_handoff_target",
|
|
629
|
+
"risks"
|
|
630
|
+
],
|
|
631
|
+
"answers": {
|
|
632
|
+
"completed_unit": "Restored canonical runner routing for Habanero.",
|
|
633
|
+
"current_state": "The local project link, primary session, and hosted world are synchronized and routable again.",
|
|
634
|
+
"risks": "[\"inactive machines may still hold stale routing state\",\"older queued jobs may need a fresh sync before pickup\"]",
|
|
635
|
+
"next_handoff_target": "Rerun runner sync on active machines and verify one fresh hosted job pickup.",
|
|
636
|
+
"artifact_refs": "[\".git/orp/link/project.json\",\".git/orp/link/sessions\",\"orp/artifacts\"]"
|
|
637
|
+
},
|
|
638
|
+
"invented_fields": [],
|
|
639
|
+
"invention_rate": 0.0,
|
|
640
|
+
"artifact_type_guess": "checkpoint",
|
|
641
|
+
"confidence": 0.99,
|
|
642
|
+
"ambiguities_count": 0,
|
|
643
|
+
"elapsed_ms": 22994.873,
|
|
644
|
+
"tokens_used": null,
|
|
645
|
+
"session_id": ""
|
|
646
|
+
},
|
|
647
|
+
{
|
|
648
|
+
"id": "operations_runner_policy",
|
|
649
|
+
"domain": "operations",
|
|
650
|
+
"artifact_class": "policy",
|
|
651
|
+
"pickup_score": 1.0,
|
|
652
|
+
"ambiguity_remaining": 0.0,
|
|
653
|
+
"answered_targets": 5,
|
|
654
|
+
"pickup_targets_total": 5,
|
|
655
|
+
"expected_present_fields": [
|
|
656
|
+
"enforcement_surface",
|
|
657
|
+
"invariants",
|
|
658
|
+
"rationale",
|
|
659
|
+
"rule",
|
|
660
|
+
"scope"
|
|
661
|
+
],
|
|
662
|
+
"answers": {
|
|
663
|
+
"scope": "Hosted runner job pickup and claim behavior.",
|
|
664
|
+
"rule": "Only claim a hosted job when the linked project has a routeable local session on the current machine.",
|
|
665
|
+
"rationale": "Jobs should only be claimed when the runner can execute them against a real local target.",
|
|
666
|
+
"invariants": "[\"a claimed job must resolve to an actual local session\",\"runner routing must stay machine-scoped\"]",
|
|
667
|
+
"enforcement_surface": "runner sync, poll, and work lifecycle"
|
|
668
|
+
},
|
|
669
|
+
"invented_fields": [],
|
|
670
|
+
"invention_rate": 0.0,
|
|
671
|
+
"artifact_type_guess": "policy",
|
|
672
|
+
"confidence": 0.99,
|
|
673
|
+
"ambiguities_count": 0,
|
|
674
|
+
"elapsed_ms": 16468.268,
|
|
675
|
+
"tokens_used": null,
|
|
676
|
+
"session_id": ""
|
|
677
|
+
},
|
|
678
|
+
{
|
|
679
|
+
"id": "writing_kernel_launch_result",
|
|
680
|
+
"domain": "writing",
|
|
681
|
+
"artifact_class": "result",
|
|
682
|
+
"pickup_score": 1.0,
|
|
683
|
+
"ambiguity_remaining": 0.0,
|
|
684
|
+
"answered_targets": 5,
|
|
685
|
+
"pickup_targets_total": 5,
|
|
686
|
+
"expected_present_fields": [
|
|
687
|
+
"claim",
|
|
688
|
+
"evidence_paths",
|
|
689
|
+
"interpretation_limits",
|
|
690
|
+
"next_follow_up",
|
|
691
|
+
"status"
|
|
692
|
+
],
|
|
693
|
+
"answers": {
|
|
694
|
+
"claim": "ORP shipped the first reasoning kernel release as a real CLI protocol surface.",
|
|
695
|
+
"evidence_paths": "[\"docs/ORP_REASONING_KERNEL_V0_1.md\",\"docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md\",\"spec/v1/kernel.schema.json\"]",
|
|
696
|
+
"status": "shipped in the ORP CLI and published to npm",
|
|
697
|
+
"interpretation_limits": "[\"comparative superiority over free-form and checklist alternatives is not yet proven\",\"the current evidence is strongest on internal validity\"]",
|
|
698
|
+
"next_follow_up": "Run the comparative artifact, pickup, and corpus-fit studies."
|
|
699
|
+
},
|
|
700
|
+
"invented_fields": [],
|
|
701
|
+
"invention_rate": 0.0,
|
|
702
|
+
"artifact_type_guess": "result",
|
|
703
|
+
"confidence": 0.99,
|
|
704
|
+
"ambiguities_count": 1,
|
|
705
|
+
"elapsed_ms": 31760.601,
|
|
706
|
+
"tokens_used": null,
|
|
707
|
+
"session_id": ""
|
|
708
|
+
}
|
|
709
|
+
],
|
|
710
|
+
"mean_pickup_score": 1.0,
|
|
711
|
+
"mean_ambiguity_remaining": 0.0,
|
|
712
|
+
"mean_answered_target_rate": 1.0,
|
|
713
|
+
"mean_confidence": 0.99,
|
|
714
|
+
"mean_ambiguities_count": 0.143,
|
|
715
|
+
"mean_invention_rate": 0.0,
|
|
716
|
+
"mean_elapsed_ms": 24859.218,
|
|
717
|
+
"mean_tokens_used": null
|
|
718
|
+
}
|
|
719
|
+
},
|
|
720
|
+
"pairwise": {
|
|
721
|
+
"kernel_vs_generic_checklist": {
|
|
722
|
+
"left": "kernel",
|
|
723
|
+
"right": "generic_checklist",
|
|
724
|
+
"wins": 5,
|
|
725
|
+
"ties": 2,
|
|
726
|
+
"losses": 0,
|
|
727
|
+
"mean_pickup_score_delta": 0.219,
|
|
728
|
+
"by_case": [
|
|
729
|
+
{
|
|
730
|
+
"id": "software_trace_widget",
|
|
731
|
+
"domain": "software",
|
|
732
|
+
"artifact_class": "task",
|
|
733
|
+
"left_score": 1.0,
|
|
734
|
+
"right_score": 0.8,
|
|
735
|
+
"delta": 0.2,
|
|
736
|
+
"outcome": "win"
|
|
737
|
+
},
|
|
738
|
+
{
|
|
739
|
+
"id": "product_project_home",
|
|
740
|
+
"domain": "product",
|
|
741
|
+
"artifact_class": "decision",
|
|
742
|
+
"left_score": 1.0,
|
|
743
|
+
"right_score": 0.6,
|
|
744
|
+
"delta": 0.4,
|
|
745
|
+
"outcome": "win"
|
|
746
|
+
},
|
|
747
|
+
{
|
|
748
|
+
"id": "research_drift_hypothesis",
|
|
749
|
+
"domain": "research",
|
|
750
|
+
"artifact_class": "hypothesis",
|
|
751
|
+
"left_score": 1.0,
|
|
752
|
+
"right_score": 0.6,
|
|
753
|
+
"delta": 0.4,
|
|
754
|
+
"outcome": "win"
|
|
755
|
+
},
|
|
756
|
+
{
|
|
757
|
+
"id": "research_handoff_experiment",
|
|
758
|
+
"domain": "research",
|
|
759
|
+
"artifact_class": "experiment",
|
|
760
|
+
"left_score": 1.0,
|
|
761
|
+
"right_score": 0.667,
|
|
762
|
+
"delta": 0.333,
|
|
763
|
+
"outcome": "win"
|
|
764
|
+
},
|
|
765
|
+
{
|
|
766
|
+
"id": "operations_habanero_checkpoint",
|
|
767
|
+
"domain": "operations",
|
|
768
|
+
"artifact_class": "checkpoint",
|
|
769
|
+
"left_score": 1.0,
|
|
770
|
+
"right_score": 1.0,
|
|
771
|
+
"delta": 0.0,
|
|
772
|
+
"outcome": "tie"
|
|
773
|
+
},
|
|
774
|
+
{
|
|
775
|
+
"id": "operations_runner_policy",
|
|
776
|
+
"domain": "operations",
|
|
777
|
+
"artifact_class": "policy",
|
|
778
|
+
"left_score": 1.0,
|
|
779
|
+
"right_score": 1.0,
|
|
780
|
+
"delta": 0.0,
|
|
781
|
+
"outcome": "tie"
|
|
782
|
+
},
|
|
783
|
+
{
|
|
784
|
+
"id": "writing_kernel_launch_result",
|
|
785
|
+
"domain": "writing",
|
|
786
|
+
"artifact_class": "result",
|
|
787
|
+
"left_score": 1.0,
|
|
788
|
+
"right_score": 0.8,
|
|
789
|
+
"delta": 0.2,
|
|
790
|
+
"outcome": "win"
|
|
791
|
+
}
|
|
792
|
+
]
|
|
793
|
+
},
|
|
794
|
+
"kernel_vs_freeform": {
|
|
795
|
+
"left": "kernel",
|
|
796
|
+
"right": "freeform",
|
|
797
|
+
"wins": 7,
|
|
798
|
+
"ties": 0,
|
|
799
|
+
"losses": 0,
|
|
800
|
+
"mean_pickup_score_delta": 0.276,
|
|
801
|
+
"by_case": [
|
|
802
|
+
{
|
|
803
|
+
"id": "software_trace_widget",
|
|
804
|
+
"domain": "software",
|
|
805
|
+
"artifact_class": "task",
|
|
806
|
+
"left_score": 1.0,
|
|
807
|
+
"right_score": 0.6,
|
|
808
|
+
"delta": 0.4,
|
|
809
|
+
"outcome": "win"
|
|
810
|
+
},
|
|
811
|
+
{
|
|
812
|
+
"id": "product_project_home",
|
|
813
|
+
"domain": "product",
|
|
814
|
+
"artifact_class": "decision",
|
|
815
|
+
"left_score": 1.0,
|
|
816
|
+
"right_score": 0.8,
|
|
817
|
+
"delta": 0.2,
|
|
818
|
+
"outcome": "win"
|
|
819
|
+
},
|
|
820
|
+
{
|
|
821
|
+
"id": "research_drift_hypothesis",
|
|
822
|
+
"domain": "research",
|
|
823
|
+
"artifact_class": "hypothesis",
|
|
824
|
+
"left_score": 1.0,
|
|
825
|
+
"right_score": 0.8,
|
|
826
|
+
"delta": 0.2,
|
|
827
|
+
"outcome": "win"
|
|
828
|
+
},
|
|
829
|
+
{
|
|
830
|
+
"id": "research_handoff_experiment",
|
|
831
|
+
"domain": "research",
|
|
832
|
+
"artifact_class": "experiment",
|
|
833
|
+
"left_score": 1.0,
|
|
834
|
+
"right_score": 0.667,
|
|
835
|
+
"delta": 0.333,
|
|
836
|
+
"outcome": "win"
|
|
837
|
+
},
|
|
838
|
+
{
|
|
839
|
+
"id": "operations_habanero_checkpoint",
|
|
840
|
+
"domain": "operations",
|
|
841
|
+
"artifact_class": "checkpoint",
|
|
842
|
+
"left_score": 1.0,
|
|
843
|
+
"right_score": 0.8,
|
|
844
|
+
"delta": 0.2,
|
|
845
|
+
"outcome": "win"
|
|
846
|
+
},
|
|
847
|
+
{
|
|
848
|
+
"id": "operations_runner_policy",
|
|
849
|
+
"domain": "operations",
|
|
850
|
+
"artifact_class": "policy",
|
|
851
|
+
"left_score": 1.0,
|
|
852
|
+
"right_score": 0.8,
|
|
853
|
+
"delta": 0.2,
|
|
854
|
+
"outcome": "win"
|
|
855
|
+
},
|
|
856
|
+
{
|
|
857
|
+
"id": "writing_kernel_launch_result",
|
|
858
|
+
"domain": "writing",
|
|
859
|
+
"artifact_class": "result",
|
|
860
|
+
"left_score": 1.0,
|
|
861
|
+
"right_score": 0.6,
|
|
862
|
+
"delta": 0.4,
|
|
863
|
+
"outcome": "win"
|
|
864
|
+
}
|
|
865
|
+
]
|
|
866
|
+
},
|
|
867
|
+
"generic_checklist_vs_freeform": {
|
|
868
|
+
"left": "generic_checklist",
|
|
869
|
+
"right": "freeform",
|
|
870
|
+
"wins": 4,
|
|
871
|
+
"ties": 1,
|
|
872
|
+
"losses": 2,
|
|
873
|
+
"mean_pickup_score_delta": 0.057,
|
|
874
|
+
"by_case": [
|
|
875
|
+
{
|
|
876
|
+
"id": "software_trace_widget",
|
|
877
|
+
"domain": "software",
|
|
878
|
+
"artifact_class": "task",
|
|
879
|
+
"left_score": 0.8,
|
|
880
|
+
"right_score": 0.6,
|
|
881
|
+
"delta": 0.2,
|
|
882
|
+
"outcome": "win"
|
|
883
|
+
},
|
|
884
|
+
{
|
|
885
|
+
"id": "product_project_home",
|
|
886
|
+
"domain": "product",
|
|
887
|
+
"artifact_class": "decision",
|
|
888
|
+
"left_score": 0.6,
|
|
889
|
+
"right_score": 0.8,
|
|
890
|
+
"delta": -0.2,
|
|
891
|
+
"outcome": "loss"
|
|
892
|
+
},
|
|
893
|
+
{
|
|
894
|
+
"id": "research_drift_hypothesis",
|
|
895
|
+
"domain": "research",
|
|
896
|
+
"artifact_class": "hypothesis",
|
|
897
|
+
"left_score": 0.6,
|
|
898
|
+
"right_score": 0.8,
|
|
899
|
+
"delta": -0.2,
|
|
900
|
+
"outcome": "loss"
|
|
901
|
+
},
|
|
902
|
+
{
|
|
903
|
+
"id": "research_handoff_experiment",
|
|
904
|
+
"domain": "research",
|
|
905
|
+
"artifact_class": "experiment",
|
|
906
|
+
"left_score": 0.667,
|
|
907
|
+
"right_score": 0.667,
|
|
908
|
+
"delta": 0.0,
|
|
909
|
+
"outcome": "tie"
|
|
910
|
+
},
|
|
911
|
+
{
|
|
912
|
+
"id": "operations_habanero_checkpoint",
|
|
913
|
+
"domain": "operations",
|
|
914
|
+
"artifact_class": "checkpoint",
|
|
915
|
+
"left_score": 1.0,
|
|
916
|
+
"right_score": 0.8,
|
|
917
|
+
"delta": 0.2,
|
|
918
|
+
"outcome": "win"
|
|
919
|
+
},
|
|
920
|
+
{
|
|
921
|
+
"id": "operations_runner_policy",
|
|
922
|
+
"domain": "operations",
|
|
923
|
+
"artifact_class": "policy",
|
|
924
|
+
"left_score": 1.0,
|
|
925
|
+
"right_score": 0.8,
|
|
926
|
+
"delta": 0.2,
|
|
927
|
+
"outcome": "win"
|
|
928
|
+
},
|
|
929
|
+
{
|
|
930
|
+
"id": "writing_kernel_launch_result",
|
|
931
|
+
"domain": "writing",
|
|
932
|
+
"artifact_class": "result",
|
|
933
|
+
"left_score": 0.8,
|
|
934
|
+
"right_score": 0.6,
|
|
935
|
+
"delta": 0.2,
|
|
936
|
+
"outcome": "win"
|
|
937
|
+
}
|
|
938
|
+
]
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
},
|
|
942
|
+
{
|
|
943
|
+
"run_index": 2,
|
|
944
|
+
"summary": {
|
|
945
|
+
"all_claims_pass": true,
|
|
946
|
+
"kernel_mean_pickup_score": 1.0,
|
|
947
|
+
"generic_checklist_mean_pickup_score": 0.781,
|
|
948
|
+
"freeform_mean_pickup_score": 0.695,
|
|
949
|
+
"kernel_mean_invention_rate": 0.0,
|
|
950
|
+
"generic_checklist_mean_invention_rate": 0.0,
|
|
951
|
+
"freeform_mean_invention_rate": 0.0
|
|
952
|
+
},
|
|
953
|
+
"conditions": {
|
|
954
|
+
"freeform": {
|
|
955
|
+
"condition": "freeform",
|
|
956
|
+
"cases_total": 7,
|
|
957
|
+
"rows": [
|
|
958
|
+
{
|
|
959
|
+
"id": "software_trace_widget",
|
|
960
|
+
"domain": "software",
|
|
961
|
+
"artifact_class": "task",
|
|
962
|
+
"pickup_score": 0.6,
|
|
963
|
+
"ambiguity_remaining": 0.4,
|
|
964
|
+
"answered_targets": 3,
|
|
965
|
+
"pickup_targets_total": 5,
|
|
966
|
+
"expected_present_fields": [
|
|
967
|
+
"constraints",
|
|
968
|
+
"goal",
|
|
969
|
+
"object"
|
|
970
|
+
],
|
|
971
|
+
"answers": {
|
|
972
|
+
"object": "terminal trace widget for lane monitoring.",
|
|
973
|
+
"goal": "let operators tell quickly when a lane is drifting.",
|
|
974
|
+
"boundary": null,
|
|
975
|
+
"constraints": "stay terminal-first and low friction.",
|
|
976
|
+
"success_criteria": null
|
|
977
|
+
},
|
|
978
|
+
"invented_fields": [],
|
|
979
|
+
"invention_rate": 0.0,
|
|
980
|
+
"artifact_type_guess": "feature brief",
|
|
981
|
+
"confidence": 0.95,
|
|
982
|
+
"ambiguities_count": 4,
|
|
983
|
+
"elapsed_ms": 18104.63,
|
|
984
|
+
"tokens_used": null,
|
|
985
|
+
"session_id": ""
|
|
986
|
+
},
|
|
987
|
+
{
|
|
988
|
+
"id": "product_project_home",
|
|
989
|
+
"domain": "product",
|
|
990
|
+
"artifact_class": "decision",
|
|
991
|
+
"pickup_score": 0.8,
|
|
992
|
+
"ambiguity_remaining": 0.2,
|
|
993
|
+
"answered_targets": 4,
|
|
994
|
+
"pickup_targets_total": 5,
|
|
995
|
+
"expected_present_fields": [
|
|
996
|
+
"chosen_path",
|
|
997
|
+
"consequences",
|
|
998
|
+
"question",
|
|
999
|
+
"rationale"
|
|
1000
|
+
],
|
|
1001
|
+
"answers": {
|
|
1002
|
+
"question": "should the web app lead with linked projects or the old idea board?",
|
|
1003
|
+
"chosen_path": "lead with linked projects first.",
|
|
1004
|
+
"rejected_alternatives": null,
|
|
1005
|
+
"rationale": "active work should be foregrounded and idea browsing can move into Pensieve.",
|
|
1006
|
+
"consequences": "the old idea board becomes secondary navigation."
|
|
1007
|
+
},
|
|
1008
|
+
"invented_fields": [],
|
|
1009
|
+
"invention_rate": 0.0,
|
|
1010
|
+
"artifact_type_guess": "decision note",
|
|
1011
|
+
"confidence": 0.95,
|
|
1012
|
+
"ambiguities_count": 2,
|
|
1013
|
+
"elapsed_ms": 27202.448,
|
|
1014
|
+
"tokens_used": null,
|
|
1015
|
+
"session_id": ""
|
|
1016
|
+
},
|
|
1017
|
+
{
|
|
1018
|
+
"id": "research_drift_hypothesis",
|
|
1019
|
+
"domain": "research",
|
|
1020
|
+
"artifact_class": "hypothesis",
|
|
1021
|
+
"pickup_score": 0.8,
|
|
1022
|
+
"ambiguity_remaining": 0.2,
|
|
1023
|
+
"answered_targets": 4,
|
|
1024
|
+
"pickup_targets_total": 5,
|
|
1025
|
+
"expected_present_fields": [
|
|
1026
|
+
"assumptions",
|
|
1027
|
+
"boundary",
|
|
1028
|
+
"claim",
|
|
1029
|
+
"test_path"
|
|
1030
|
+
],
|
|
1031
|
+
"answers": {
|
|
1032
|
+
"claim": "short drift summaries will help operators notice stalled lanes faster.",
|
|
1033
|
+
"boundary": "terminal-first multi-lane work.",
|
|
1034
|
+
"assumptions": "operators glance at summaries while they work.",
|
|
1035
|
+
"test_path": "compare stalled-lane pickup with and without summaries.",
|
|
1036
|
+
"falsifiers": null
|
|
1037
|
+
},
|
|
1038
|
+
"invented_fields": [],
|
|
1039
|
+
"invention_rate": 0.0,
|
|
1040
|
+
"artifact_type_guess": "hypothesis summary",
|
|
1041
|
+
"confidence": 0.94,
|
|
1042
|
+
"ambiguities_count": 2,
|
|
1043
|
+
"elapsed_ms": 17897.318,
|
|
1044
|
+
"tokens_used": null,
|
|
1045
|
+
"session_id": ""
|
|
1046
|
+
},
|
|
1047
|
+
{
|
|
1048
|
+
"id": "research_handoff_experiment",
|
|
1049
|
+
"domain": "research",
|
|
1050
|
+
"artifact_class": "experiment",
|
|
1051
|
+
"pickup_score": 0.667,
|
|
1052
|
+
"ambiguity_remaining": 0.333,
|
|
1053
|
+
"answered_targets": 4,
|
|
1054
|
+
"pickup_targets_total": 6,
|
|
1055
|
+
"expected_present_fields": [
|
|
1056
|
+
"evidence_expectations",
|
|
1057
|
+
"interpretation_limits",
|
|
1058
|
+
"method",
|
|
1059
|
+
"objective",
|
|
1060
|
+
"outputs"
|
|
1061
|
+
],
|
|
1062
|
+
"answers": {
|
|
1063
|
+
"objective": "compare free-form tasks, checklist tasks, and kernel tasks during handoff pickup",
|
|
1064
|
+
"method": "give matched task artifacts to a second operator and time correct interpretation",
|
|
1065
|
+
"inputs": null,
|
|
1066
|
+
"outputs": null,
|
|
1067
|
+
"evidence_expectations": "collect scores and clarification counts",
|
|
1068
|
+
"interpretation_limits": "internal sample only"
|
|
1069
|
+
},
|
|
1070
|
+
"invented_fields": [],
|
|
1071
|
+
"invention_rate": 0.0,
|
|
1072
|
+
"artifact_type_guess": "experiment brief",
|
|
1073
|
+
"confidence": 0.95,
|
|
1074
|
+
"ambiguities_count": 3,
|
|
1075
|
+
"elapsed_ms": 33657.578,
|
|
1076
|
+
"tokens_used": null,
|
|
1077
|
+
"session_id": ""
|
|
1078
|
+
},
|
|
1079
|
+
{
|
|
1080
|
+
"id": "operations_habanero_checkpoint",
|
|
1081
|
+
"domain": "operations",
|
|
1082
|
+
"artifact_class": "checkpoint",
|
|
1083
|
+
"pickup_score": 0.6,
|
|
1084
|
+
"ambiguity_remaining": 0.4,
|
|
1085
|
+
"answered_targets": 3,
|
|
1086
|
+
"pickup_targets_total": 5,
|
|
1087
|
+
"expected_present_fields": [
|
|
1088
|
+
"completed_unit",
|
|
1089
|
+
"current_state",
|
|
1090
|
+
"next_handoff_target",
|
|
1091
|
+
"risks"
|
|
1092
|
+
],
|
|
1093
|
+
"answers": {
|
|
1094
|
+
"completed_unit": "restored linked-project routing for Habanero",
|
|
1095
|
+
"current_state": "the repo is bound and the primary session is routable again",
|
|
1096
|
+
"risks": "other machines may still need a sync",
|
|
1097
|
+
"next_handoff_target": null,
|
|
1098
|
+
"artifact_refs": null
|
|
1099
|
+
},
|
|
1100
|
+
"invented_fields": [],
|
|
1101
|
+
"invention_rate": 0.0,
|
|
1102
|
+
"artifact_type_guess": "checkpoint note",
|
|
1103
|
+
"confidence": 0.95,
|
|
1104
|
+
"ambiguities_count": 2,
|
|
1105
|
+
"elapsed_ms": 12693.171,
|
|
1106
|
+
"tokens_used": null,
|
|
1107
|
+
"session_id": ""
|
|
1108
|
+
},
|
|
1109
|
+
{
|
|
1110
|
+
"id": "operations_runner_policy",
|
|
1111
|
+
"domain": "operations",
|
|
1112
|
+
"artifact_class": "policy",
|
|
1113
|
+
"pickup_score": 0.8,
|
|
1114
|
+
"ambiguity_remaining": 0.2,
|
|
1115
|
+
"answered_targets": 4,
|
|
1116
|
+
"pickup_targets_total": 5,
|
|
1117
|
+
"expected_present_fields": [
|
|
1118
|
+
"invariants",
|
|
1119
|
+
"rationale",
|
|
1120
|
+
"rule",
|
|
1121
|
+
"scope"
|
|
1122
|
+
],
|
|
1123
|
+
"answers": {
|
|
1124
|
+
"scope": "hosted runner pickup",
|
|
1125
|
+
"rule": "only claim hosted jobs for linked projects that have a routeable local session.",
|
|
1126
|
+
"rationale": "avoid claiming work with nowhere real to execute.",
|
|
1127
|
+
"invariants": "a claimed job must resolve to an actual local session.",
|
|
1128
|
+
"enforcement_surface": null
|
|
1129
|
+
},
|
|
1130
|
+
"invented_fields": [],
|
|
1131
|
+
"invention_rate": 0.0,
|
|
1132
|
+
"artifact_type_guess": "policy",
|
|
1133
|
+
"confidence": 0.98,
|
|
1134
|
+
"ambiguities_count": 1,
|
|
1135
|
+
"elapsed_ms": 13626.534,
|
|
1136
|
+
"tokens_used": null,
|
|
1137
|
+
"session_id": ""
|
|
1138
|
+
},
|
|
1139
|
+
{
|
|
1140
|
+
"id": "writing_kernel_launch_result",
|
|
1141
|
+
"domain": "writing",
|
|
1142
|
+
"artifact_class": "result",
|
|
1143
|
+
"pickup_score": 0.6,
|
|
1144
|
+
"ambiguity_remaining": 0.4,
|
|
1145
|
+
"answered_targets": 3,
|
|
1146
|
+
"pickup_targets_total": 5,
|
|
1147
|
+
"expected_present_fields": [
|
|
1148
|
+
"claim",
|
|
1149
|
+
"evidence_paths",
|
|
1150
|
+
"next_follow_up",
|
|
1151
|
+
"status"
|
|
1152
|
+
],
|
|
1153
|
+
"answers": {
|
|
1154
|
+
"claim": "ORP shipped the first reasoning kernel release.",
|
|
1155
|
+
"evidence_paths": null,
|
|
1156
|
+
"status": "shipped in the CLI.",
|
|
1157
|
+
"interpretation_limits": null,
|
|
1158
|
+
"next_follow_up": "run comparative studies against free-form artifacts and checklist artifacts."
|
|
1159
|
+
},
|
|
1160
|
+
"invented_fields": [],
|
|
1161
|
+
"invention_rate": 0.0,
|
|
1162
|
+
"artifact_type_guess": "kernel launch result",
|
|
1163
|
+
"confidence": 0.95,
|
|
1164
|
+
"ambiguities_count": 2,
|
|
1165
|
+
"elapsed_ms": 14835.559,
|
|
1166
|
+
"tokens_used": null,
|
|
1167
|
+
"session_id": ""
|
|
1168
|
+
}
|
|
1169
|
+
],
|
|
1170
|
+
"mean_pickup_score": 0.695,
|
|
1171
|
+
"mean_ambiguity_remaining": 0.305,
|
|
1172
|
+
"mean_answered_target_rate": 0.695,
|
|
1173
|
+
"mean_confidence": 0.953,
|
|
1174
|
+
"mean_ambiguities_count": 2.286,
|
|
1175
|
+
"mean_invention_rate": 0.0,
|
|
1176
|
+
"mean_elapsed_ms": 19716.748,
|
|
1177
|
+
"mean_tokens_used": null
|
|
1178
|
+
},
|
|
1179
|
+
"generic_checklist": {
|
|
1180
|
+
"condition": "generic_checklist",
|
|
1181
|
+
"cases_total": 7,
|
|
1182
|
+
"rows": [
|
|
1183
|
+
{
|
|
1184
|
+
"id": "software_trace_widget",
|
|
1185
|
+
"domain": "software",
|
|
1186
|
+
"artifact_class": "task",
|
|
1187
|
+
"pickup_score": 0.8,
|
|
1188
|
+
"ambiguity_remaining": 0.2,
|
|
1189
|
+
"answered_targets": 4,
|
|
1190
|
+
"pickup_targets_total": 5,
|
|
1191
|
+
"expected_present_fields": [
|
|
1192
|
+
"boundary",
|
|
1193
|
+
"constraints",
|
|
1194
|
+
"goal",
|
|
1195
|
+
"object",
|
|
1196
|
+
"success_criteria"
|
|
1197
|
+
],
|
|
1198
|
+
"answers": {
|
|
1199
|
+
"object": null,
|
|
1200
|
+
"goal": "Build the terminal trace widget for lane monitoring.",
|
|
1201
|
+
"boundary": "Terminal-first lane visibility in active ORP sessions.",
|
|
1202
|
+
"constraints": "low friction; no GUI dependency",
|
|
1203
|
+
"success_criteria": "An operator can identify a drifting lane quickly."
|
|
1204
|
+
},
|
|
1205
|
+
"invented_fields": [],
|
|
1206
|
+
"invention_rate": 0.0,
|
|
1207
|
+
"artifact_type_guess": "task",
|
|
1208
|
+
"confidence": 0.88,
|
|
1209
|
+
"ambiguities_count": 2,
|
|
1210
|
+
"elapsed_ms": 37112.903,
|
|
1211
|
+
"tokens_used": null,
|
|
1212
|
+
"session_id": ""
|
|
1213
|
+
},
|
|
1214
|
+
{
|
|
1215
|
+
"id": "product_project_home",
|
|
1216
|
+
"domain": "product",
|
|
1217
|
+
"artifact_class": "decision",
|
|
1218
|
+
"pickup_score": 0.4,
|
|
1219
|
+
"ambiguity_remaining": 0.6,
|
|
1220
|
+
"answered_targets": 2,
|
|
1221
|
+
"pickup_targets_total": 5,
|
|
1222
|
+
"expected_present_fields": [
|
|
1223
|
+
"chosen_path",
|
|
1224
|
+
"consequences",
|
|
1225
|
+
"question",
|
|
1226
|
+
"rationale",
|
|
1227
|
+
"rejected_alternatives"
|
|
1228
|
+
],
|
|
1229
|
+
"answers": {
|
|
1230
|
+
"question": null,
|
|
1231
|
+
"chosen_path": "Lead with linked projects and move broader idea browsing into Pensieve.",
|
|
1232
|
+
"rejected_alternatives": "keep the idea board as the front door.",
|
|
1233
|
+
"rationale": null,
|
|
1234
|
+
"consequences": null
|
|
1235
|
+
},
|
|
1236
|
+
"invented_fields": [],
|
|
1237
|
+
"invention_rate": 0.0,
|
|
1238
|
+
"artifact_type_guess": "decision",
|
|
1239
|
+
"confidence": 0.82,
|
|
1240
|
+
"ambiguities_count": 3,
|
|
1241
|
+
"elapsed_ms": 64092.018,
|
|
1242
|
+
"tokens_used": null,
|
|
1243
|
+
"session_id": ""
|
|
1244
|
+
},
|
|
1245
|
+
{
|
|
1246
|
+
"id": "research_drift_hypothesis",
|
|
1247
|
+
"domain": "research",
|
|
1248
|
+
"artifact_class": "hypothesis",
|
|
1249
|
+
"pickup_score": 0.8,
|
|
1250
|
+
"ambiguity_remaining": 0.2,
|
|
1251
|
+
"answered_targets": 4,
|
|
1252
|
+
"pickup_targets_total": 5,
|
|
1253
|
+
"expected_present_fields": [
|
|
1254
|
+
"assumptions",
|
|
1255
|
+
"boundary",
|
|
1256
|
+
"claim",
|
|
1257
|
+
"falsifiers",
|
|
1258
|
+
"test_path"
|
|
1259
|
+
],
|
|
1260
|
+
"answers": {
|
|
1261
|
+
"claim": "Drift summaries will improve stalled-lane pickup speed.",
|
|
1262
|
+
"boundary": "Terminal-first multi-lane workflows.",
|
|
1263
|
+
"assumptions": "Assumes operators actually consult the summary lane.",
|
|
1264
|
+
"test_path": "Add summaries and compare pickup behavior against a no-summary baseline.",
|
|
1265
|
+
"falsifiers": null
|
|
1266
|
+
},
|
|
1267
|
+
"invented_fields": [],
|
|
1268
|
+
"invention_rate": 0.0,
|
|
1269
|
+
"artifact_type_guess": "hypothesis",
|
|
1270
|
+
"confidence": 0.93,
|
|
1271
|
+
"ambiguities_count": 3,
|
|
1272
|
+
"elapsed_ms": 31522.13,
|
|
1273
|
+
"tokens_used": null,
|
|
1274
|
+
"session_id": ""
|
|
1275
|
+
},
|
|
1276
|
+
{
|
|
1277
|
+
"id": "research_handoff_experiment",
|
|
1278
|
+
"domain": "research",
|
|
1279
|
+
"artifact_class": "experiment",
|
|
1280
|
+
"pickup_score": 0.667,
|
|
1281
|
+
"ambiguity_remaining": 0.333,
|
|
1282
|
+
"answered_targets": 4,
|
|
1283
|
+
"pickup_targets_total": 6,
|
|
1284
|
+
"expected_present_fields": [
|
|
1285
|
+
"evidence_expectations",
|
|
1286
|
+
"inputs",
|
|
1287
|
+
"interpretation_limits",
|
|
1288
|
+
"method",
|
|
1289
|
+
"objective",
|
|
1290
|
+
"outputs"
|
|
1291
|
+
],
|
|
1292
|
+
"answers": {
|
|
1293
|
+
"objective": "Compare handoff pickup across free-form, checklist, and kernel task artifacts.",
|
|
1294
|
+
"method": "Give a second operator one artifact at a time and record time to correct interpretation.",
|
|
1295
|
+
"inputs": null,
|
|
1296
|
+
"outputs": null,
|
|
1297
|
+
"evidence_expectations": "pickup timings; clarification counts",
|
|
1298
|
+
"interpretation_limits": "Small sample may limit interpretation."
|
|
1299
|
+
},
|
|
1300
|
+
"invented_fields": [],
|
|
1301
|
+
"invention_rate": 0.0,
|
|
1302
|
+
"artifact_type_guess": "experiment",
|
|
1303
|
+
"confidence": 0.94,
|
|
1304
|
+
"ambiguities_count": 3,
|
|
1305
|
+
"elapsed_ms": 48029.293,
|
|
1306
|
+
"tokens_used": null,
|
|
1307
|
+
"session_id": ""
|
|
1308
|
+
},
|
|
1309
|
+
{
|
|
1310
|
+
"id": "operations_habanero_checkpoint",
|
|
1311
|
+
"domain": "operations",
|
|
1312
|
+
"artifact_class": "checkpoint",
|
|
1313
|
+
"pickup_score": 1.0,
|
|
1314
|
+
"ambiguity_remaining": 0.0,
|
|
1315
|
+
"answered_targets": 5,
|
|
1316
|
+
"pickup_targets_total": 5,
|
|
1317
|
+
"expected_present_fields": [
|
|
1318
|
+
"artifact_refs",
|
|
1319
|
+
"completed_unit",
|
|
1320
|
+
"current_state",
|
|
1321
|
+
"next_handoff_target",
|
|
1322
|
+
"risks"
|
|
1323
|
+
],
|
|
1324
|
+
"answers": {
|
|
1325
|
+
"completed_unit": "Restored the Habanero linked-project routing path.",
|
|
1326
|
+
"current_state": "The local repo, primary session, and hosted world are aligned again.",
|
|
1327
|
+
"risks": "[\"Other active machines may still carry stale routing state.\"]",
|
|
1328
|
+
"next_handoff_target": "Rerun runner sync on active machines and verify live pickup.",
|
|
1329
|
+
"artifact_refs": "[\".git/orp/link/project.json\",\"runner sync output\"]"
|
|
1330
|
+
},
|
|
1331
|
+
"invented_fields": [],
|
|
1332
|
+
"invention_rate": 0.0,
|
|
1333
|
+
"artifact_type_guess": "checkpoint",
|
|
1334
|
+
"confidence": 0.86,
|
|
1335
|
+
"ambiguities_count": 2,
|
|
1336
|
+
"elapsed_ms": 48918.208,
|
|
1337
|
+
"tokens_used": null,
|
|
1338
|
+
"session_id": ""
|
|
1339
|
+
},
|
|
1340
|
+
{
|
|
1341
|
+
"id": "operations_runner_policy",
|
|
1342
|
+
"domain": "operations",
|
|
1343
|
+
"artifact_class": "policy",
|
|
1344
|
+
"pickup_score": 1.0,
|
|
1345
|
+
"ambiguity_remaining": 0.0,
|
|
1346
|
+
"answered_targets": 5,
|
|
1347
|
+
"pickup_targets_total": 5,
|
|
1348
|
+
"expected_present_fields": [
|
|
1349
|
+
"enforcement_surface",
|
|
1350
|
+
"invariants",
|
|
1351
|
+
"rationale",
|
|
1352
|
+
"rule",
|
|
1353
|
+
"scope"
|
|
1354
|
+
],
|
|
1355
|
+
"answers": {
|
|
1356
|
+
"scope": "Hosted runner job pickup.",
|
|
1357
|
+
"rule": "Only claim hosted jobs for linked projects that have a routeable local session.",
|
|
1358
|
+
"rationale": "The rule exists to prevent dead-end job claims.",
|
|
1359
|
+
"invariants": "do not claim unroutable jobs",
|
|
1360
|
+
"enforcement_surface": "Runner pickup rejects unroutable jobs."
|
|
1361
|
+
},
|
|
1362
|
+
"invented_fields": [],
|
|
1363
|
+
"invention_rate": 0.0,
|
|
1364
|
+
"artifact_type_guess": "policy",
|
|
1365
|
+
"confidence": 0.87,
|
|
1366
|
+
"ambiguities_count": 3,
|
|
1367
|
+
"elapsed_ms": 78813.594,
|
|
1368
|
+
"tokens_used": null,
|
|
1369
|
+
"session_id": ""
|
|
1370
|
+
},
|
|
1371
|
+
{
|
|
1372
|
+
"id": "writing_kernel_launch_result",
|
|
1373
|
+
"domain": "writing",
|
|
1374
|
+
"artifact_class": "result",
|
|
1375
|
+
"pickup_score": 0.8,
|
|
1376
|
+
"ambiguity_remaining": 0.2,
|
|
1377
|
+
"answered_targets": 4,
|
|
1378
|
+
"pickup_targets_total": 5,
|
|
1379
|
+
"expected_present_fields": [
|
|
1380
|
+
"claim",
|
|
1381
|
+
"evidence_paths",
|
|
1382
|
+
"interpretation_limits",
|
|
1383
|
+
"next_follow_up",
|
|
1384
|
+
"status"
|
|
1385
|
+
],
|
|
1386
|
+
"answers": {
|
|
1387
|
+
"claim": "ORP shipped the first reasoning kernel release in the CLI.",
|
|
1388
|
+
"evidence_paths": "[\"docs/ORP_REASONING_KERNEL_V0_1.md\",\"docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md\"]",
|
|
1389
|
+
"status": null,
|
|
1390
|
+
"interpretation_limits": "comparative superiority is still unproven",
|
|
1391
|
+
"next_follow_up": "Run comparative artifact and handoff studies next."
|
|
1392
|
+
},
|
|
1393
|
+
"invented_fields": [],
|
|
1394
|
+
"invention_rate": 0.0,
|
|
1395
|
+
"artifact_type_guess": "result",
|
|
1396
|
+
"confidence": 0.88,
|
|
1397
|
+
"ambiguities_count": 3,
|
|
1398
|
+
"elapsed_ms": 42572.066,
|
|
1399
|
+
"tokens_used": null,
|
|
1400
|
+
"session_id": ""
|
|
1401
|
+
}
|
|
1402
|
+
],
|
|
1403
|
+
"mean_pickup_score": 0.781,
|
|
1404
|
+
"mean_ambiguity_remaining": 0.219,
|
|
1405
|
+
"mean_answered_target_rate": 0.781,
|
|
1406
|
+
"mean_confidence": 0.883,
|
|
1407
|
+
"mean_ambiguities_count": 2.714,
|
|
1408
|
+
"mean_invention_rate": 0.0,
|
|
1409
|
+
"mean_elapsed_ms": 50151.459,
|
|
1410
|
+
"mean_tokens_used": null
|
|
1411
|
+
},
|
|
1412
|
+
"kernel": {
|
|
1413
|
+
"condition": "kernel",
|
|
1414
|
+
"cases_total": 7,
|
|
1415
|
+
"rows": [
|
|
1416
|
+
{
|
|
1417
|
+
"id": "software_trace_widget",
|
|
1418
|
+
"domain": "software",
|
|
1419
|
+
"artifact_class": "task",
|
|
1420
|
+
"pickup_score": 1.0,
|
|
1421
|
+
"ambiguity_remaining": 0.0,
|
|
1422
|
+
"answered_targets": 5,
|
|
1423
|
+
"pickup_targets_total": 5,
|
|
1424
|
+
"expected_present_fields": [
|
|
1425
|
+
"boundary",
|
|
1426
|
+
"constraints",
|
|
1427
|
+
"goal",
|
|
1428
|
+
"object",
|
|
1429
|
+
"success_criteria"
|
|
1430
|
+
],
|
|
1431
|
+
"answers": {
|
|
1432
|
+
"object": "terminal trace widget",
|
|
1433
|
+
"goal": "surface lane drift and state clearly for operators",
|
|
1434
|
+
"boundary": "[\"terminal-first lane visibility\",\"active ORP sessions only\"]",
|
|
1435
|
+
"constraints": "[\"low friction\",\"no GUI dependency\"]",
|
|
1436
|
+
"success_criteria": "[\"an operator can identify a drifting lane within 10 seconds\",\"the widget does not overload the terminal surface\"]"
|
|
1437
|
+
},
|
|
1438
|
+
"invented_fields": [],
|
|
1439
|
+
"invention_rate": 0.0,
|
|
1440
|
+
"artifact_type_guess": "task",
|
|
1441
|
+
"confidence": 0.99,
|
|
1442
|
+
"ambiguities_count": 3,
|
|
1443
|
+
"elapsed_ms": 28558.725,
|
|
1444
|
+
"tokens_used": null,
|
|
1445
|
+
"session_id": ""
|
|
1446
|
+
},
|
|
1447
|
+
{
|
|
1448
|
+
"id": "product_project_home",
|
|
1449
|
+
"domain": "product",
|
|
1450
|
+
"artifact_class": "decision",
|
|
1451
|
+
"pickup_score": 1.0,
|
|
1452
|
+
"ambiguity_remaining": 0.0,
|
|
1453
|
+
"answered_targets": 5,
|
|
1454
|
+
"pickup_targets_total": 5,
|
|
1455
|
+
"expected_present_fields": [
|
|
1456
|
+
"chosen_path",
|
|
1457
|
+
"consequences",
|
|
1458
|
+
"question",
|
|
1459
|
+
"rationale",
|
|
1460
|
+
"rejected_alternatives"
|
|
1461
|
+
],
|
|
1462
|
+
"answers": {
|
|
1463
|
+
"question": "Should the web app home foreground linked projects or the old idea board?",
|
|
1464
|
+
"chosen_path": "Foreground linked projects and move broad idea browsing into Pensieve.",
|
|
1465
|
+
"rejected_alternatives": "[\"keep the old idea board as the default home\",\"split the home evenly between ideas and projects\"]",
|
|
1466
|
+
"rationale": "Active work should be reachable immediately, while the idea library can stay available as secondary navigation.",
|
|
1467
|
+
"consequences": "[\"linked projects become the primary home object\",\"idea browsing becomes one click deeper\"]"
|
|
1468
|
+
},
|
|
1469
|
+
"invented_fields": [],
|
|
1470
|
+
"invention_rate": 0.0,
|
|
1471
|
+
"artifact_type_guess": "decision",
|
|
1472
|
+
"confidence": 0.99,
|
|
1473
|
+
"ambiguities_count": 1,
|
|
1474
|
+
"elapsed_ms": 26493.236,
|
|
1475
|
+
"tokens_used": null,
|
|
1476
|
+
"session_id": ""
|
|
1477
|
+
},
|
|
1478
|
+
{
|
|
1479
|
+
"id": "research_drift_hypothesis",
|
|
1480
|
+
"domain": "research",
|
|
1481
|
+
"artifact_class": "hypothesis",
|
|
1482
|
+
"pickup_score": 1.0,
|
|
1483
|
+
"ambiguity_remaining": 0.0,
|
|
1484
|
+
"answered_targets": 5,
|
|
1485
|
+
"pickup_targets_total": 5,
|
|
1486
|
+
"expected_present_fields": [
|
|
1487
|
+
"assumptions",
|
|
1488
|
+
"boundary",
|
|
1489
|
+
"claim",
|
|
1490
|
+
"falsifiers",
|
|
1491
|
+
"test_path"
|
|
1492
|
+
],
|
|
1493
|
+
"answers": {
|
|
1494
|
+
"claim": "Short drift summaries reduce the time needed to identify stalled lanes.",
|
|
1495
|
+
"boundary": "[\"terminal-first multi-lane workflows\",\"operators already monitoring active lanes\"]",
|
|
1496
|
+
"assumptions": "[\"operators glance at summaries while they work\",\"summaries do not introduce excessive noise\"]",
|
|
1497
|
+
"test_path": "Run matched stalled-lane pickup trials with and without summaries and compare detection time.",
|
|
1498
|
+
"falsifiers": "[\"pickup time does not improve materially\",\"operators ignore the summaries\"]"
|
|
1499
|
+
},
|
|
1500
|
+
"invented_fields": [],
|
|
1501
|
+
"invention_rate": 0.0,
|
|
1502
|
+
"artifact_type_guess": "hypothesis",
|
|
1503
|
+
"confidence": 0.99,
|
|
1504
|
+
"ambiguities_count": 1,
|
|
1505
|
+
"elapsed_ms": 34178.77,
|
|
1506
|
+
"tokens_used": null,
|
|
1507
|
+
"session_id": ""
|
|
1508
|
+
},
|
|
1509
|
+
{
|
|
1510
|
+
"id": "research_handoff_experiment",
|
|
1511
|
+
"domain": "research",
|
|
1512
|
+
"artifact_class": "experiment",
|
|
1513
|
+
"pickup_score": 1.0,
|
|
1514
|
+
"ambiguity_remaining": 0.0,
|
|
1515
|
+
"answered_targets": 6,
|
|
1516
|
+
"pickup_targets_total": 6,
|
|
1517
|
+
"expected_present_fields": [
|
|
1518
|
+
"evidence_expectations",
|
|
1519
|
+
"inputs",
|
|
1520
|
+
"interpretation_limits",
|
|
1521
|
+
"method",
|
|
1522
|
+
"objective",
|
|
1523
|
+
"outputs"
|
|
1524
|
+
],
|
|
1525
|
+
"answers": {
|
|
1526
|
+
"objective": "Measure whether kernel task artifacts improve handoff pickup quality over free-form and generic checklist alternatives.",
|
|
1527
|
+
"method": "Run matched handoff trials where a second operator receives one artifact at a time and explains the task, constraints, and next action.",
|
|
1528
|
+
"inputs": "[\"matched prompt set\",\"second-operator reviewers\",\"three artifact conditions\"]",
|
|
1529
|
+
"outputs": "[\"pickup scores\",\"clarification counts\",\"time to correct interpretation\"]",
|
|
1530
|
+
"evidence_expectations": "[\"score sheets\",\"timing logs\",\"artifact corpus\"]",
|
|
1531
|
+
"interpretation_limits": "[\"internal sample size is small\",\"the pilot measures structural pickup, not full downstream outcomes\"]"
|
|
1532
|
+
},
|
|
1533
|
+
"invented_fields": [],
|
|
1534
|
+
"invention_rate": 0.0,
|
|
1535
|
+
"artifact_type_guess": "experiment",
|
|
1536
|
+
"confidence": 0.99,
|
|
1537
|
+
"ambiguities_count": 2,
|
|
1538
|
+
"elapsed_ms": 27351.059,
|
|
1539
|
+
"tokens_used": null,
|
|
1540
|
+
"session_id": ""
|
|
1541
|
+
},
|
|
1542
|
+
{
|
|
1543
|
+
"id": "operations_habanero_checkpoint",
|
|
1544
|
+
"domain": "operations",
|
|
1545
|
+
"artifact_class": "checkpoint",
|
|
1546
|
+
"pickup_score": 1.0,
|
|
1547
|
+
"ambiguity_remaining": 0.0,
|
|
1548
|
+
"answered_targets": 5,
|
|
1549
|
+
"pickup_targets_total": 5,
|
|
1550
|
+
"expected_present_fields": [
|
|
1551
|
+
"artifact_refs",
|
|
1552
|
+
"completed_unit",
|
|
1553
|
+
"current_state",
|
|
1554
|
+
"next_handoff_target",
|
|
1555
|
+
"risks"
|
|
1556
|
+
],
|
|
1557
|
+
"answers": {
|
|
1558
|
+
"completed_unit": "Restored canonical runner routing for Habanero.",
|
|
1559
|
+
"current_state": "The local project link, primary session, and hosted world are synchronized and routable again.",
|
|
1560
|
+
"risks": "[\"inactive machines may still hold stale routing state\",\"older queued jobs may need a fresh sync before pickup\"]",
|
|
1561
|
+
"next_handoff_target": "Rerun runner sync on active machines and verify one fresh hosted job pickup.",
|
|
1562
|
+
"artifact_refs": "[\".git/orp/link/project.json\",\".git/orp/link/sessions\",\"orp/artifacts\"]"
|
|
1563
|
+
},
|
|
1564
|
+
"invented_fields": [],
|
|
1565
|
+
"invention_rate": 0.0,
|
|
1566
|
+
"artifact_type_guess": "checkpoint",
|
|
1567
|
+
"confidence": 0.99,
|
|
1568
|
+
"ambiguities_count": 1,
|
|
1569
|
+
"elapsed_ms": 15567.294,
|
|
1570
|
+
"tokens_used": null,
|
|
1571
|
+
"session_id": ""
|
|
1572
|
+
},
|
|
1573
|
+
{
|
|
1574
|
+
"id": "operations_runner_policy",
|
|
1575
|
+
"domain": "operations",
|
|
1576
|
+
"artifact_class": "policy",
|
|
1577
|
+
"pickup_score": 1.0,
|
|
1578
|
+
"ambiguity_remaining": 0.0,
|
|
1579
|
+
"answered_targets": 5,
|
|
1580
|
+
"pickup_targets_total": 5,
|
|
1581
|
+
"expected_present_fields": [
|
|
1582
|
+
"enforcement_surface",
|
|
1583
|
+
"invariants",
|
|
1584
|
+
"rationale",
|
|
1585
|
+
"rule",
|
|
1586
|
+
"scope"
|
|
1587
|
+
],
|
|
1588
|
+
"answers": {
|
|
1589
|
+
"scope": "Hosted runner job pickup and claim behavior.",
|
|
1590
|
+
"rule": "Only claim a hosted job when the linked project has a routeable local session on the current machine.",
|
|
1591
|
+
"rationale": "Jobs should only be claimed when the runner can execute them against a real local target.",
|
|
1592
|
+
"invariants": "[\"a claimed job must resolve to an actual local session\",\"runner routing must stay machine-scoped\"]",
|
|
1593
|
+
"enforcement_surface": "runner sync, poll, and work lifecycle"
|
|
1594
|
+
},
|
|
1595
|
+
"invented_fields": [],
|
|
1596
|
+
"invention_rate": 0.0,
|
|
1597
|
+
"artifact_type_guess": "policy",
|
|
1598
|
+
"confidence": 0.99,
|
|
1599
|
+
"ambiguities_count": 1,
|
|
1600
|
+
"elapsed_ms": 34428.204,
|
|
1601
|
+
"tokens_used": null,
|
|
1602
|
+
"session_id": ""
|
|
1603
|
+
},
|
|
1604
|
+
{
|
|
1605
|
+
"id": "writing_kernel_launch_result",
|
|
1606
|
+
"domain": "writing",
|
|
1607
|
+
"artifact_class": "result",
|
|
1608
|
+
"pickup_score": 1.0,
|
|
1609
|
+
"ambiguity_remaining": 0.0,
|
|
1610
|
+
"answered_targets": 5,
|
|
1611
|
+
"pickup_targets_total": 5,
|
|
1612
|
+
"expected_present_fields": [
|
|
1613
|
+
"claim",
|
|
1614
|
+
"evidence_paths",
|
|
1615
|
+
"interpretation_limits",
|
|
1616
|
+
"next_follow_up",
|
|
1617
|
+
"status"
|
|
1618
|
+
],
|
|
1619
|
+
"answers": {
|
|
1620
|
+
"claim": "ORP shipped the first reasoning kernel release as a real CLI protocol surface.",
|
|
1621
|
+
"evidence_paths": "[\"docs/ORP_REASONING_KERNEL_V0_1.md\",\"docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md\",\"spec/v1/kernel.schema.json\"]",
|
|
1622
|
+
"status": "shipped in the ORP CLI and published to npm",
|
|
1623
|
+
"interpretation_limits": "[\"comparative superiority over free-form and checklist alternatives is not yet proven\",\"the current evidence is strongest on internal validity\"]",
|
|
1624
|
+
"next_follow_up": "Run the comparative artifact, pickup, and corpus-fit studies."
|
|
1625
|
+
},
|
|
1626
|
+
"invented_fields": [],
|
|
1627
|
+
"invention_rate": 0.0,
|
|
1628
|
+
"artifact_type_guess": "result",
|
|
1629
|
+
"confidence": 0.99,
|
|
1630
|
+
"ambiguities_count": 1,
|
|
1631
|
+
"elapsed_ms": 29763.315,
|
|
1632
|
+
"tokens_used": null,
|
|
1633
|
+
"session_id": ""
|
|
1634
|
+
}
|
|
1635
|
+
],
|
|
1636
|
+
"mean_pickup_score": 1.0,
|
|
1637
|
+
"mean_ambiguity_remaining": 0.0,
|
|
1638
|
+
"mean_answered_target_rate": 1.0,
|
|
1639
|
+
"mean_confidence": 0.99,
|
|
1640
|
+
"mean_ambiguities_count": 1.429,
|
|
1641
|
+
"mean_invention_rate": 0.0,
|
|
1642
|
+
"mean_elapsed_ms": 28048.658,
|
|
1643
|
+
"mean_tokens_used": null
|
|
1644
|
+
}
|
|
1645
|
+
},
|
|
1646
|
+
"pairwise": {
|
|
1647
|
+
"kernel_vs_generic_checklist": {
|
|
1648
|
+
"left": "kernel",
|
|
1649
|
+
"right": "generic_checklist",
|
|
1650
|
+
"wins": 5,
|
|
1651
|
+
"ties": 2,
|
|
1652
|
+
"losses": 0,
|
|
1653
|
+
"mean_pickup_score_delta": 0.219,
|
|
1654
|
+
"by_case": [
|
|
1655
|
+
{
|
|
1656
|
+
"id": "software_trace_widget",
|
|
1657
|
+
"domain": "software",
|
|
1658
|
+
"artifact_class": "task",
|
|
1659
|
+
"left_score": 1.0,
|
|
1660
|
+
"right_score": 0.8,
|
|
1661
|
+
"delta": 0.2,
|
|
1662
|
+
"outcome": "win"
|
|
1663
|
+
},
|
|
1664
|
+
{
|
|
1665
|
+
"id": "product_project_home",
|
|
1666
|
+
"domain": "product",
|
|
1667
|
+
"artifact_class": "decision",
|
|
1668
|
+
"left_score": 1.0,
|
|
1669
|
+
"right_score": 0.4,
|
|
1670
|
+
"delta": 0.6,
|
|
1671
|
+
"outcome": "win"
|
|
1672
|
+
},
|
|
1673
|
+
{
|
|
1674
|
+
"id": "research_drift_hypothesis",
|
|
1675
|
+
"domain": "research",
|
|
1676
|
+
"artifact_class": "hypothesis",
|
|
1677
|
+
"left_score": 1.0,
|
|
1678
|
+
"right_score": 0.8,
|
|
1679
|
+
"delta": 0.2,
|
|
1680
|
+
"outcome": "win"
|
|
1681
|
+
},
|
|
1682
|
+
{
|
|
1683
|
+
"id": "research_handoff_experiment",
|
|
1684
|
+
"domain": "research",
|
|
1685
|
+
"artifact_class": "experiment",
|
|
1686
|
+
"left_score": 1.0,
|
|
1687
|
+
"right_score": 0.667,
|
|
1688
|
+
"delta": 0.333,
|
|
1689
|
+
"outcome": "win"
|
|
1690
|
+
},
|
|
1691
|
+
{
|
|
1692
|
+
"id": "operations_habanero_checkpoint",
|
|
1693
|
+
"domain": "operations",
|
|
1694
|
+
"artifact_class": "checkpoint",
|
|
1695
|
+
"left_score": 1.0,
|
|
1696
|
+
"right_score": 1.0,
|
|
1697
|
+
"delta": 0.0,
|
|
1698
|
+
"outcome": "tie"
|
|
1699
|
+
},
|
|
1700
|
+
{
|
|
1701
|
+
"id": "operations_runner_policy",
|
|
1702
|
+
"domain": "operations",
|
|
1703
|
+
"artifact_class": "policy",
|
|
1704
|
+
"left_score": 1.0,
|
|
1705
|
+
"right_score": 1.0,
|
|
1706
|
+
"delta": 0.0,
|
|
1707
|
+
"outcome": "tie"
|
|
1708
|
+
},
|
|
1709
|
+
{
|
|
1710
|
+
"id": "writing_kernel_launch_result",
|
|
1711
|
+
"domain": "writing",
|
|
1712
|
+
"artifact_class": "result",
|
|
1713
|
+
"left_score": 1.0,
|
|
1714
|
+
"right_score": 0.8,
|
|
1715
|
+
"delta": 0.2,
|
|
1716
|
+
"outcome": "win"
|
|
1717
|
+
}
|
|
1718
|
+
]
|
|
1719
|
+
},
|
|
1720
|
+
"kernel_vs_freeform": {
|
|
1721
|
+
"left": "kernel",
|
|
1722
|
+
"right": "freeform",
|
|
1723
|
+
"wins": 7,
|
|
1724
|
+
"ties": 0,
|
|
1725
|
+
"losses": 0,
|
|
1726
|
+
"mean_pickup_score_delta": 0.305,
|
|
1727
|
+
"by_case": [
|
|
1728
|
+
{
|
|
1729
|
+
"id": "software_trace_widget",
|
|
1730
|
+
"domain": "software",
|
|
1731
|
+
"artifact_class": "task",
|
|
1732
|
+
"left_score": 1.0,
|
|
1733
|
+
"right_score": 0.6,
|
|
1734
|
+
"delta": 0.4,
|
|
1735
|
+
"outcome": "win"
|
|
1736
|
+
},
|
|
1737
|
+
{
|
|
1738
|
+
"id": "product_project_home",
|
|
1739
|
+
"domain": "product",
|
|
1740
|
+
"artifact_class": "decision",
|
|
1741
|
+
"left_score": 1.0,
|
|
1742
|
+
"right_score": 0.8,
|
|
1743
|
+
"delta": 0.2,
|
|
1744
|
+
"outcome": "win"
|
|
1745
|
+
},
|
|
1746
|
+
{
|
|
1747
|
+
"id": "research_drift_hypothesis",
|
|
1748
|
+
"domain": "research",
|
|
1749
|
+
"artifact_class": "hypothesis",
|
|
1750
|
+
"left_score": 1.0,
|
|
1751
|
+
"right_score": 0.8,
|
|
1752
|
+
"delta": 0.2,
|
|
1753
|
+
"outcome": "win"
|
|
1754
|
+
},
|
|
1755
|
+
{
|
|
1756
|
+
"id": "research_handoff_experiment",
|
|
1757
|
+
"domain": "research",
|
|
1758
|
+
"artifact_class": "experiment",
|
|
1759
|
+
"left_score": 1.0,
|
|
1760
|
+
"right_score": 0.667,
|
|
1761
|
+
"delta": 0.333,
|
|
1762
|
+
"outcome": "win"
|
|
1763
|
+
},
|
|
1764
|
+
{
|
|
1765
|
+
"id": "operations_habanero_checkpoint",
|
|
1766
|
+
"domain": "operations",
|
|
1767
|
+
"artifact_class": "checkpoint",
|
|
1768
|
+
"left_score": 1.0,
|
|
1769
|
+
"right_score": 0.6,
|
|
1770
|
+
"delta": 0.4,
|
|
1771
|
+
"outcome": "win"
|
|
1772
|
+
},
|
|
1773
|
+
{
|
|
1774
|
+
"id": "operations_runner_policy",
|
|
1775
|
+
"domain": "operations",
|
|
1776
|
+
"artifact_class": "policy",
|
|
1777
|
+
"left_score": 1.0,
|
|
1778
|
+
"right_score": 0.8,
|
|
1779
|
+
"delta": 0.2,
|
|
1780
|
+
"outcome": "win"
|
|
1781
|
+
},
|
|
1782
|
+
{
|
|
1783
|
+
"id": "writing_kernel_launch_result",
|
|
1784
|
+
"domain": "writing",
|
|
1785
|
+
"artifact_class": "result",
|
|
1786
|
+
"left_score": 1.0,
|
|
1787
|
+
"right_score": 0.6,
|
|
1788
|
+
"delta": 0.4,
|
|
1789
|
+
"outcome": "win"
|
|
1790
|
+
}
|
|
1791
|
+
]
|
|
1792
|
+
},
|
|
1793
|
+
"generic_checklist_vs_freeform": {
|
|
1794
|
+
"left": "generic_checklist",
|
|
1795
|
+
"right": "freeform",
|
|
1796
|
+
"wins": 4,
|
|
1797
|
+
"ties": 2,
|
|
1798
|
+
"losses": 1,
|
|
1799
|
+
"mean_pickup_score_delta": 0.086,
|
|
1800
|
+
"by_case": [
|
|
1801
|
+
{
|
|
1802
|
+
"id": "software_trace_widget",
|
|
1803
|
+
"domain": "software",
|
|
1804
|
+
"artifact_class": "task",
|
|
1805
|
+
"left_score": 0.8,
|
|
1806
|
+
"right_score": 0.6,
|
|
1807
|
+
"delta": 0.2,
|
|
1808
|
+
"outcome": "win"
|
|
1809
|
+
},
|
|
1810
|
+
{
|
|
1811
|
+
"id": "product_project_home",
|
|
1812
|
+
"domain": "product",
|
|
1813
|
+
"artifact_class": "decision",
|
|
1814
|
+
"left_score": 0.4,
|
|
1815
|
+
"right_score": 0.8,
|
|
1816
|
+
"delta": -0.4,
|
|
1817
|
+
"outcome": "loss"
|
|
1818
|
+
},
|
|
1819
|
+
{
|
|
1820
|
+
"id": "research_drift_hypothesis",
|
|
1821
|
+
"domain": "research",
|
|
1822
|
+
"artifact_class": "hypothesis",
|
|
1823
|
+
"left_score": 0.8,
|
|
1824
|
+
"right_score": 0.8,
|
|
1825
|
+
"delta": 0.0,
|
|
1826
|
+
"outcome": "tie"
|
|
1827
|
+
},
|
|
1828
|
+
{
|
|
1829
|
+
"id": "research_handoff_experiment",
|
|
1830
|
+
"domain": "research",
|
|
1831
|
+
"artifact_class": "experiment",
|
|
1832
|
+
"left_score": 0.667,
|
|
1833
|
+
"right_score": 0.667,
|
|
1834
|
+
"delta": 0.0,
|
|
1835
|
+
"outcome": "tie"
|
|
1836
|
+
},
|
|
1837
|
+
{
|
|
1838
|
+
"id": "operations_habanero_checkpoint",
|
|
1839
|
+
"domain": "operations",
|
|
1840
|
+
"artifact_class": "checkpoint",
|
|
1841
|
+
"left_score": 1.0,
|
|
1842
|
+
"right_score": 0.6,
|
|
1843
|
+
"delta": 0.4,
|
|
1844
|
+
"outcome": "win"
|
|
1845
|
+
},
|
|
1846
|
+
{
|
|
1847
|
+
"id": "operations_runner_policy",
|
|
1848
|
+
"domain": "operations",
|
|
1849
|
+
"artifact_class": "policy",
|
|
1850
|
+
"left_score": 1.0,
|
|
1851
|
+
"right_score": 0.8,
|
|
1852
|
+
"delta": 0.2,
|
|
1853
|
+
"outcome": "win"
|
|
1854
|
+
},
|
|
1855
|
+
{
|
|
1856
|
+
"id": "writing_kernel_launch_result",
|
|
1857
|
+
"domain": "writing",
|
|
1858
|
+
"artifact_class": "result",
|
|
1859
|
+
"left_score": 0.8,
|
|
1860
|
+
"right_score": 0.6,
|
|
1861
|
+
"delta": 0.2,
|
|
1862
|
+
"outcome": "win"
|
|
1863
|
+
}
|
|
1864
|
+
]
|
|
1865
|
+
}
|
|
1866
|
+
}
|
|
1867
|
+
}
|
|
1868
|
+
],
|
|
1869
|
+
"conditions": {
|
|
1870
|
+
"freeform": {
|
|
1871
|
+
"mean_pickup_score": 0.71,
|
|
1872
|
+
"pickup_score_stdev": 0.015,
|
|
1873
|
+
"mean_invention_rate": 0.0,
|
|
1874
|
+
"invention_rate_stdev": 0.0,
|
|
1875
|
+
"mean_confidence": 0.954,
|
|
1876
|
+
"confidence_stdev": 0.001,
|
|
1877
|
+
"mean_elapsed_ms": 18910.852,
|
|
1878
|
+
"elapsed_ms_stdev": 805.896
|
|
1879
|
+
},
|
|
1880
|
+
"generic_checklist": {
|
|
1881
|
+
"mean_pickup_score": 0.781,
|
|
1882
|
+
"pickup_score_stdev": 0.0,
|
|
1883
|
+
"mean_invention_rate": 0.0,
|
|
1884
|
+
"invention_rate_stdev": 0.0,
|
|
1885
|
+
"mean_confidence": 0.881,
|
|
1886
|
+
"confidence_stdev": 0.002,
|
|
1887
|
+
"mean_elapsed_ms": 50182.38,
|
|
1888
|
+
"elapsed_ms_stdev": 30.921
|
|
1889
|
+
},
|
|
1890
|
+
"kernel": {
|
|
1891
|
+
"mean_pickup_score": 1.0,
|
|
1892
|
+
"pickup_score_stdev": 0.0,
|
|
1893
|
+
"mean_invention_rate": 0.0,
|
|
1894
|
+
"invention_rate_stdev": 0.0,
|
|
1895
|
+
"mean_confidence": 0.99,
|
|
1896
|
+
"confidence_stdev": 0.0,
|
|
1897
|
+
"mean_elapsed_ms": 26453.938,
|
|
1898
|
+
"elapsed_ms_stdev": 1594.72
|
|
1899
|
+
}
|
|
1900
|
+
},
|
|
1901
|
+
"claims": [
|
|
1902
|
+
{
|
|
1903
|
+
"id": "kernel_stays_above_generic_checklist_across_replication",
|
|
1904
|
+
"claim": "Across repeated live Codex runs, kernel mean pickup stays at or above generic checklist mean pickup, and above it on the aggregated sample.",
|
|
1905
|
+
"status": "pass"
|
|
1906
|
+
},
|
|
1907
|
+
{
|
|
1908
|
+
"id": "kernel_stays_above_freeform_across_replication",
|
|
1909
|
+
"claim": "Across repeated live Codex runs, kernel mean pickup stays above free-form mean pickup.",
|
|
1910
|
+
"status": "pass"
|
|
1911
|
+
},
|
|
1912
|
+
{
|
|
1913
|
+
"id": "kernel_keeps_lowest_or_equal_invention_rate_across_replication",
|
|
1914
|
+
"claim": "Across repeated live Codex runs, kernel mean invention rate stays at or below the other conditions.",
|
|
1915
|
+
"status": "pass"
|
|
1916
|
+
}
|
|
1917
|
+
],
|
|
1918
|
+
"summary": {
|
|
1919
|
+
"all_claims_pass": true,
|
|
1920
|
+
"kernel_mean_pickup_score": 1.0,
|
|
1921
|
+
"generic_checklist_mean_pickup_score": 0.781,
|
|
1922
|
+
"freeform_mean_pickup_score": 0.71,
|
|
1923
|
+
"kernel_mean_invention_rate": 0.0,
|
|
1924
|
+
"generic_checklist_mean_invention_rate": 0.0,
|
|
1925
|
+
"freeform_mean_invention_rate": 0.0
|
|
1926
|
+
}
|
|
1927
|
+
}
|