open-research-protocol 0.4.7 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +15 -0
  2. package/cli/orp.py +1158 -43
  3. package/docs/AGENT_LOOP.md +3 -0
  4. package/docs/ORP_REASONING_KERNEL_AGENT_PILOT.md +125 -0
  5. package/docs/ORP_REASONING_KERNEL_AGENT_REPLICATION.md +97 -0
  6. package/docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md +100 -0
  7. package/docs/ORP_REASONING_KERNEL_COMPARISON_PILOT.md +116 -0
  8. package/docs/ORP_REASONING_KERNEL_CONTINUATION_PILOT.md +86 -0
  9. package/docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md +261 -0
  10. package/docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md +131 -0
  11. package/docs/ORP_REASONING_KERNEL_EVOLUTION.md +123 -0
  12. package/docs/ORP_REASONING_KERNEL_PICKUP_PILOT.md +107 -0
  13. package/docs/ORP_REASONING_KERNEL_TECHNICAL_VALIDATION.md +140 -22
  14. package/docs/ORP_REASONING_KERNEL_V0_1.md +11 -0
  15. package/docs/ORP_YOUTUBE_INSPECT.md +97 -0
  16. package/docs/benchmarks/orp_reasoning_kernel_agent_pilot_v0_1.json +796 -0
  17. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_task_smoke.json +487 -0
  18. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_1.json +1927 -0
  19. package/docs/benchmarks/orp_reasoning_kernel_agent_replication_v0_2.json +10217 -0
  20. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_task_smoke.json +174 -0
  21. package/docs/benchmarks/orp_reasoning_kernel_canonical_continuation_v0_1.json +598 -0
  22. package/docs/benchmarks/orp_reasoning_kernel_comparison_v0_1.json +688 -0
  23. package/docs/benchmarks/orp_reasoning_kernel_continuation_task_smoke.json +150 -0
  24. package/docs/benchmarks/orp_reasoning_kernel_continuation_v0_1.json +448 -0
  25. package/docs/benchmarks/orp_reasoning_kernel_pickup_v0_1.json +594 -0
  26. package/docs/benchmarks/orp_reasoning_kernel_v0_1_validation.json +769 -41
  27. package/examples/README.md +2 -0
  28. package/examples/kernel/comparison/comparison-corpus.json +337 -0
  29. package/examples/kernel/comparison/next-task-continuation.json +55 -0
  30. package/examples/kernel/corpus/operations/habanero-routing.checkpoint.kernel.yml +12 -0
  31. package/examples/kernel/corpus/operations/runner-routing.policy.kernel.yml +9 -0
  32. package/examples/kernel/corpus/product/project-home.decision.kernel.yml +11 -0
  33. package/examples/kernel/corpus/research/kernel-handoff.experiment.kernel.yml +16 -0
  34. package/examples/kernel/corpus/research/lane-drift.hypothesis.kernel.yml +11 -0
  35. package/examples/kernel/corpus/software/trace-widget.task.kernel.yml +13 -0
  36. package/examples/kernel/corpus/writing/kernel-launch.result.kernel.yml +12 -0
  37. package/llms.txt +3 -0
  38. package/package.json +4 -1
  39. package/scripts/orp-kernel-agent-pilot.py +673 -0
  40. package/scripts/orp-kernel-agent-replication.py +307 -0
  41. package/scripts/orp-kernel-benchmark.py +471 -2
  42. package/scripts/orp-kernel-canonical-continuation.py +381 -0
  43. package/scripts/orp-kernel-ci-check.py +138 -0
  44. package/scripts/orp-kernel-comparison.py +592 -0
  45. package/scripts/orp-kernel-continuation-pilot.py +384 -0
  46. package/scripts/orp-kernel-pickup.py +401 -0
  47. package/spec/v1/kernel-extension.schema.json +96 -0
  48. package/spec/v1/kernel-proposal.schema.json +115 -0
  49. package/spec/v1/kernel.schema.json +2 -1
  50. package/spec/v1/youtube-source.schema.json +151 -0
@@ -0,0 +1,150 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "kind": "orp_reasoning_kernel_continuation_pilot_report",
4
+ "metadata": {
5
+ "generated_at_utc": "2026-03-23T07:51:38Z",
6
+ "repo_commit": "c2f7f2a52744a00fb719d37de583da1f4ae615bd",
7
+ "repo_branch": "main",
8
+ "package_version": "0.4.7",
9
+ "python_version": "3.9.6",
10
+ "codex_version": "codex-cli 0.116.0",
11
+ "platform": "macOS-26.3-arm64-arm-64bit",
12
+ "model": "default"
13
+ },
14
+ "corpus": {
15
+ "cases_total": 1,
16
+ "domains": [
17
+ "software"
18
+ ],
19
+ "artifact_classes": [
20
+ "task"
21
+ ]
22
+ },
23
+ "conditions": {
24
+ "freeform": {
25
+ "condition": "freeform",
26
+ "cases_total": 1,
27
+ "rows": [
28
+ {
29
+ "id": "software_trace_widget",
30
+ "domain": "software",
31
+ "artifact_class": "task",
32
+ "continuation_score": 0.889,
33
+ "carry_forward_score": 1.0,
34
+ "invention_rate": 0.333,
35
+ "next_action_present": true,
36
+ "answers": {
37
+ "object": "terminal trace widget for lane monitoring",
38
+ "constraints": "stay terminal-first and low friction",
39
+ "success_criteria": "let operators tell quickly when a lane is drifting"
40
+ },
41
+ "explicitly_missing_count": 2,
42
+ "recommended_next_action": "Sketch the event inputs and a compact terminal panel layout for the lane-monitoring trace widget."
43
+ }
44
+ ],
45
+ "mean_continuation_score": 0.889,
46
+ "mean_carry_forward_score": 1.0,
47
+ "mean_invention_rate": 0.333,
48
+ "mean_confidence": 0.96,
49
+ "mean_elapsed_ms": 14495.965
50
+ },
51
+ "generic_checklist": {
52
+ "condition": "generic_checklist",
53
+ "cases_total": 1,
54
+ "rows": [
55
+ {
56
+ "id": "software_trace_widget",
57
+ "domain": "software",
58
+ "artifact_class": "task",
59
+ "continuation_score": 1.0,
60
+ "carry_forward_score": 1.0,
61
+ "invention_rate": 0.0,
62
+ "next_action_present": true,
63
+ "answers": {
64
+ "object": "Build the terminal trace widget for lane monitoring, focused on terminal-first lane visibility in active ORP sessions.",
65
+ "constraints": "low friction; no GUI dependency",
66
+ "success_criteria": "An operator can identify a drifting lane quickly."
67
+ },
68
+ "explicitly_missing_count": 6,
69
+ "recommended_next_action": "Define the event schema for lane-monitoring inputs, then implement a first terminal rendering pass centered on a compact summary row that helps operators spot drifting lanes quickly without surfacing every event."
70
+ }
71
+ ],
72
+ "mean_continuation_score": 1.0,
73
+ "mean_carry_forward_score": 1.0,
74
+ "mean_invention_rate": 0.0,
75
+ "mean_confidence": 0.9,
76
+ "mean_elapsed_ms": 12499.244
77
+ },
78
+ "kernel": {
79
+ "condition": "kernel",
80
+ "cases_total": 1,
81
+ "rows": [
82
+ {
83
+ "id": "software_trace_widget",
84
+ "domain": "software",
85
+ "artifact_class": "task",
86
+ "continuation_score": 1.0,
87
+ "carry_forward_score": 1.0,
88
+ "invention_rate": 0.0,
89
+ "next_action_present": true,
90
+ "answers": {
91
+ "object": "terminal trace widget",
92
+ "constraints": "[\"low friction\",\"no GUI dependency\"]",
93
+ "success_criteria": "[\"an operator can identify a drifting lane within 10 seconds\",\"the widget does not overload the terminal surface\"]"
94
+ },
95
+ "explicitly_missing_count": 5,
96
+ "recommended_next_action": "Locate the existing terminal trace widget surface or draft a minimal terminal-first widget spec for active ORP sessions, with explicit representations for lane drift and state that can be scanned in under 10 seconds without cluttering the terminal."
97
+ }
98
+ ],
99
+ "mean_continuation_score": 1.0,
100
+ "mean_carry_forward_score": 1.0,
101
+ "mean_invention_rate": 0.0,
102
+ "mean_confidence": 0.92,
103
+ "mean_elapsed_ms": 22262.331
104
+ }
105
+ },
106
+ "pairwise": {
107
+ "kernel_vs_generic_checklist": {
108
+ "left": "kernel",
109
+ "right": "generic_checklist",
110
+ "wins": 0,
111
+ "ties": 1,
112
+ "losses": 0,
113
+ "mean_continuation_score_delta": 0.0
114
+ },
115
+ "kernel_vs_freeform": {
116
+ "left": "kernel",
117
+ "right": "freeform",
118
+ "wins": 1,
119
+ "ties": 0,
120
+ "losses": 0,
121
+ "mean_continuation_score_delta": 0.111
122
+ }
123
+ },
124
+ "claims": [
125
+ {
126
+ "id": "kernel_outscores_generic_checklist_on_continuation",
127
+ "claim": "On the matched live continuation simulation, kernel artifacts support a downstream continuation score that meets or exceeds generic checklist artifacts without a higher invention rate.",
128
+ "status": "pass"
129
+ },
130
+ {
131
+ "id": "kernel_outscores_freeform_on_continuation",
132
+ "claim": "On the matched live continuation simulation, kernel artifacts support a stronger downstream continuation score than free-form artifacts.",
133
+ "status": "pass"
134
+ },
135
+ {
136
+ "id": "kernel_minimizes_continuation_invention",
137
+ "claim": "On the matched live continuation simulation, kernel artifacts minimize unsupported carry-forward invention.",
138
+ "status": "pass"
139
+ }
140
+ ],
141
+ "summary": {
142
+ "all_claims_pass": true,
143
+ "kernel_mean_continuation_score": 1.0,
144
+ "generic_checklist_mean_continuation_score": 1.0,
145
+ "freeform_mean_continuation_score": 0.889,
146
+ "kernel_mean_invention_rate": 0.0,
147
+ "generic_checklist_mean_invention_rate": 0.0,
148
+ "freeform_mean_invention_rate": 0.333
149
+ }
150
+ }
@@ -0,0 +1,448 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "kind": "orp_reasoning_kernel_continuation_pilot_report",
4
+ "metadata": {
5
+ "generated_at_utc": "2026-03-23T08:18:34Z",
6
+ "repo_commit": "c2f7f2a52744a00fb719d37de583da1f4ae615bd",
7
+ "repo_branch": "main",
8
+ "package_version": "0.4.7",
9
+ "python_version": "3.9.6",
10
+ "codex_version": "codex-cli 0.116.0",
11
+ "platform": "macOS-26.3-arm64-arm-64bit",
12
+ "model": "default"
13
+ },
14
+ "corpus": {
15
+ "cases_total": 7,
16
+ "domains": [
17
+ "operations",
18
+ "product",
19
+ "research",
20
+ "software",
21
+ "writing"
22
+ ],
23
+ "artifact_classes": [
24
+ "checkpoint",
25
+ "decision",
26
+ "experiment",
27
+ "hypothesis",
28
+ "policy",
29
+ "result",
30
+ "task"
31
+ ]
32
+ },
33
+ "conditions": {
34
+ "freeform": {
35
+ "condition": "freeform",
36
+ "cases_total": 7,
37
+ "rows": [
38
+ {
39
+ "id": "software_trace_widget",
40
+ "domain": "software",
41
+ "artifact_class": "task",
42
+ "continuation_score": 0.889,
43
+ "carry_forward_score": 1.0,
44
+ "invention_rate": 0.333,
45
+ "next_action_present": true,
46
+ "answers": {
47
+ "object": "terminal trace widget for lane monitoring",
48
+ "constraints": "stay terminal-first and low friction",
49
+ "success_criteria": "let operators tell quickly when a lane is drifting"
50
+ },
51
+ "explicitly_missing_count": 3,
52
+ "recommended_next_action": "Sketch the event inputs the widget should consume and draft a compact terminal panel layout that makes lane drift obvious at a glance."
53
+ },
54
+ {
55
+ "id": "product_project_home",
56
+ "domain": "product",
57
+ "artifact_class": "decision",
58
+ "continuation_score": 1.0,
59
+ "carry_forward_score": 1.0,
60
+ "invention_rate": 0.0,
61
+ "next_action_present": true,
62
+ "answers": {
63
+ "question": "should the web app lead with linked projects or the old idea board?",
64
+ "chosen_path": "lead with linked projects first.",
65
+ "consequences": "active work is foregrounded, while idea browsing moves into Pensieve and the old idea board becomes secondary navigation."
66
+ },
67
+ "explicitly_missing_count": 3,
68
+ "recommended_next_action": "Update the home surface plan/spec so linked projects are the primary entry point, and define the old idea board as secondary navigation within Pensieve."
69
+ },
70
+ {
71
+ "id": "research_drift_hypothesis",
72
+ "domain": "research",
73
+ "artifact_class": "hypothesis",
74
+ "continuation_score": 1.0,
75
+ "carry_forward_score": 1.0,
76
+ "invention_rate": 0.0,
77
+ "next_action_present": true,
78
+ "answers": {
79
+ "claim": "short drift summaries will help operators notice stalled lanes faster.",
80
+ "boundary": "terminal-first multi-lane work.",
81
+ "test_path": "compare stalled-lane pickup with and without summaries."
82
+ },
83
+ "explicitly_missing_count": 4,
84
+ "recommended_next_action": "Convert the hypothesis into a minimal experiment brief by preserving the stated scope and claim, then defining how 'stalled-lane pickup' will be measured in the with-vs-without-summaries comparison."
85
+ },
86
+ {
87
+ "id": "research_handoff_experiment",
88
+ "domain": "research",
89
+ "artifact_class": "experiment",
90
+ "continuation_score": 1.0,
91
+ "carry_forward_score": 1.0,
92
+ "invention_rate": 0.0,
93
+ "next_action_present": true,
94
+ "answers": {
95
+ "objective": "compare free-form tasks, checklist tasks, and kernel tasks during handoff pickup",
96
+ "method": "give matched task artifacts to a second operator and time correct interpretation",
97
+ "outputs": "scores and clarification counts"
98
+ },
99
+ "explicitly_missing_count": 4,
100
+ "recommended_next_action": "Define the rubric for \"correct interpretation,\" prepare the matched free-form/checklist/kernel task artifacts, and set up a simple log to time the second operator and record scores plus clarification counts."
101
+ },
102
+ {
103
+ "id": "operations_habanero_checkpoint",
104
+ "domain": "operations",
105
+ "artifact_class": "checkpoint",
106
+ "continuation_score": 1.0,
107
+ "carry_forward_score": 1.0,
108
+ "invention_rate": 0.0,
109
+ "next_action_present": true,
110
+ "answers": {
111
+ "current_state": "the repo is bound and the primary session is routable again",
112
+ "risks": "other machines may still need a sync",
113
+ "next_handoff_target": "active machines"
114
+ },
115
+ "explicitly_missing_count": 2,
116
+ "recommended_next_action": "Rerun runner sync on active machines."
117
+ },
118
+ {
119
+ "id": "operations_runner_policy",
120
+ "domain": "operations",
121
+ "artifact_class": "policy",
122
+ "continuation_score": 0.889,
123
+ "carry_forward_score": 0.667,
124
+ "invention_rate": 0.0,
125
+ "next_action_present": true,
126
+ "answers": {
127
+ "scope": "hosted runner pickup",
128
+ "rule": "only claim hosted jobs for linked projects that have a routeable local session",
129
+ "enforcement_surface": null
130
+ },
131
+ "explicitly_missing_count": 1,
132
+ "recommended_next_action": "Locate the hosted runner pickup or claim path and verify this policy is enforced before any hosted job is claimed."
133
+ },
134
+ {
135
+ "id": "writing_kernel_launch_result",
136
+ "domain": "writing",
137
+ "artifact_class": "result",
138
+ "continuation_score": 1.0,
139
+ "carry_forward_score": 1.0,
140
+ "invention_rate": 0.0,
141
+ "next_action_present": true,
142
+ "answers": {
143
+ "claim": "ORP shipped the first reasoning kernel release.",
144
+ "status": "shipped in the CLI",
145
+ "next_follow_up": "run comparative studies against free-form artifacts and checklist artifacts."
146
+ },
147
+ "explicitly_missing_count": 0,
148
+ "recommended_next_action": "Run comparative studies against free-form artifacts and checklist artifacts."
149
+ }
150
+ ],
151
+ "mean_continuation_score": 0.968,
152
+ "mean_carry_forward_score": 0.952,
153
+ "mean_invention_rate": 0.048,
154
+ "mean_confidence": 0.953,
155
+ "mean_elapsed_ms": 18133.067
156
+ },
157
+ "generic_checklist": {
158
+ "condition": "generic_checklist",
159
+ "cases_total": 7,
160
+ "rows": [
161
+ {
162
+ "id": "software_trace_widget",
163
+ "domain": "software",
164
+ "artifact_class": "task",
165
+ "continuation_score": 1.0,
166
+ "carry_forward_score": 1.0,
167
+ "invention_rate": 0.0,
168
+ "next_action_present": true,
169
+ "answers": {
170
+ "object": "Build the terminal trace widget for lane monitoring.",
171
+ "constraints": "low friction; no GUI dependency",
172
+ "success_criteria": "An operator can identify a drifting lane quickly."
173
+ },
174
+ "explicitly_missing_count": 6,
175
+ "recommended_next_action": "Define the event schema for lane-monitoring inputs, then implement a first terminal rendering pass with a compact summary row focused on quickly spotting lane drift."
176
+ },
177
+ {
178
+ "id": "product_project_home",
179
+ "domain": "product",
180
+ "artifact_class": "decision",
181
+ "continuation_score": 1.0,
182
+ "carry_forward_score": 1.0,
183
+ "invention_rate": 0.0,
184
+ "next_action_present": true,
185
+ "answers": {
186
+ "question": "What should the default logged-in dashboard home be: the linked-project home or the old idea-board-first home?",
187
+ "chosen_path": "Use the linked-project home as the default logged-in dashboard, and move broader idea browsing into Pensieve.",
188
+ "consequences": "The home experience stays calm and focused on linked projects with one-click access to active work, while idea discovery becomes less prominent on the home screen but the full idea library remains available via Pensieve."
189
+ },
190
+ "explicitly_missing_count": 3,
191
+ "recommended_next_action": "Implement the linked-project home shell as the default logged-in dashboard and add lower-right Pensieve access, then verify active work is reachable in one click and the full idea library remains accessible."
192
+ },
193
+ {
194
+ "id": "research_drift_hypothesis",
195
+ "domain": "research",
196
+ "artifact_class": "hypothesis",
197
+ "continuation_score": 0.889,
198
+ "carry_forward_score": 0.667,
199
+ "invention_rate": 0.0,
200
+ "next_action_present": true,
201
+ "answers": {
202
+ "claim": "Drift summaries will improve stalled-lane pickup speed.",
203
+ "boundary": "Terminal-first multi-lane workflows; operators cannot be flooded with noise.",
204
+ "test_path": null
205
+ },
206
+ "explicitly_missing_count": 4,
207
+ "recommended_next_action": "Design the summary-vs-no-summary comparison fixture and timing capture, with stalled-lane detection time as the primary measurement."
208
+ },
209
+ {
210
+ "id": "research_handoff_experiment",
211
+ "domain": "research",
212
+ "artifact_class": "experiment",
213
+ "continuation_score": 1.0,
214
+ "carry_forward_score": 1.0,
215
+ "invention_rate": 0.0,
216
+ "next_action_present": true,
217
+ "answers": {
218
+ "objective": "Compare handoff pickup across free-form, checklist, and kernel task artifacts for internal ORP operator handoffs.",
219
+ "method": "Use a matched prompt set, give a second operator one artifact at a time, and record time to correct interpretation and clarification counts.",
220
+ "outputs": "Pickup timings and clarification counts."
221
+ },
222
+ "explicitly_missing_count": 5,
223
+ "recommended_next_action": "Prepare the matched prompt set and reviewer score sheet for the internal ORP handoff comparison, with explicit scoring rules for when an interpretation counts as correct and how clarification counts are recorded."
224
+ },
225
+ {
226
+ "id": "operations_habanero_checkpoint",
227
+ "domain": "operations",
228
+ "artifact_class": "checkpoint",
229
+ "continuation_score": 1.0,
230
+ "carry_forward_score": 1.0,
231
+ "invention_rate": 0.0,
232
+ "next_action_present": true,
233
+ "answers": {
234
+ "current_state": "Restored the Habanero linked-project routing path; the local repo, primary session, and hosted world are aligned again, and the linked project is routable again.",
235
+ "risks": "Other active machines may still carry stale routing state.",
236
+ "next_handoff_target": "Rerun runner sync on active machines and verify live pickup."
237
+ },
238
+ "explicitly_missing_count": 3,
239
+ "recommended_next_action": "Rerun runner sync on each active machine, avoiding duplicate world bindings, then verify the linked project is picked up live."
240
+ },
241
+ {
242
+ "id": "operations_runner_policy",
243
+ "domain": "operations",
244
+ "artifact_class": "policy",
245
+ "continuation_score": 1.0,
246
+ "carry_forward_score": 1.0,
247
+ "invention_rate": 0.0,
248
+ "next_action_present": true,
249
+ "answers": {
250
+ "scope": "Hosted runner job pickup.",
251
+ "rule": "Only claim hosted jobs for linked projects that have a routeable local session.",
252
+ "enforcement_surface": "Runner pickup."
253
+ },
254
+ "explicitly_missing_count": 3,
255
+ "recommended_next_action": "Audit pickup behavior against stale-session and missing-session cases."
256
+ },
257
+ {
258
+ "id": "writing_kernel_launch_result",
259
+ "domain": "writing",
260
+ "artifact_class": "result",
261
+ "continuation_score": 1.0,
262
+ "carry_forward_score": 1.0,
263
+ "invention_rate": 0.0,
264
+ "next_action_present": true,
265
+ "answers": {
266
+ "claim": "ORP shipped the first reasoning kernel release in the CLI.",
267
+ "status": "Release shipped; validation docs published; comparative superiority remains unproven.",
268
+ "next_follow_up": "Run comparative artifact and handoff studies next."
269
+ },
270
+ "explicitly_missing_count": 3,
271
+ "recommended_next_action": "Run the comparative artifact and handoff studies next."
272
+ }
273
+ ],
274
+ "mean_continuation_score": 0.984,
275
+ "mean_carry_forward_score": 0.952,
276
+ "mean_invention_rate": 0.0,
277
+ "mean_confidence": 0.929,
278
+ "mean_elapsed_ms": 30277.013
279
+ },
280
+ "kernel": {
281
+ "condition": "kernel",
282
+ "cases_total": 7,
283
+ "rows": [
284
+ {
285
+ "id": "software_trace_widget",
286
+ "domain": "software",
287
+ "artifact_class": "task",
288
+ "continuation_score": 1.0,
289
+ "carry_forward_score": 1.0,
290
+ "invention_rate": 0.0,
291
+ "next_action_present": true,
292
+ "answers": {
293
+ "object": "terminal trace widget",
294
+ "constraints": "low friction; no GUI dependency",
295
+ "success_criteria": "an operator can identify a drifting lane within 10 seconds; the widget does not overload the terminal surface"
296
+ },
297
+ "explicitly_missing_count": 6,
298
+ "recommended_next_action": "Draft a minimal terminal-only widget spec for active ORP sessions that defines how lane drift and session state are displayed, then validate that the layout supports drift detection within 10 seconds without adding terminal clutter."
299
+ },
300
+ {
301
+ "id": "product_project_home",
302
+ "domain": "product",
303
+ "artifact_class": "decision",
304
+ "continuation_score": 1.0,
305
+ "carry_forward_score": 1.0,
306
+ "invention_rate": 0.0,
307
+ "next_action_present": true,
308
+ "answers": {
309
+ "question": "Should the web app home foreground linked projects or the old idea board?",
310
+ "chosen_path": "Foreground linked projects and move broad idea browsing into Pensieve.",
311
+ "consequences": "linked projects become the primary home object; idea browsing becomes one click deeper"
312
+ },
313
+ "explicitly_missing_count": 2,
314
+ "recommended_next_action": "Implement the information architecture decision: make linked projects the primary focus of the web app home, move broad idea browsing under Pensieve, and verify the new navigation preserves quick access to ideas one level deeper."
315
+ },
316
+ {
317
+ "id": "research_drift_hypothesis",
318
+ "domain": "research",
319
+ "artifact_class": "hypothesis",
320
+ "continuation_score": 1.0,
321
+ "carry_forward_score": 1.0,
322
+ "invention_rate": 0.0,
323
+ "next_action_present": true,
324
+ "answers": {
325
+ "claim": "Short drift summaries reduce the time needed to identify stalled lanes.",
326
+ "boundary": "[\"terminal-first multi-lane workflows\",\"operators already monitoring active lanes\"]",
327
+ "test_path": "Run matched stalled-lane pickup trials with and without summaries and compare detection time."
328
+ },
329
+ "explicitly_missing_count": 5,
330
+ "recommended_next_action": "Convert this hypothesis into a runnable experiment spec: define the exact drift summary intervention, the stalled-lane detection metric, and the success threshold, then execute matched stalled-lane pickup trials within the stated boundary."
331
+ },
332
+ {
333
+ "id": "research_handoff_experiment",
334
+ "domain": "research",
335
+ "artifact_class": "experiment",
336
+ "continuation_score": 1.0,
337
+ "carry_forward_score": 1.0,
338
+ "invention_rate": 0.0,
339
+ "next_action_present": true,
340
+ "answers": {
341
+ "objective": "Measure whether kernel task artifacts improve handoff pickup quality over free-form and generic checklist alternatives.",
342
+ "method": "Run matched handoff trials where a second operator receives one artifact at a time and explains the task, constraints, and next action.",
343
+ "outputs": "pickup scores; clarification counts; time to correct interpretation"
344
+ },
345
+ "explicitly_missing_count": 6,
346
+ "recommended_next_action": "Prepare the trial runbook and data-collection materials, then run the matched handoff trials across the three artifact conditions while capturing score sheets, clarification counts, and timing logs."
347
+ },
348
+ {
349
+ "id": "operations_habanero_checkpoint",
350
+ "domain": "operations",
351
+ "artifact_class": "checkpoint",
352
+ "continuation_score": 1.0,
353
+ "carry_forward_score": 1.0,
354
+ "invention_rate": 0.0,
355
+ "next_action_present": true,
356
+ "answers": {
357
+ "current_state": "The local project link, primary session, and hosted world are synchronized and routable again.",
358
+ "risks": "inactive machines may still hold stale routing state; older queued jobs may need a fresh sync before pickup",
359
+ "next_handoff_target": "Rerun runner sync on active machines and verify one fresh hosted job pickup."
360
+ },
361
+ "explicitly_missing_count": 3,
362
+ "recommended_next_action": "Rerun runner sync on the active machines, then verify that one fresh hosted job is picked up successfully before handing off again."
363
+ },
364
+ {
365
+ "id": "operations_runner_policy",
366
+ "domain": "operations",
367
+ "artifact_class": "policy",
368
+ "continuation_score": 1.0,
369
+ "carry_forward_score": 1.0,
370
+ "invention_rate": 0.0,
371
+ "next_action_present": true,
372
+ "answers": {
373
+ "scope": "Hosted runner job pickup and claim behavior.",
374
+ "rule": "Only claim a hosted job when the linked project has a routeable local session on the current machine.",
375
+ "enforcement_surface": "runner sync, poll, and work lifecycle"
376
+ },
377
+ "explicitly_missing_count": 5,
378
+ "recommended_next_action": "Audit the hosted runner claim path and enforce the claim gate across runner sync, poll, and work lifecycle so a hosted job is only claimed if the linked project resolves to a routeable local session on the current machine."
379
+ },
380
+ {
381
+ "id": "writing_kernel_launch_result",
382
+ "domain": "writing",
383
+ "artifact_class": "result",
384
+ "continuation_score": 1.0,
385
+ "carry_forward_score": 1.0,
386
+ "invention_rate": 0.0,
387
+ "next_action_present": true,
388
+ "answers": {
389
+ "claim": "ORP shipped the first reasoning kernel release as a real CLI protocol surface.",
390
+ "status": "shipped in the ORP CLI and published to npm",
391
+ "next_follow_up": "Run the comparative artifact, pickup, and corpus-fit studies."
392
+ },
393
+ "explicitly_missing_count": 0,
394
+ "recommended_next_action": "Run the comparative artifact, pickup, and corpus-fit studies before making stronger comparative claims, since the artifact says comparative superiority is not yet proven and current evidence is strongest on internal validity."
395
+ }
396
+ ],
397
+ "mean_continuation_score": 1.0,
398
+ "mean_carry_forward_score": 1.0,
399
+ "mean_invention_rate": 0.0,
400
+ "mean_confidence": 0.954,
401
+ "mean_elapsed_ms": 15364.95
402
+ }
403
+ },
404
+ "pairwise": {
405
+ "kernel_vs_generic_checklist": {
406
+ "left": "kernel",
407
+ "right": "generic_checklist",
408
+ "wins": 1,
409
+ "ties": 6,
410
+ "losses": 0,
411
+ "mean_continuation_score_delta": 0.016
412
+ },
413
+ "kernel_vs_freeform": {
414
+ "left": "kernel",
415
+ "right": "freeform",
416
+ "wins": 2,
417
+ "ties": 5,
418
+ "losses": 0,
419
+ "mean_continuation_score_delta": 0.032
420
+ }
421
+ },
422
+ "claims": [
423
+ {
424
+ "id": "kernel_outscores_generic_checklist_on_continuation",
425
+ "claim": "On the matched live continuation simulation, kernel artifacts support a downstream continuation score that meets or exceeds generic checklist artifacts without a higher invention rate.",
426
+ "status": "pass"
427
+ },
428
+ {
429
+ "id": "kernel_outscores_freeform_on_continuation",
430
+ "claim": "On the matched live continuation simulation, kernel artifacts support a stronger downstream continuation score than free-form artifacts.",
431
+ "status": "pass"
432
+ },
433
+ {
434
+ "id": "kernel_minimizes_continuation_invention",
435
+ "claim": "On the matched live continuation simulation, kernel artifacts minimize unsupported carry-forward invention.",
436
+ "status": "pass"
437
+ }
438
+ ],
439
+ "summary": {
440
+ "all_claims_pass": true,
441
+ "kernel_mean_continuation_score": 1.0,
442
+ "generic_checklist_mean_continuation_score": 0.984,
443
+ "freeform_mean_continuation_score": 0.968,
444
+ "kernel_mean_invention_rate": 0.0,
445
+ "generic_checklist_mean_invention_rate": 0.0,
446
+ "freeform_mean_invention_rate": 0.048
447
+ }
448
+ }