coordpy-ai 0.5.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. coordpy_ai-0.5.16/ARCHITECTURE.md +1443 -0
  2. coordpy_ai-0.5.16/CHANGELOG.md +4013 -0
  3. coordpy_ai-0.5.16/CITATION.cff +21 -0
  4. coordpy_ai-0.5.16/CONTRIBUTING.md +212 -0
  5. coordpy_ai-0.5.16/LICENSE +21 -0
  6. coordpy_ai-0.5.16/MANIFEST.in +39 -0
  7. coordpy_ai-0.5.16/PKG-INFO +258 -0
  8. coordpy_ai-0.5.16/README.md +200 -0
  9. coordpy_ai-0.5.16/RELEASING.md +206 -0
  10. coordpy_ai-0.5.16/SECURITY.md +38 -0
  11. coordpy_ai-0.5.16/coordpy/__init__.py +1560 -0
  12. coordpy_ai-0.5.16/coordpy/__main__.py +8 -0
  13. coordpy_ai-0.5.16/coordpy/_cli.py +408 -0
  14. coordpy_ai-0.5.16/coordpy/_internal/__init__.py +9 -0
  15. coordpy_ai-0.5.16/coordpy/_internal/core/__init__.py +4 -0
  16. coordpy_ai-0.5.16/coordpy/_internal/core/adaptive_sub.py +524 -0
  17. coordpy_ai-0.5.16/coordpy/_internal/core/agent_keys.py +154 -0
  18. coordpy_ai-0.5.16/coordpy/_internal/core/code_harness.py +171 -0
  19. coordpy_ai-0.5.16/coordpy/_internal/core/dynamic_comm.py +992 -0
  20. coordpy_ai-0.5.16/coordpy/_internal/core/extractor_noise.py +682 -0
  21. coordpy_ai-0.5.16/coordpy/_internal/core/llm_client.py +106 -0
  22. coordpy_ai-0.5.16/coordpy/_internal/core/role_handoff.py +643 -0
  23. coordpy_ai-0.5.16/coordpy/_internal/core/task_board.py +119 -0
  24. coordpy_ai-0.5.16/coordpy/_internal/experiments/__init__.py +4 -0
  25. coordpy_ai-0.5.16/coordpy/_internal/experiments/phase44_public_readiness.py +381 -0
  26. coordpy_ai-0.5.16/coordpy/_internal/product/__init__.py +19 -0
  27. coordpy_ai-0.5.16/coordpy/_internal/product/__main__.py +4 -0
  28. coordpy_ai-0.5.16/coordpy/_internal/product/ci_gate.py +279 -0
  29. coordpy_ai-0.5.16/coordpy/_internal/product/import_data.py +276 -0
  30. coordpy_ai-0.5.16/coordpy/_internal/product/profiles.py +254 -0
  31. coordpy_ai-0.5.16/coordpy/_internal/product/report.py +92 -0
  32. coordpy_ai-0.5.16/coordpy/_internal/product/runner.py +904 -0
  33. coordpy_ai-0.5.16/coordpy/_internal/tasks/__init__.py +1 -0
  34. coordpy_ai-0.5.16/coordpy/_internal/tasks/code_review.py +148 -0
  35. coordpy_ai-0.5.16/coordpy/_internal/tasks/collaborative_build.py +243 -0
  36. coordpy_ai-0.5.16/coordpy/_internal/tasks/collaborative_module.py +311 -0
  37. coordpy_ai-0.5.16/coordpy/_internal/tasks/compliance_review.py +1337 -0
  38. coordpy_ai-0.5.16/coordpy/_internal/tasks/consensus.py +93 -0
  39. coordpy_ai-0.5.16/coordpy/_internal/tasks/contested_incident.py +1993 -0
  40. coordpy_ai-0.5.16/coordpy/_internal/tasks/corpus_registry.py +250 -0
  41. coordpy_ai-0.5.16/coordpy/_internal/tasks/corpus_runtime_recipes.py +323 -0
  42. coordpy_ai-0.5.16/coordpy/_internal/tasks/data/_build_swe_lite_bank.py +1975 -0
  43. coordpy_ai-0.5.16/coordpy/_internal/tasks/data/swe_lite_style_bank.jsonl +57 -0
  44. coordpy_ai-0.5.16/coordpy/_internal/tasks/data/swe_real_shape_mini.jsonl +6 -0
  45. coordpy_ai-0.5.16/coordpy/_internal/tasks/distributed_summary.py +194 -0
  46. coordpy_ai-0.5.16/coordpy/_internal/tasks/drifting_consensus.py +88 -0
  47. coordpy_ai-0.5.16/coordpy/_internal/tasks/executable_snippets.py +542 -0
  48. coordpy_ai-0.5.16/coordpy/_internal/tasks/incident_triage.py +1538 -0
  49. coordpy_ai-0.5.16/coordpy/_internal/tasks/library_v2.py +724 -0
  50. coordpy_ai-0.5.16/coordpy/_internal/tasks/llm_consensus.py +93 -0
  51. coordpy_ai-0.5.16/coordpy/_internal/tasks/long_corpus.py +241 -0
  52. coordpy_ai-0.5.16/coordpy/_internal/tasks/needle_corpus.py +504 -0
  53. coordpy_ai-0.5.16/coordpy/_internal/tasks/nested_contested_incident.py +1287 -0
  54. coordpy_ai-0.5.16/coordpy/_internal/tasks/numeric_ledger.py +622 -0
  55. coordpy_ai-0.5.16/coordpy/_internal/tasks/protocol_codesign.py +620 -0
  56. coordpy_ai-0.5.16/coordpy/_internal/tasks/protocolkit_36.py +792 -0
  57. coordpy_ai-0.5.16/coordpy/_internal/tasks/python_corpus.py +745 -0
  58. coordpy_ai-0.5.16/coordpy/_internal/tasks/quant_strategy.py +226 -0
  59. coordpy_ai-0.5.16/coordpy/_internal/tasks/security_escalation.py +1430 -0
  60. coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_bench_bridge.py +1951 -0
  61. coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_loop_harness.py +594 -0
  62. coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_patch_parser.py +873 -0
  63. coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_raw_capture.py +534 -0
  64. coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_sandbox.py +732 -0
  65. coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_semantic_taxonomy.py +652 -0
  66. coordpy_ai-0.5.16/coordpy/_internal/tasks/task_scale_swe.py +1014 -0
  67. coordpy_ai-0.5.16/coordpy/_version.py +11 -0
  68. coordpy_ai-0.5.16/coordpy/agents.py +348 -0
  69. coordpy_ai-0.5.16/coordpy/api_layers.py +376 -0
  70. coordpy_ai-0.5.16/coordpy/capsule.py +1985 -0
  71. coordpy_ai-0.5.16/coordpy/capsule_decoder.py +610 -0
  72. coordpy_ai-0.5.16/coordpy/capsule_decoder_relational.py +466 -0
  73. coordpy_ai-0.5.16/coordpy/capsule_decoder_v2.py +1221 -0
  74. coordpy_ai-0.5.16/coordpy/capsule_policy.py +585 -0
  75. coordpy_ai-0.5.16/coordpy/capsule_policy_bundle.py +595 -0
  76. coordpy_ai-0.5.16/coordpy/capsule_runtime.py +1212 -0
  77. coordpy_ai-0.5.16/coordpy/config.py +134 -0
  78. coordpy_ai-0.5.16/coordpy/extensions/__init__.py +59 -0
  79. coordpy_ai-0.5.16/coordpy/extensions/examples/__init__.py +8 -0
  80. coordpy_ai-0.5.16/coordpy/extensions/examples/jsonl_report_sink.py +64 -0
  81. coordpy_ai-0.5.16/coordpy/extensions/registry.py +86 -0
  82. coordpy_ai-0.5.16/coordpy/extensions/report_sink.py +141 -0
  83. coordpy_ai-0.5.16/coordpy/extensions/sandbox.py +121 -0
  84. coordpy_ai-0.5.16/coordpy/extensions/taskbank.py +126 -0
  85. coordpy_ai-0.5.16/coordpy/integrated_synthesis.py +1224 -0
  86. coordpy_ai-0.5.16/coordpy/lifecycle_audit.py +658 -0
  87. coordpy_ai-0.5.16/coordpy/llm_backend.py +438 -0
  88. coordpy_ai-0.5.16/coordpy/provenance.py +157 -0
  89. coordpy_ai-0.5.16/coordpy/py.typed +0 -0
  90. coordpy_ai-0.5.16/coordpy/role_invariant_synthesis.py +1198 -0
  91. coordpy_ai-0.5.16/coordpy/run.py +131 -0
  92. coordpy_ai-0.5.16/coordpy/runtime.py +860 -0
  93. coordpy_ai-0.5.16/coordpy/synthetic_llm.py +271 -0
  94. coordpy_ai-0.5.16/coordpy/team_coord.py +32022 -0
  95. coordpy_ai-0.5.16/coordpy/team_policy.py +391 -0
  96. coordpy_ai-0.5.16/coordpy_ai.egg-info/PKG-INFO +258 -0
  97. coordpy_ai-0.5.16/coordpy_ai.egg-info/SOURCES.txt +104 -0
  98. coordpy_ai-0.5.16/coordpy_ai.egg-info/dependency_links.txt +1 -0
  99. coordpy_ai-0.5.16/coordpy_ai.egg-info/entry_points.txt +5 -0
  100. coordpy_ai-0.5.16/coordpy_ai.egg-info/requires.txt +32 -0
  101. coordpy_ai-0.5.16/coordpy_ai.egg-info/top_level.txt +1 -0
  102. coordpy_ai-0.5.16/examples/agent_team.py +44 -0
  103. coordpy_ai-0.5.16/examples/build_with_coordpy.py +173 -0
  104. coordpy_ai-0.5.16/pyproject.toml +198 -0
  105. coordpy_ai-0.5.16/setup.cfg +4 -0
  106. coordpy_ai-0.5.16/tests/test_smoke_full.py +318 -0
@@ -0,0 +1,1443 @@
1
+ # Context Zero — Reference Architecture
2
+
3
+ **CoordPy** is the shipped **context-capsule runtime** produced by the
4
+ **Context Zero** research programme. Every piece of context that
5
+ crosses a role boundary, a layer boundary, or a run boundary in CoordPy
6
+ is a typed, content-addressed, lifecycle-bounded, budget-bounded,
7
+ provenance-carrying **capsule** — never a raw prompt string. As of
8
+ SDK v3.3 (April 2026), capsules drive execution at the run boundary,
9
+ inside the inner sweep loop, AND on the parser axis (PARSE_OUTCOME
10
+ capsule sealed before every PATCH_PROPOSAL — Theorem W3-39). A
11
+ runtime-checkable lifecycle audit mechanically verifies eight
12
+ invariants L-1..L-8 over every finished run (Theorem W3-40), and
13
+ deterministic-mode replay opt-in (`RunSpec(deterministic=True)`)
14
+ collapses the full capsule DAG byte-for-byte across runs of the same
15
+ logical input (Theorem W3-41). Meta-artefacts have a formally-defined
16
+ detached-witness boundary (Theorem W3-36). Substantive on-disk
17
+ artifacts are content-addressed at write time and re-verifiable at
18
+ audit time. This document is the programme's architectural
19
+ reference: it covers the full substrate (routing, exact memory,
20
+ retrieval, planner, runtime calibration, typed handoffs) and the
21
+ CoordPy product surface built on top of it, now *centred* on the
22
+ Capsule Contract and *driven* by it. For a one-pass orientation,
23
+ start with [`docs/START_HERE.md`](docs/START_HERE.md). For canonical
24
+ research status see
25
+ [`docs/RESEARCH_STATUS.md`](docs/RESEARCH_STATUS.md); for the
26
+ canonical theorem registry see
27
+ [`docs/THEOREM_REGISTRY.md`](docs/THEOREM_REGISTRY.md); for the
28
+ do-not-overstate rule book see
29
+ [`docs/HOW_NOT_TO_OVERSTATE.md`](docs/HOW_NOT_TO_OVERSTATE.md).
30
+
31
+ ## The Capsule Contract (SDK v3 centre of gravity)
32
+
33
+ CoordPy's durable top-level description is:
34
+
35
+ > **CoordPy is a context-capsule runtime.** Every inter-role,
36
+ > inter-layer, and inter-run artefact satisfies a six-invariant
37
+ > contract:
38
+ >
39
+ > **C1 Identity.** Stable content-address (SHA-256) over
40
+ > `(kind, payload, budget, parents)`.
41
+ > **C2 Typed claim.** Closed vocabulary of `CapsuleKind`.
42
+ > **C3 Lifecycle.** `PROPOSED → ADMITTED → SEALED` (+ optional
43
+ > `RETIRED`); illegal transitions are refused.
44
+ > **C4 Budget.** Explicit `CapsuleBudget` checked at admit
45
+ > time.
46
+ > **C5 Provenance.** Parents must be in the ledger; the ledger
47
+ > keeps a hash chain so any retroactive
48
+ > insert breaks `verify_chain()`.
49
+ > **C6 Frozen.** A sealed capsule's CID is fixed for all
50
+ > time.
51
+
52
+ The Phase-19 `Handle`, Phase-31 `TypedHandoff`, Phase-35
53
+ `ThreadResolution`, Phase-36 `AdaptiveEdge`, every `SweepSpec` /
54
+ sweep-cell, every `ARTIFACT` on disk, and the `RUN_REPORT` itself
55
+ are all capsule-shaped. The `CapsuleLedger` is their shared
56
+ append-only, hash-chained container. The `RUN_REPORT` capsule's CID
57
+ is the durable identifier for a CoordPy run — send someone that CID
58
+ plus `product_report.json` and they can reproduce every upstream
59
+ capsule, verify the chain end-to-end, and know the bytes haven't
60
+ drifted.
61
+
62
+ Reference implementation: `vision_mvp/coordpy/capsule.py`. Theory note:
63
+ [`docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE.md).
64
+ Contract tests: `vision_mvp/tests/test_coordpy_capsules.py`
65
+ (invariants C1..C6 individually + end-to-end).
66
+
67
+ ### Capsule-native execution (SDK v3.1)
68
+
69
+ The Capsule Contract above (C1..C6) describes *what a capsule is*.
70
+ SDK v3.1 adds the **execution-contract** layer: capsules drive
71
+ runtime, not just describe it.
72
+
73
+ ```
74
+ (sealed in flight)
75
+ start_run seal_readiness seal_sweep_spec seal_sweep_cell seal_provenance seal_run_report
76
+ │ │ │ │ │ │
77
+ ▼ ▼ ▼ ▼ ▼ ▼
78
+ PROFILE → READINESS_CHECK → SWEEP_SPEC → SWEEP_CELL → PROVENANCE → RUN_REPORT
79
+ ↑ ↑
80
+ │ │
81
+ seal_and_write_artifact seal_and_write_artifact
82
+ (sweep_result.json) (provenance.json,
83
+ readiness_verdict.json)
84
+ (every substantive artefact gets an
85
+ ARTIFACT capsule whose payload SHA-256
86
+ is verified against the on-disk file's
87
+ bytes by re-read.)
88
+ ```
89
+
90
+ A stage that fails leaves a typed entry in the runtime's *in-flight
91
+ register* that never reaches the ledger. Downstream stages refuse
92
+ to seal because the parent CID is missing (Capsule Contract C5).
93
+ The capsule layer is therefore the runtime's typed execution
94
+ contract for the run-boundary stages (W3-32, W3-35).
95
+
96
+ ### Intra-cell capsule-native + detached witness (SDK v3.2)
97
+
98
+ SDK v3.2 extends the capsule-native slice past the cell boundary.
99
+ Inside every sweep cell, each (task, strategy) parse→apply→test
100
+ transition seals two more capsules in flight:
101
+
102
+ ```
103
+ SWEEP_SPEC
104
+ ├── SWEEP_CELL_1 ··· SWEEP_CELL_n
105
+ ├── PATCH_PROPOSAL_1 (parent: SWEEP_SPEC)
106
+ │ └── TEST_VERDICT_1 (parent: PATCH_PROPOSAL_1)
107
+ ├── PATCH_PROPOSAL_2
108
+ │ └── TEST_VERDICT_2
109
+ └── ...
110
+
111
+ (post-fixed-point, secondary ledger)
112
+ RUN_REPORT
113
+ └── (cross-ref) META_MANIFEST
114
+ meta_artifacts:
115
+ product_report.json SHA
116
+ capsule_view.json SHA
117
+ product_summary.txt SHA
118
+ ```
119
+
120
+ The lifecycle ordering ``patch → verdict`` is enforced at the
121
+ type level (Theorem W3-32-extended). The meta-artefact set is
122
+ formally a *circularity slice* (Theorem W3-36 — no extension of
123
+ the primary ledger can authenticate a file whose bytes encode
124
+ the rendered view), so the META_MANIFEST sits in a *secondary*
125
+ ledger and is the one-hop trust unit beyond the primary view.
126
+ ``coordpy-capsule verify`` (v3.2) recomputes the chain from
127
+ on-disk header bytes (Theorem W3-37) and re-hashes every
128
+ ARTIFACT and meta-artefact at audit time (Theorem W3-38).
129
+
130
+ Reference implementation:
131
+ `vision_mvp/coordpy/capsule_runtime.py::CapsuleNativeRunContext`
132
+ (``seal_patch_proposal`` / ``seal_test_verdict`` /
133
+ ``seal_meta_manifest``); hooks plumbed through
134
+ `vision_mvp/tasks/swe_sandbox.py::run_swe_loop_sandboxed`. Theory
135
+ notes:
136
+ [`docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE_NATIVE.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE_NATIVE.md)
137
+ (W3-32..W3-35) and
138
+ [`docs/archive/coordpy-milestones/RESULTS_COORDPY_INTRA_CELL.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_INTRA_CELL.md)
139
+ (W3-32-extended / W3-36 / W3-37 / W3-38). Contract tests:
140
+ `vision_mvp/tests/test_coordpy_capsule_native.py` (16 tests, v3.1)
141
+ and `vision_mvp/tests/test_coordpy_capsule_native_intra_cell.py`
142
+ (16 tests, v3.2).
143
+
144
+ The post-hoc `build_report_ledger` adapter is retained for third
145
+ parties who fold finished `product_report` dicts (no runtime
146
+ context available); the two paths produce CID-equivalent ledgers
147
+ for the spine kinds (Theorem W3-34, preserved under the v3.2
148
+ intra-cell extension because intra-cell capsules are siblings of
149
+ the spine, not modifications of it).
150
+
151
+ ### Sub-intra-cell parse-outcome + lifecycle audit + determinism (SDK v3.3)
152
+
153
+ SDK v3.3 extends the discipline one further structural layer with a
154
+ PARSE_OUTCOME capsule sealed *before* every PATCH_PROPOSAL:
155
+
156
+ ```
157
+ SWEEP_SPEC
158
+ ├── SWEEP_CELL_1 ··· SWEEP_CELL_n
159
+ ├── PARSE_OUTCOME_1 (parent: SWEEP_SPEC)
160
+ │ └── PATCH_PROPOSAL_1
161
+ │ (parents: SWEEP_SPEC + PARSE_OUTCOME_1)
162
+ │ └── TEST_VERDICT_1
163
+ ├── PARSE_OUTCOME_2
164
+ │ └── PATCH_PROPOSAL_2
165
+ │ └── TEST_VERDICT_2
166
+ └── ...
167
+ ```
168
+
169
+ The parser's structured outcome — `ok` boolean, closed-vocabulary
170
+ `failure_kind`, `recovery` label, substitutions count, bounded
171
+ detail — becomes a typed witness on the capsule DAG. The parse →
172
+ patch → verdict chain is enforced at the type level (Theorem W3-39).
173
+
174
+ The **lifecycle audit** (`vision_mvp/coordpy/lifecycle_audit.py`)
175
+ mechanically verifies eight invariants L-1..L-8 over a finished
176
+ ledger:
177
+
178
+ * L-1 No orphan capsules.
179
+ * L-2 PATCH_PROPOSAL parents include SWEEP_SPEC.
180
+ * L-3 TEST_VERDICT parent is exactly one sealed PATCH_PROPOSAL.
181
+ * L-4 PARSE_OUTCOME parent is exactly SWEEP_SPEC.
182
+ * L-5 SWEEP_CELL parent is exactly SWEEP_SPEC.
183
+ * L-6 PATCH_PROPOSAL ↔ TEST_VERDICT ↔ PARSE_OUTCOME coordinates
184
+ are equinumerous.
185
+ * L-7 PATCH_PROPOSAL coordinates match its PARSE_OUTCOME parent's.
186
+ * L-8 TEST_VERDICT is sealed strictly after its PATCH_PROPOSAL.
187
+
188
+ The audit returns OK / BAD / EMPTY plus typed counterexamples. It is
189
+ runnable from a `CapsuleNativeRunContext` (in-process) or from an
190
+ on-disk `capsule_view.json` alone (forensic). Theorem W3-40 anchors
191
+ the audit's soundness.
192
+
193
+ **Deterministic-mode replay** (`RunSpec(deterministic=True)`) strips
194
+ per-run / host-local / wall-clock fields from the
195
+ PROVENANCE / READINESS_CHECK / RUN_REPORT capsule payloads and the
196
+ ARTIFACT capsule paths so two runs of the same deterministic
197
+ profile (mock mode, `in_process` / `subprocess` sandbox, frozen
198
+ JSONL) produce byte-identical full-DAG CIDs and chain head
199
+ (Theorem W3-41). On-disk product_report.json still records
200
+ wall-clock fields for forensic context — the determinism is on
201
+ the capsule graph, not on wall clock.
202
+
203
+ Reference implementation:
204
+ `vision_mvp/coordpy/capsule_runtime.py::CapsuleNativeRunContext.seal_parse_outcome`,
205
+ `vision_mvp/coordpy/lifecycle_audit.py`,
206
+ `vision_mvp/product/runner.py::_canonicalise_for_determinism`.
207
+ Theory note:
208
+ [`docs/archive/coordpy-milestones/RESULTS_COORDPY_DEEP_INTRA_CELL.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_DEEP_INTRA_CELL.md).
209
+ Contract tests: `vision_mvp/tests/test_coordpy_capsule_native_deeper.py`
210
+ (18 tests).
211
+
212
+ ### How capsules relate to the older CASR / substrate / handoff work
213
+
214
+ | Older primitive | Phase | Capsule kind it instantiates |
215
+ |--- |--- |--- |
216
+ | `context_ledger.Handle` | 19 | `HANDLE` |
217
+ | `role_handoff.TypedHandoff` | 31 | `HANDOFF` |
218
+ | `dynamic_comm.ThreadResolution` | 35 | `THREAD_RESOLUTION` |
219
+ | `adaptive_sub.AdaptiveEdge` | 36 | `ADAPTIVE_EDGE` |
220
+ | `coordpy.runtime.SweepSpec` | — | `SWEEP_SPEC` |
221
+ | per-cell sweep report (`coordpy.sweep.v2`) | — | `SWEEP_CELL` |
222
+ | `phase44_public_readiness` verdict | 44 | `READINESS_CHECK` |
223
+ | `coordpy.provenance.v1` manifest | — | `PROVENANCE` |
224
+ | on-disk `product_report.json` etc. | — | `ARTIFACT` |
225
+ | resolved profile dict | — | `PROFILE` |
226
+ | the run itself | — | `RUN_REPORT` |
227
+
228
+ The older primitives are **byte-for-byte unchanged**. The capsule
229
+ layer names the contract they already satisfied, lifts them under
230
+ one ledger, and makes that ledger the SDK's new public centre. None
231
+ of this is retrofitted cryptography: the hash-chaining that
232
+ `HandoffLog` already did (Phase 31), the content-addressing that
233
+ `MerkleDAG` / `ContextLedger.put` already did (Phase 19), and the
234
+ provenance manifest that every run already carried are the existing
235
+ evidence; SDK v3 recognises that they were instances of one shared
236
+ thing.
237
+
238
+ > **Naming.** `Context Zero` is the research programme; `CoordPy` is the first
239
+ > finished product produced by it. The original substrate proposal — **CASR**
240
+ > (Causal-Abstraction Scale-Renormalized Routing) — lives in
241
+ > `vision_mvp.core.*` as research-grade code and grounds CoordPy's O(log N)
242
+ > bounded-context claim (Theorem 3 in
243
+ > [`docs/archive/pre-coordpy-theory/PROOFS.md`](docs/archive/pre-coordpy-theory/PROOFS.md)).
244
+ > The programme's
245
+ > phase-by-phase diary lives in `vision_mvp/RESULTS_PHASE*.md`; the CoordPy SDK boundary
246
+ > lives under `vision_mvp/coordpy/` and is the stable public contract.
247
+ >
248
+ > **CoordPy SDK boundary (Slice 1 + v3 + v3.1).** The stable public
249
+ > surface is: `RunSpec` (with `capsule_native: bool = True`),
250
+ > `run`, `CoordPyConfig`, `profiles`, `report`, `ci_gate`,
251
+ > `import_data`, `build_manifest`, the capsule primitives
252
+ > (`ContextCapsule`, `CapsuleLedger`, `CapsuleView`,
253
+ > `build_report_ledger`, every `capsule_from_*` adapter),
254
+ > the capsule-native runtime symbols (`CapsuleNativeRunContext`,
255
+ > `seal_and_write_artifact`, `ContentAddressMismatch`,
256
+ > `CONSTRUCTION_IN_FLIGHT`, `CONSTRUCTION_POST_HOC`), and the schema
257
+ > constants (`coordpy.provenance.v1`, `phase45.product_report.v2`,
258
+ > `coordpy.capsule_view.v1`, `phase46.ci_verdict.v1`,
259
+ > `phase46.import_audit.v1`). See the **Stability matrix** in
260
+ > `README.md` and in `docs/context_zero_master_plan.md` for the
261
+ > durable classification of every layer (CoordPy SDK · capsule
262
+ > primitives · capsule-native runtime · core substrate · legacy
263
+ > product path · plugin/extension system · unified runtime ·
264
+ > Docker sandbox · research shards). Anything not on the SDK
265
+ > surface is research-grade or boundary/next-slice and may change
266
+ > without notice.
267
+
268
+ > **How to read the rest of this file.** The phase-by-phase
269
+ > callouts immediately below (Phases 26 → 44) are a *historical
270
+ > incremental record* of how the substrate was built up. They are
271
+ > kept verbatim for provenance; each claim is anchored to a
272
+ > `RESULTS_PHASE*.md` note and to tests. If you want the durable
273
+ > architecture, skip the phase callouts and read the layered
274
+ > substrate diagram further down (five substrate layers + render
275
+ > mode + runtime calibration + typed-handoff team layer), then § 3
276
+ > ("Architecture of the solution") in
277
+ > [`docs/context_zero_master_plan.md`](docs/context_zero_master_plan.md).
278
+ > For the CoordPy product surface specifically, see § 10 of the master
279
+ > plan and [`docs/START_HERE.md`](docs/START_HERE.md).
280
+
281
+ > **Architecture as of Phase 27: five substrate layers + a render
282
+ > mode + a snippet-scale runtime-calibration observer (Phase 26) +
283
+ > a *corpus-scale* runtime-calibration observer (Phase 27). Conservative
284
+ > intraprocedural + interprocedural semantic code analysis sits in
285
+ > the ingestion layer; the new runtime-calibration layer observes
286
+ > instrumented execution of the same code against a per-predicate
287
+ > probe set and reports the analyzer-vs-runtime divergence matrix.**
288
+ > The original CASR spec (below) covers the *routing* and *trigger*
289
+ > layers. Phases 19–21 added three more layers — *exact external
290
+ > memory*, *retrieval*, and *computation/planning* — that handle
291
+ > the content and aggregation sides of context. Phase 22 generalised
292
+ > the substrate to real Python codebases (AST-derived typed
293
+ > metadata) and introduced the **direct-exact** render path that
294
+ > bypasses the LLM when the planner has the answer. Phase 23
295
+ > validated the Phase-22 result across six real Python corpora
296
+ > (research / utility / test / CLI-framework / stdlib) with a
297
+ > reusable multi-corpus registry. Phase 24 extended the
298
+ > direct-exact guarantee from syntactic structure to conservative
299
+ > *intraprocedural* static-semantic predicates (`may_raise`,
300
+ > `is_recursive`, `may_write_global`,
301
+ > `calls_subprocess`/`filesystem`/`network`), computed from the AST
302
+ > by `core/code_semantics`; direct-exact scored 44 / 44 (100 %, σ
303
+ > = 0) on the semantic battery across six corpora. Phase 25
304
+ > extended the exact slice to conservative *interprocedural*
305
+ > semantic predicates — transitive closures over a local call graph
306
+ > plus Tarjan-SCC recursion-cycle detection — via
307
+ > `core/code_interproc`; direct-exact scored **50 / 50 (100 %, σ =
308
+ > 0)** on the Phase-25 interprocedural battery across the same six
309
+ > corpora with zero LLM calls and zero prompt chars. **Phase 26
310
+ > introduces a separate truth axis — *runtime-truth calibration* —
311
+ > via instrumented probes that observe how a function actually
312
+ > behaves when executed. The runtime layer is ADDITIVE: it does
313
+ > NOT replace the analyzer or planner; it sits alongside them as
314
+ > an observer that reports analyzer-vs-runtime divergence per
315
+ > predicate. On a 21-snippet executable corpus spanning 8 families,
316
+ > the analyzer agrees with runtime observation on 123 / 126
317
+ > (97.6 %) applicable measurements; every divergence lands on a
318
+ > Phase-24 pre-documented boundary condition. Analyzer-gold
319
+ > exactness and runtime-truth calibration are formalised as
320
+ > independent axes (Theorem P26-1); the direct-exact planner's
321
+ > 126 / 126 round-trip to the analyzer demonstrates the substrate
322
+ > guarantee is independent of analyzer calibration.**
323
+ >
324
+ > **Phase 28 extends the runtime-calibration axis along two
325
+ > orthogonal dimensions: (a) runtime calibration is run over the
326
+ > full local Phase-23 corpus set (`vision-core`, `vision-tasks`,
327
+ > `vision-tests`, `vision-experiments`) with coverage reported
328
+ > as a first-class cross-corpus variable (`ready_fraction` ranges
329
+ > from 2.9 % to 80.2 %), and (b) the analyzer's `may_raise` axis
330
+ > is split — the Phase-24 contract is preserved unchanged as
331
+ > `may_raise_explicit` (sound, FN = 0 across all four corpora),
332
+ > and a new conservative sound-over-precision predicate
333
+ > `may_raise_implicit` is added for implicit-raise propagation
334
+ > from builtin operations (soundness: FN = 1 / 116
335
+ > runtime-positives on the pooled entered slice). The substrate layer is
336
+ > unchanged — Phase 28 touches the analyzer (`code_semantics`,
337
+ > `code_interproc`), the runtime observer (`code_runtime_calibration`,
338
+ > `code_corpus_runtime`), and adds the benchmark
339
+ > `phase28_multi_corpus_runtime_calibration`. See Theorems
340
+ > P28-1..P28-4 in `RESULTS_PHASE28.md`.**
341
+ >
342
+ > **Phase 29 adds two coupled-but-independent pieces. First, a
343
+ > task-scale causal-relevance harness (`tasks/task_scale_swe` +
344
+ > `experiments/phase29_task_scale_falsifiability`) that runs the
345
+ > routing / substrate stack over a multi-role SWE-style task
346
+ > distribution and measures, per (task, role, event), whether the
347
+ > event is *causally relevant* under an analyzer-derived oracle.
348
+ > On 80 queries / 5 718 events across four corpora, the pooled
349
+ > aggregator-role causal-relevance fraction under naive broadcast
350
+ > is **4.54 %**; the substrate collapses aggregator context by
351
+ > **1 007×** at **100 %** correctness on matched tasks. This is
352
+ > the first task-scale test of the core thesis; falsifiability
353
+ > decision on the ROADMAP gate: **CONFIRMED** (Theorems P29-1 /
354
+ > P29-2 / P29-3 / P29-4). Second, a conservative method-instance
355
+ > auto-construction recipe (extends `code_corpus_runtime`):
356
+ > methods on safely-zero-arg-constructable classes (no custom
357
+ > `__init__`, or `__init__` with only self + defaulted params,
358
+ > or `@dataclass`-all-defaulted) promote to a new `ready_method`
359
+ > status; the probe constructs the instance under the Phase-26
360
+ > sandbox + Phase-27 budget tracer. Runtime `ready_fraction` on
361
+ > `vision-tests` lifts 2.9 % → 98.8 %; pooled entered slice grows
362
+ > 4.83× (306 → 1 477) with `may_raise_explicit` FN preserved at 0
363
+ > and construct-failed < 1 % (Theorem P29-5). The substrate layer
364
+ > is unchanged; Phase 29 touches `code_corpus_runtime` (method
365
+ > coverage) and adds the task-scale harness. See Theorems
366
+ > P29-1..P29-8 in `RESULTS_PHASE29.md`.**
367
+ >
368
+ > **Phase 31 adds a new substrate layer on the *team-communication*
369
+ > axis — typed, content-addressed, role-scoped handoffs between
370
+ > agents — and ships the programme's first *non-code* task-scale
371
+ > benchmark. The new module (`core/role_handoff.py`) provides
372
+ > `TypedHandoff`, `RoleSubscriptionTable`, bounded `RoleInbox`,
373
+ > hash-chained `HandoffLog`, per-(source_role, to_role,
374
+ > claim_kind) `DeliveryAccount`, and a `HandoffRouter`. The layer
375
+ > sits one level above the Phase-1/29 role-keyed Bloom routing: it
376
+ > routes by *claim kind* (e.g. `SLOW_QUERY_OBSERVED`,
377
+ > `DISK_FILL_CRITICAL`), so downstream roles can subscribe to
378
+ > load-bearing content without reading the payload. The companion
379
+ > benchmark (`tasks/incident_triage`) runs a five-role operational
380
+ > incident-triage team across five scenario kinds and four
381
+ > delivery strategies; substrate prompt size is **flat at 196
382
+ > tokens** across distractor densities k ∈ {6, 20, 60, 120}
383
+ > (event-stream 40 → 440 events), while naive collapses from 100 %
384
+ > → 20 % at k=120 under truncation. Theorems P31-1..P31-5 + two
385
+ > conjectures formalise the role-conditioned relevance
386
+ > factorisation, communication-sparsity lower bound,
387
+ > bounded-context upper bound, correctness preservation under subscription
388
+ > coverage, and a provable separation from any single-agent
389
+ > compression of the event stream (P31-5). See Theorems
390
+ > P31-1..P31-5 in `RESULTS_PHASE31.md`.**
391
+ >
392
+ > **Phase 39 adds a multi-role SWE-bench-style bridge
393
+ > *strictly above* the Phase-31 typed-handoff substrate
394
+ > and ships the first real-LLM data point on the
395
+ > Phase-38 prompt-variant pipeline:
396
+ > (a) `tasks/swe_bench_bridge` — a `SWEBenchStyleTask`
397
+ > schema that mirrors the public SWE-bench instance shape
398
+ > (`instance_id`, `repo`, `base_commit`,
399
+ > `problem_statement`, `gold_patch`, `test_source`); a
400
+ > four-instance hand-authored `MiniSWEBank` whose patches
401
+ > are line-anchored substitutions and whose hidden tests
402
+ > run in a fresh `exec` namespace (no shell, no
403
+ > subprocess, no network); a four-role team
404
+ > (`issue_reader` / `code_searcher` / `patch_generator`
405
+ > / `test_runner`) wired through the unchanged Phase-31
406
+ > `HandoffRouter`; a `SWEBenchAdapter.from_dict` shim
407
+ > documenting the schema mapping for a future
408
+ > real-SWE-bench loader. **Theorem P39-3** (substrate
409
+ > bounded-context preservation) — the patch_generator's
410
+ > prompt size is independent of `n_distractors` (842
411
+ > chars at every distractor count) while naive grows
412
+ > from 949 → 1936; **Theorem P39-4** (schema
413
+ > mappability) — the gap to public SWE-bench is
414
+ > adapter-shaped, not architectural.
415
+ > (b) `experiments/phase39_swe_bridge` — a runnable
416
+ > driver supporting `--mode mock` (deterministic oracle
417
+ > generator; sub-second) and `--mode real` (Ollama LLM
418
+ > patch generator).
419
+ > (c) `experiments/phase39_frontier_substrate` — a
420
+ > bounded cross-family sweep on Phase-31 incident triage
421
+ > across `llama3.1:8b`, `gemma2:9b`, `qwen2.5-coder:7b`.
422
+ > (d) Real-LLM data point on the Phase-38 prompt
423
+ > calibration pipeline (the existing
424
+ > `phase38_prompt_calibration --mode real` driver).
425
+ > **Theorem P39-1**: on `qwen2.5:0.5b`, four of five
426
+ > Phase-38 variants reproduce the Phase-37 default
427
+ > distribution to within ±0 calls; the bias is
428
+ > *model-shaped, not prompt-shaped* on this size class.
429
+ > **Theorem P39-2** (regime taxonomy): every
430
+ > team-shaped task admits a *communication-bounded* vs
431
+ > *transcription-bounded* decomposition; the substrate
432
+ > is the gating constraint only when the synthesis
433
+ > layer is order-preserving on the typed bundle. No
434
+ > Phase-31 through Phase-38 primitive is modified. See
435
+ > `RESULTS_PHASE39.md`.**
436
+ >
437
+ > **Phase 43 adds a semantic-failure taxonomy layer
438
+ > *strictly above* the Phase-42 parser-compliance layer,
439
+ > a public-style loader self-test, a frontier-model run
440
+ > (``qwen3.5:35b`` 36B-MoE on the ASPEN cluster), and one
441
+ > byte-safe trailing-delimiter pattern added to the Phase-42
442
+ > ``_strip_trailing_prose`` list.** Four coupled additions,
443
+ > all *strictly above* the Phase-42 layer (every Phase-42
444
+ > default preserves Phase-42 byte-for-byte):
445
+ > (a) ``vision_mvp/tasks/swe_semantic_taxonomy.py`` (NEW) —
446
+ > nine-label closed vocabulary (``SEM_OK`` / ``SEM_PARSE_FAIL``
447
+ > / ``SEM_WRONG_EDIT_SITE`` / ``SEM_RIGHT_SITE_WRONG_LOGIC``
448
+ > / ``SEM_INCOMPLETE_MULTI_HUNK`` / ``SEM_TEST_OVERFIT`` /
449
+ > ``SEM_STRUCTURAL_SEMANTIC_INERT`` / ``SEM_SYNTAX_INVALID``
450
+ > / ``SEM_NO_MATCH_RESIDUAL``) with a pure deterministic
451
+ > classifier and ``SemanticCounter`` aggregator. Sits
452
+ > strictly above the Phase-42 parser-compliance counter in
453
+ > the analysis stack.
454
+ > (b) ``vision_mvp/experiments/phase43_frontier_headroom.py``
455
+ > (NEW) — Phase-43 analysis driver. Ingests Phase-42-shape
456
+ > artifacts, re-derives per-cell semantic labels, emits
457
+ > cross-model summary JSON. Includes
458
+ > ``verify_public_style_loader`` that round-trips every
459
+ > bank instance through the loader + strict matcher under
460
+ > the oracle (57/57 saturation on the bundled bank).
461
+ > (c) ``vision_mvp/core/llm_client.py`` (EXTENDED) —
462
+ > ``LLMClient(think=…)`` threads Ollama's ``/api/generate``
463
+ > ``think`` field for Qwen3-class thinking models so their
464
+ > output budget is not consumed by internal reasoning.
465
+ > Default ``None`` preserves Phase-42 byte-for-byte.
466
+ > (d) ``vision_mvp/tasks/swe_patch_parser.py`` (one-pattern
467
+ > regression fix) — ``_PROSE_TAILS`` gains one pattern
468
+ > ``\n\s*<{2,4}\s*\Z`` that strips partial / full trailing
469
+ > delimiters (``<<``, ``<<<``, ``<<<<``). Surfaced by the
470
+ > ``qwen3.5:35b`` cluster run's unclosed_new failure shape.
471
+ > Byte-safe under Theorem P42-2.
472
+ >
473
+ > **Phase 43 theory**: Theorem P43-1 (bounded-context
474
+ > preservation on the external-validity bank — substrate
475
+ > 205.9 tokens flat across the full
476
+ > parser × matcher × distractor cross product); Theorem
477
+ > P43-2 (post-parser-recovery semantic residue is
478
+ > structurally classifiable — nine-label taxonomy is total,
479
+ > exhaustive, deterministic); Theorem P43-3 (semantic-ceiling
480
+ > separation on coder-finetuned models at N ≥ 50 —
481
+ > substrate-vs-naive gap is 0 pp on every measured
482
+ > coder-finetuned model, per-strategy failure-mix
483
+ > histograms are byte-identical, and the dominant residue
484
+ > label is ``SEM_WRONG_EDIT_SITE`` on coder-finetuned
485
+ > models vs ``SEM_SYNTAX_INVALID`` on general-purpose
486
+ > models of matched parameter class). Four conjectures
487
+ > (C43-1..C43-4). The programme's durable substrate claim
488
+ > is now unambiguous: *bounded active context per role*, not
489
+ > pass@1 lift. See ``RESULTS_PHASE43.md``.
490
+ >
491
+ > **Phase 44 adds raw-text residue capture, a refined semantic
492
+ > taxonomy (v2 classifier), and a validated public-SWE-bench-
493
+ > Lite drop-in readiness pipeline — *strictly above* the
494
+ > Phase-43 analysis layer.** Four coupled additions, all
495
+ > strictly additive (every Phase-43 default preserves
496
+ > Phase-43 byte-for-byte):
497
+ > (a) ``vision_mvp/tasks/swe_raw_capture.py`` (NEW) —
498
+ > ``RawCaptureRecord`` / ``RawCaptureStore`` with schema
499
+ > version ``phase44.v1``. Each record persists the raw LLM
500
+ > bytes + SHA-256, the ``ParseOutcome`` dict, the proposed
501
+ > substitutions, the applied substitutions after the matcher,
502
+ > the patched-source SHA-256, and the downstream verdict.
503
+ > ``make_capturing_generator`` wraps a bridge generator or a
504
+ > fresh ``llm_call`` and plumbs raw text into the store
505
+ > while preserving the Phase-42 LLM-output cache discipline.
506
+ > (b) ``vision_mvp/tasks/swe_semantic_taxonomy.py``
507
+ > (EXTENDED) — five new sub-labels
508
+ > (``SEM_RIGHT_FILE_WRONG_SPAN``, ``SEM_RIGHT_SPAN_WRONG_LOGIC``,
509
+ > ``SEM_PARTIAL_MULTI_HUNK_SUCCESS``,
510
+ > ``SEM_NARROW_FIX_TEST_OVERFIT``, ``SEM_STRUCTURAL_VALID_INERT``)
511
+ > partition the Phase-43 coarse buckets when raw bytes are
512
+ > available. ``classify_semantic_outcome_v2`` subsumes the v1
513
+ > classifier on sentinel inputs (Theorem P44-2).
514
+ > ``REFINEMENT_MAP`` is reflexive so the sentinel path
515
+ > remains a legal v2 classification.
516
+ > (c) ``vision_mvp/experiments/phase44_semantic_residue.py``
517
+ > (NEW) — sweep mode runs the Phase-42-shape experiment with
518
+ > raw capture on; analyse-only mode consumes (parent,
519
+ > capture) pairs and emits a ``phase44.summary.v1`` JSON
520
+ > with per-cell coarse + refined counters and a
521
+ > ``coarse_to_refined_partition`` audit.
522
+ > (d) ``vision_mvp/experiments/phase44_public_readiness.py``
523
+ > (NEW) — five-check CI-gate validator (schema / adapter /
524
+ > parser / matcher / test_runner) on any local JSONL.
525
+ > Emits ``{"ready": true, "n": 57, ...}`` on the bundled
526
+ > bank in ~5 s wall (Theorem P44-3).
527
+ >
528
+ > **Phase 44 theory**: Theorem P44-1 (raw capture is a
529
+ > lossless projection of pipeline state); Theorem P44-2
530
+ > (refined classifier is monotone on sentinel inputs —
531
+ > backwards-compatibility with Phase 43 is a theorem, not an
532
+ > aspiration); Theorem P44-3 (public-readiness saturates on
533
+ > the bundled bank at external-validity scale — the
534
+ > externalisation gap is now purely data-availability).
535
+ > Four conjectures (C44-1..C44-4) frame the sharper
536
+ > residue-composition questions raw capture makes
537
+ > measurable. See ``RESULTS_PHASE44.md``.
538
+ >
539
+ > **Phase 42 adds the parser-compliance attribution layer
540
+ > on top of the Phase-41 matcher axis and grows the
541
+ > SWE-bench-Lite-style bank past the ≥ 50-instance
542
+ > external-validity threshold.** Five coupled additions,
543
+ > all *strictly above* the Phase-41 layer (every Phase-41
544
+ > default preserves Phase-41 byte-for-byte):
545
+ > (a) `tasks/swe_patch_parser` (NEW) — a
546
+ > `parse_patch_block(text, mode, unified_diff_parser)`
547
+ > entry point with three modes (`PARSER_STRICT` = Phase-41
548
+ > baseline; `PARSER_ROBUST` = Phase-42 default with five
549
+ > named recovery heuristics; `PARSER_UNIFIED` = diff-only),
550
+ > a closed ten-label failure taxonomy (`PARSE_OK`,
551
+ > `PARSE_EMPTY_OUTPUT`, `PARSE_NO_BLOCK`,
552
+ > `PARSE_UNCLOSED_NEW`, `PARSE_UNCLOSED_OLD`,
553
+ > `PARSE_MALFORMED_DIFF`, `PARSE_EMPTY_PATCH`,
554
+ > `PARSE_MULTI_BLOCK`, `PARSE_PROSE_ONLY`,
555
+ > `PARSE_FENCED_ONLY`), and a six-label recovery enum
556
+ > (`RECOVERY_NONE`, `RECOVERY_CLOSED_AT_EOS`,
557
+ > `RECOVERY_FENCED_CODE`, `RECOVERY_LABEL_PREFIX`,
558
+ > `RECOVERY_UNIFIED_DIFF`, `RECOVERY_LOOSE_DELIM`).
559
+ > `ParserComplianceCounter` exposes `compliance_rate` /
560
+ > `raw_compliance_rate` / `recovery_lift` per cell.
561
+ > (b) `tasks/swe_bench_bridge` (EXTENDED) —
562
+ > `llm_patch_generator(..., parser_mode=…,
563
+ > parser_counter=…, prompt_style=…)` routes the parser axis
564
+ > from the bridge boundary; `None` preserves the Phase-41
565
+ > regex byte-for-byte. `build_patch_generator_prompt(…,
566
+ > prompt_style="block" | "unified_diff")` opts into a
567
+ > unified-diff output contract. Re-exports
568
+ > `parse_patch_block` / `ParseOutcome` /
569
+ > `ParserComplianceCounter` so one import gives the
570
+ > caller the full Phase-42 surface.
571
+ > (c) `tasks/data/swe_lite_style_bank.jsonl`
572
+ > (REGENERATED) — the Phase-41 28-instance bank grown
573
+ > with 29 new instances covering string manipulation,
574
+ > numeric guards, sequence construction, dict helpers,
575
+ > recursion/iteration, exception handling, set algebra,
576
+ > class state transitions (`StopLight` multi-hunk,
577
+ > `Stack.pop`), binary search off-by-one, graph walk
578
+ > reachability, and default argument correction. Every
579
+ > new instance validated via the same oracle-round-trip
580
+ > precondition as Phase 41.
581
+ > (d) `core/llm_client` (EXTENDED) —
582
+ > `LLMClient(base_url=None)` plumbs the ASPEN cluster
583
+ > endpoints (macbook-1 `http://192.168.12.191:11434`,
584
+ > macbook-2 `http://192.168.12.248:11434`); default
585
+ > `None` preserves the Phase-41 localhost semantics.
586
+ > (e) `experiments/phase42_parser_sweep` (NEW) — sweeps
587
+ > `(parser_mode × apply_mode × n_distractors)` with an
588
+ > LLM-output cache keyed per
589
+ > `(instance_id, strategy_proxy, n_distractors,
590
+ > prompt_style)` so the parser-mode axis re-parses
591
+ > cached text; emits the per-strategy
592
+ > `{recovered, regressed, unchanged_pass,
593
+ > unchanged_fail}` set delta between strict and each
594
+ > non-strict parser. **Theorem P42-1** (parser-compliance
595
+ > attribution: `Δ pass@1 = |R_recovered_parser| −
596
+ > |R_regressed_parser|` under every matcher × strategy ×
597
+ > distractor cell; promotes Conjecture C41-5 to theorem).
598
+ > **Theorem P42-2** (parser recovery cannot produce a
599
+ > false pass — byte-provenance argument). **Theorem
600
+ > P42-3** (robust parser dominates on format-
601
+ > noncompliant generators). Combined with Theorem P41-3
602
+ > and Theorem P39-2, the programme now has a
603
+ > **three-axis attribution surface**
604
+ > (parser × matcher × substrate). Phase-42 mock
605
+ > reproduces Theorem P41-1 on the 57-instance bank
606
+ > (substrate prompt 205.9 tokens flat, naive 197 → 527,
607
+ > 1 368 sandboxed measurements in 122 s). See
608
+ > `RESULTS_PHASE42.md`.
609
+ >
610
+ > **Phase 41 moves the Phase-40 real SWE loop to first
611
+ > larger-N data with a two-axis attribution surface.**
612
+ > Three coupled additions, all *strictly above* the
613
+ > Phase-40 layer (every Phase-40 artifact reruns
614
+ > byte-for-byte under the Phase-41 defaults):
615
+ > (a) `tasks/data/swe_lite_style_bank.jsonl` (NEW) —
616
+ > a 28-instance SWE-bench-Lite-shape JSONL bank
617
+ > (~4.7× the Phase-40 mini bank) covering a disciplined
618
+ > spectrum of edit shapes: operator-typo, off-by-one,
619
+ > wrong-branch, seed-wrong, aggregate-missing, mutation-
620
+ > vs-copy, multi-hunk (one class touches two methods),
621
+ > parity-partition, slice-direction, index-return,
622
+ > polarity-flipped, empty-guard, type-conversion,
623
+ > unicode edge, ambiguous comparator. A bank-builder
624
+ > (`_build_swe_lite_bank.py`) round-trips every instance
625
+ > through `parse_unified_diff + apply_patch +
626
+ > run_patched_test` before writing; refuses to register
627
+ > any instance whose diff doesn't parse, whose OLD blocks
628
+ > aren't uniquely anchored, or whose oracle-patched
629
+ > source doesn't pass the hidden test. The JSONL is the
630
+ > reproducibility precondition: Phase-41 evaluation runs
631
+ > offline in seconds.
632
+ > (b) `tasks/swe_bench_bridge` + `tasks/swe_sandbox`
633
+ > (EXTENDED) — `apply_patch` accepts an `apply_mode`
634
+ > kwarg ∈ {`strict` (default, Phase-40 byte-exact),
635
+ > `lstrip` (leading-whitespace drift tolerance),
636
+ > `ws_collapse` (internal-whitespace drift),
637
+ > `line_anchored` (trailing-whitespace drift)}. All three
638
+ > permissive modes retain **unique-match discipline**
639
+ > (a normalised OLD that appears more than once in the
640
+ > normalised source is rejected as `old_ambiguous`).
641
+ > `apply_mode` is threaded through `run_swe_loop`,
642
+ > every `Sandbox.run(...)` backend, and
643
+ > `run_swe_loop_sandboxed`; `SWEReport.config` records
644
+ > it for audit.
645
+ > (c) `experiments/phase41_swe_lite_sweep` (NEW) — the
646
+ > attribution-aware driver. Caches each LLM call per
647
+ > `(instance_id, strategy, n_distractors)` so permissive
648
+ > cells reuse strict cells' proposals (no extra LLM
649
+ > wall on the matcher axis); emits a per-strategy
650
+ > `{recovered, regressed, unchanged_pass,
651
+ > unchanged_fail}` set delta between each permissive
652
+ > mode and the strict baseline. **Theorem P41-1**
653
+ > (bounded-context preservation at 4.7× scale —
654
+ > substrate 746.4 chars flat, naive 806.8 → 2 125.8
655
+ > across `n_distractors ∈ {0, 6, 12, 24}` on 672
656
+ > sandboxed measurements). **Theorem P41-2** (oracle-
657
+ > ceiling is matcher-mode-invariant — permissive
658
+ > matching subtracts no correctness from a byte-exact
659
+ > generator). **Theorem P41-3** (matcher-permissiveness
660
+ > attribution decomposition: `Δ pass@1 = |R_recovered|
661
+ > − |R_regressed|`). Combined with Theorem P39-2, the
662
+ > programme now has a **two-axis attribution surface**
663
+ > for any real SWE loop — substrate delivery × matcher
664
+ > precision. Real-LLM sweeps on `qwen2.5-coder:7b`
665
+ > (28 instances) and `gemma2:9b` (subset) populate the
666
+ > attribution tables. See `RESULTS_PHASE41.md`.
667
+ >
668
+ > **Phase 40 makes the Phase-39 SWE bridge a real
669
+ > external task loop.** Three coupled additions, all
670
+ > *strictly above* the Phase-39 schema layer:
671
+ > (a) `tasks/swe_bench_bridge` extension —
672
+ > `parse_unified_diff` (a tolerant `git diff` parser),
673
+ > `SWEBenchAdapter.from_swe_bench_dict` (the real-shape
674
+ > adapter that derives `buggy_function` from the diff
675
+ > hunk and promotes a `test_patch` into a runnable
676
+ > `test_source`), `load_jsonl_bank` (hermetic JSONL
677
+ > loader with per-instance file namespacing), and a
678
+ > bundled six-instance JSONL artifact
679
+ > (`vision_mvp/tasks/data/swe_real_shape_mini.jsonl`).
680
+ > (b) `tasks/swe_sandbox` (NEW) — a `Sandbox` protocol
681
+ > with three backends: `InProcessSandbox` (Phase-39
682
+ > wrapped), `SubprocessSandbox` (new — wall-clock
683
+ > timeout, tempdir cwd, sanitised env, JSON outcome
684
+ > protocol so test-level vs sandbox-level failures are
685
+ > attributable), `DockerSandbox` (new — optional;
686
+ > `--network=none --read-only` rootfs, `tmpfs /work`,
687
+ > `--stop-timeout`). `select_sandbox("auto")` picks
688
+ > Docker → subprocess → in-process by availability;
689
+ > `run_swe_loop_sandboxed` is the sandbox-aware
690
+ > substrate runner.
691
+ > (c) `experiments/phase40_real_swe_bridge` (NEW) — the
692
+ > end-to-end driver that composes loader + substrate +
693
+ > sandbox + (optional) real LLM patch generator. Mock
694
+ > run: 72 sandboxed measurements, pass@1 = 1.000 on
695
+ > every (strategy, distractor) cell. Real-LLM runs:
696
+ > qwen2.5:0.5b (transcription-bounded, every cell hits
697
+ > patch_no_match) and qwen2.5-coder:7b (5/6 under
698
+ > naive/routing, 4/6 under substrate — honest variance
699
+ > at small N inside the P39-2 transcription-bounded
700
+ > regime). **Theorem P40-1** (unidiff round-trip),
701
+ > **Theorem P40-2** (real-shape substrate bounded-
702
+ > context preservation — substrate prompt 813 chars
703
+ > across n_distractors ∈ {0, 6, 12, 24}; naive grows
704
+ > 826 → 2 145), **Theorem P40-3** (sandbox-boundary
705
+ > preservation — InProcessSandbox and SubprocessSandbox
706
+ > deliver pass@1 = 1.000 on the oracle ceiling on the
707
+ > mini bank and the real-shape JSONL bank). The
708
+ > external-validity gap to public SWE-bench is now
709
+ > *empirical*, not infrastructural. See
710
+ > `RESULTS_PHASE40.md`.
711
+ >
712
+ > **Phase 38 extends the coordination-primitive layer with
713
+ > four composition-level additions that close the two-layer
714
+ > ensemble, minimum-primitive-ablation, and prompt-variant
715
+ > frontier items named by Phase 37's conjectures:
716
+ > (a) `core/two_layer_ensemble` — a
717
+ > `PathUnionCausalityExtractor` with three combiner modes
718
+ > (`path_dual_agree` / `path_union_root` / `path_verified`)
719
+ > that sits strictly above any per-path noise wrapper, plus
720
+ > a `TwoLayerDefense` descriptor record. Theorem P38-2
721
+ > shows that `path_union_root` closes the Phase-37
722
+ > `adv_drop_root` cell where every reply-axis ensemble
723
+ > alone is powerless. (b) `core/extractor_adversary` —
724
+ > a `DropGoldClaimExtractor` adversarial layer-1 wrapper,
725
+ > a deterministic `NarrativeSecondaryExtractor` that
726
+ > catches dropped claims via service-tag matching, and a
727
+ > `UnionClaimExtractor` bridging the two. Theorem P38-1:
728
+ > the composition
729
+ > `UnionClaimExtractor ∘ EnsembleReplier(MODE_DUAL_AGREE)`
730
+ > is the unique configuration that recovers the joint
731
+ > layer-1 + layer-2 attack on the Phase-35 bank.
732
+ > (c) `core/primitive_ablation` — feature-flagged
733
+ > `AblatedFeatures` and thread runners
733
+ > (`run_ablated_thread_contested`, `run_ablated_thread_nested`) that toggle each
735
+ > of {`typed_vocab`, `bounded_witness`,
736
+ > `terminating_resolution`, `round_aware_state`,
737
+ > `frozen_membership`}. Theorem P38-3 presents the
738
+ > ablation-table falsifier for Phase-37 Conjecture C37-4.
739
+ > (d) `core/prompt_variants` — five surgical prompt
740
+ > variants (default, contrastive, few_shot, rubric,
741
+ > forced_order) + a `build_thread_reply_prompt_variant`
742
+ > dispatcher + a `VariantLLMThreadReplier` wrapper. Every
743
+ > variant preserves the Phase-36 typed-reply contract
744
+ > (allowed kinds, witness cap, UNCERTAIN fallback). A
745
+ > sibling `core/two_layer_ensemble` addition —
746
+ > `TwoLayerDefense` — is a descriptor record that records which
747
+ > layers are active for reporting. One surgical addition
748
+ > to `tasks/contested_incident`: an optional
749
+ > `claim_extractor` parameter on the handoff-protocol
750
+ > runners so Phase-38 layer-1 adversaries compose without
751
+ > modifying the Phase-35 decoder. No Phase-31 through
752
+ > Phase-37 primitive is modified. See `RESULTS_PHASE38.md`.**
753
+ >
754
+ > **Phase 37 extends the coordination-primitive layer with
755
+ > three composition-level additions, strictly above the
756
+ > Phase-36 reply primitives:
757
+ > (a) `core/reply_calibration` — a `CalibratingReplier` that
758
+ > wraps any `LLMThreadReplier` with a per-call oracle
759
+ > comparator and records every call into a 9-bucket
760
+ > correctness taxonomy (correct / malformed / oov / six
761
+ > semantic confusions) plus an orthogonal
762
+ > `witness_truncated` counter (Theorem P37-1: real LLMs
763
+ > produce 100 % well-formed JSON but 90 % semantic
764
+ > mislabel — the Phase-36 synthetic `malformed_prob` knob
765
+ > is a near-useless surrogate). (b) `core/reply_ensemble`
766
+ > — three pluggable ensemble modes (`dual_agree` AND-gated
767
+ > parallel; `primary_fallback` chatty-primary + fallback;
768
+ > `verified` primary + deterministic verifier), all
769
+ > matching the `LLMThreadReplier` shape so they drop into
770
+ > `causality_extractor_from_replier`. Theorems P37-2
771
+ > (biased-primary recovery), P37-3 (syntactic-noise
772
+ > recovery), P37-4 (structural limit — ensembles cannot
773
+ > recover extractor-output-level noise applied below
774
+ > them). (c) `tasks/nested_contested_incident` — a harder
775
+ > task family where round-1 replies are insufficient; a
776
+ > two-round thread harness (`run_nested_two_round_thread`)
777
+ > and a two-round adaptive-sub harness
778
+ > (`run_nested_two_round_adaptive_sub`) that uses a new
779
+ > `CLAIM_COORDINATION_BRIEFING` kind for inter-round
780
+ > auditor-to-producer briefings. Theorem P37-5: accuracy
781
+ > equivalence EXTENDS to nested contests at 0 pp gap, but
782
+ > the thread uses 0 inter-round briefings while
783
+ > adaptive_sub_2r uses 18 — a structural-complexity
784
+ > separation beneath the accuracy equivalence. No Phase-35
785
+ > or Phase-36 primitive is modified. See
786
+ > `RESULTS_PHASE37.md`.**
787
+ >
788
+ > **Phase 36 extends the dynamic-coordination layer with three
789
+ > sibling modules at the coordination-primitive layer (above
790
+ > `core/role_handoff`, parallel to `core/dynamic_comm`):
791
+ > (a) `core/reply_noise` — parameterised Bernoulli drop /
792
+ > mislabel wrappers and an adversarial reply wrapper targeting
793
+ > the gold `INDEPENDENT_ROOT` reply on a per-scenario budget
794
+ > (Theorems P36-1 graceful i.i.d. degradation and P36-2
795
+ > targeted-adversarial collapse). (b) `core/llm_thread_replier`
796
+ > — an `LLMThreadReplier` that drives a narrow, bounded LLM
797
+ > call per (producer, candidate) and returns a typed reply
798
+ > filtered against the Phase-35 reply-kind enum (Theorem P36-3
799
+ > LLM-replier substitutivity). (c) `core/adaptive_sub` — a
800
+ > bounded, TTL-expiring subscription-edit primitive
801
+ > (`AdaptiveSubscriptionTable` + `AdaptiveSubRouter` +
802
+ > `AdaptiveEdge`) offered as a serious comparison point to the
803
+ > Phase-35 escalation thread (Theorem P36-4 empirical
804
+ > equivalence). On the Phase-35 contested bank × the Phase-36
805
+ > noise × k × seed grid (96 paired measurements), the
806
+ > dynamic-thread vs adaptive-sub accuracy gap is 0.000 pp at
807
+ > every cell; token overhead is +12 %. The Phase-35 primitive
808
+ > is unchanged byte-for-byte. See `RESULTS_PHASE36.md`.**
809
+ >
810
+ > **Phase 35 adds a single new substrate layer strictly above
811
+ > Phase 31's typed-handoff layer and strictly below any
812
+ > unrestricted group-chat layer: the *escalation thread*
813
+ > (`core/dynamic_comm.EscalationThread` +
814
+ > `ThreadReply` + `ThreadResolution` + `DynamicCommRouter`). A
815
+ > thread has a frozen member set, a typed `issue_kind`
816
+ > (`RESOLVE_ROOT_CAUSE_CONFLICT` / `RESOLVE_SEVERITY_CONFLICT` /
817
+ > `RESOLVE_VERDICT_QUORUM` / `CONFIRM_CLAIM`), a bounded tuple
818
+ > of candidate claims, and three bounded budgets: `max_rounds`,
819
+ > `max_replies_per_member`, `witness_token_cap`. Member roles
820
+ > post typed replies from a small enumerated vocabulary
821
+ > (`INDEPENDENT_ROOT` / `DOWNSTREAM_SYMPTOM` / `UNCERTAIN` /
822
+ > `AGREE` / `DISAGREE` / `DEFER_TO`); the thread closes on
823
+ > quorum-on-agree, max-round exhaustion, or explicit opener
824
+ > close. The thread's single public output is a
825
+ > `CLAIM_THREAD_RESOLUTION` handoff routed through the
826
+ > unchanged Phase-31 `HandoffRouter`; thread-internal events
827
+ > (`THREAD:OPEN` / `THREAD:REPLY` / `THREAD:CLOSE`) are hash-
828
+ > chained in the existing `HandoffLog` for audit but never
829
+ > enter non-member inboxes (Theorem P35-4). Bounded-context is
830
+ > preserved with an additive `T·R_max·W` per role per round
831
+ > (Theorem P35-2), independent of |X|. The companion benchmark
832
+ > (`tasks/contested_incident`) runs a 6-scenario bank — 4
833
+ > contested root-cause pairs where static priority is
834
+ > inverted — showing the dynamic strategy at 100 % contested
835
+ > accuracy (flat at 246 tokens) vs static handoffs at 0 %
836
+ > contested accuracy (Theorem P35-1 separation). See Theorems
837
+ > P35-1..P35-4 + Conjectures C35-5, C35-6 in
838
+ > `RESULTS_PHASE35.md`.**
839
+ >
840
+ > **Phase 34 extends Arc 8 with (a) per-role-adaptive calibration
841
+ > (`core/extractor_calibration.per_role_audit_summary` +
842
+ > `core/extractor_noise.PerRoleNoiseConfig` +
843
+ > `per_role_noisy_extractor`): the pooled quadruple is now
844
+ > decomposed into per-role (δ̂_k, ε̂_k, μ̂_k, π̂_k) with a
845
+ > *limiting-role* argmax; on Phase-34's mock benchmark the per-role
846
+ > drop-rate spread is ≥ 0.33 across all three domains, confirming
847
+ > Conjecture C33-3's "pooled i.i.d. hides structure" on every
848
+ > domain; (b) an adversarial extractor wrapper
849
+ > (`core/extractor_noise.adversarial_extractor`) with three target
850
+ > modes — load-bearing claim drop with priority ordering, role
851
+ > silencing, severity-escalation injection — that provably beats
852
+ > i.i.d. at matched nominal budget (Theorem P34-2: at budget = 1 on
853
+ > all three domains the adversary collapses substrate accuracy to
854
+ > 0 % while matched i.i.d. preserves 20 %–80 %, gap +0.47 pp pooled);
855
+ > (c) the programme's first meaningful regex + LLM ensemble result
856
+ > (`core/ensemble_extractor.UnionExtractor`) on a compliance
857
+ > *mixed* bank (5 canonical + 5 narrative scenarios where regex and
858
+ > LLM have genuinely complementary coverage): regex 50 % / LLM 0 % /
859
+ > ensemble 100 % at pooled δ_u = 0.00 ≤ δ_r · δ_l = 0.188 —
860
+ > Conjecture C33-4 promoted to Theorem P34-3; (d) three theorems
861
+ > (P34-1 role-limited accuracy; P34-2 adversarial-vs-iid separation;
862
+ > P34-3 ensemble union lower bound) and two conjectures (C34-4
863
+ > typed-handoff ensemble-vs-adversary; C34-5 per-role replay as
864
+ > tighter predictor than pooled). The substrate primitive
865
+ > (`core/role_handoff`) remains byte-unchanged. See
866
+ > `RESULTS_PHASE34.md`.**
867
+ >
868
+ > **Phase 33 extends Arc 8 with (a) an LLM-driven extractor path
869
+ > (`core/llm_extractor`) — a drop-in replacement for any
870
+ > Phase-31/32 regex extractor that calls a
871
+ > ``Callable[[str], str]`` LLM per (role, scenario) boundary,
872
+ > parses the reply into typed ``(kind, payload, evids)`` tuples,
873
+ > and filters against ``known_kinds_by_role`` so the substrate's
874
+ > type-safety invariants are preserved under hallucination — the
875
+ > substrate primitive (`core/role_handoff`) is unchanged
876
+ > byte-for-byte; (b) a real-vs-synthetic noise calibration layer
877
+ > (`core/extractor_calibration`) that measures the empirical
878
+ > ``(δ̂ drop, ε̂ spurious, μ̂ mislabel, π̂ payload-corrupt)``
879
+ > quadruple against a gold causal chain and maps it to the
880
+ > closest Phase-32 synthetic sweep point — ``qwen2.5:0.5b`` on
881
+ > compliance review is 0.70 / 0.12 / 0.40 / 0.60, Phase-32
882
+ > closest-match predicts substrate accuracy / recall / precision
883
+ > within max-abs-gap 0.10 ⇒ verdict "approximates"; (c) a *third*
884
+ > non-code domain — security-audit escalation
885
+ > (`tasks/security_escalation`) — with a five-role cast (SOC /
886
+ > IR / threat intel / data steward / CISO), 15 claim kinds, and a
887
+ > novel **max-ordinal severity + claim-set classification**
888
+ > decoder (structurally distinct from Phase 31 priority-order and
889
+ > Phase 32 monotone-verdict shapes). Substrate flat at 242
890
+ > tokens / 100 % accuracy across k ∈ {6, 20, 60, 120}; naive
891
+ > collapses 100 % → 20 % at k = 120 under truncation; (d) three
892
+ > theorems (P33-1 LLM-extractor subsumption under the Phase-32
893
+ > sweep; P33-2 cross-domain correctness at K = 3; P33-3
894
+ > two-regime bound on max-ordinal decoders) and two conjectures
895
+ > (C33-3 role-heterogeneous noise; C33-4 ensemble-extractor
896
+ > composition). See `RESULTS_PHASE33.md`.**
897
+ >
898
+ > **Phase 32 extends Arc 8 with (a) a second non-code domain —
899
+ > vendor-onboarding compliance review (`tasks/compliance_review`)
900
+ > with a distinct role cast (legal / security / privacy / finance
901
+ > / compliance officer) and a priority-monotone-verdict + strict-
902
+ > set-flags decoder — that confirms the substrate's behaviour is
903
+ > domain-agnostic (substrate flat at 171 tokens / 100 % accuracy
904
+ > across k ∈ {6, 20, 60, 120}, same signature as Phase 31); (b) a
905
+ > parameterised extractor-noise module (`core/extractor_noise`)
906
+ > with five noise axes (drop / spurious / mislabel /
907
+ > payload_corrupt / seed) and a 96-point controlled sweep across
908
+ > both domains, confirming the Theorem-P32-2 two-regime
909
+ > graceful-degradation bound; (c) Theorem P32-1 (cross-domain
910
+ > correctness preservation), Theorem P32-2 (noisy-extractor
911
+ > graceful degradation, promoting C31-7 to theorem in the monotone
912
+ > regime), Theorem P32-3 (token-bound preservation under bounded
913
+ > noise — the inbox capacity is the regulariser); and (d) a
914
+ > frontier-model spot check with `qwen2.5-coder:7b` on both
915
+ > non-code benchmarks at k = 6. See Theorems P32-1..P32-3 +
916
+ > Conjectures C32-4, C32-5 in `RESULTS_PHASE32.md`.**
917
+ >
918
+ > **Phase 27 extends the runtime-calibration axis from the curated
919
+ > 21-snippet corpus to REAL CORPUS FUNCTIONS. The Phase-27 observer
920
+ > classifies every function in a corpus into a callability state
921
+ > (`ready_no_args` / `ready_typed` / `ready_curated` or one of
922
+ > several `unsupported_*` states), synthesises recipe-compatible
923
+ > arguments via a `SafeRecipeRegistry`, and runs the Phase-26 probes
924
+ > with additional `sys.settrace`-based entry detection and per-call
925
+ > wall-time budgeting. On `vision-core` (~791 functions) the ready
926
+ > slice is ~35.7 %; the remaining 64 % is structurally unprobeable
927
+ > under the default recipe strategy (methods without auto-
928
+ > constructed instances, variadic args, generators, async, untyped
929
+ > positional params). Theorem P27-1 formalises this as a strict
930
+ > inclusion $F_R \subsetneq F_A$; Theorem P27-2 shows corpus-scale
931
+ > runtime coverage is witness-availability-bounded, not planner-
932
+ > exactness-bounded — the planner round-trip remains 100 % on
933
+ > every predicate across every corpus.** The full
934
+ > architecture composes as:
935
+ >
936
+ > ```
937
+ > Routing (who talks to whom; O(log N)) — lossy by design
938
+ > ↓
939
+ > Trigger (when to refine) — lossy by design
940
+ > ↓
941
+ > Exact external memory (Merkle DAG) — LOSSLESS, content-addressed
942
+ > ↓ ┌─ text chunks (Phases 19–21)
943
+ > ↓ ├─ source files + AST metadata (Phase 22)
944
+ > ↓ ├─ source files + AST structural metadata
945
+ > ↓ │ + conservative intraprocedural metadata (Phase 24)
946
+ > ↓ └─ source files + AST structural metadata
947
+ > ↓ + conservative intraprocedural metadata
948
+ > ↓ + conservative INTERPROCEDURAL metadata (Phase 25)
949
+ > Retrieval (dense + lexical RRF + multi-hop) — lossy in ranking, never in content
950
+ > ↓
951
+ > Computation / planning (typed operators + planner) — LOSSLESS, deterministic
952
+ > ↓ ┌─ structural patterns (count / list / top / join)
953
+ > ↓ ├─ intraprocedural patterns (may_raise / recursive / io) [P24]
954
+ > ↓ └─ INTERPROCEDURAL patterns (trans_may_raise /
955
+ > ↓ participates_in_cycle /
956
+ > ↓ trans_calls_* / unresolved) [P25]
957
+ > Render: { wrap_llm | direct } — direct path: zero LLM, zero prompt
958
+ > ↓
959
+ > Bounded active context fed to the LLM (only when
960
+ > the wrap path or retrieval fallback is used) — bytes are exact slices of memory
961
+ >
962
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
963
+ > Phase-31 typed-handoff substrate (cross-role content channel)
964
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
965
+ > role A's events → role A's extractor → TypedHandoff
966
+ > (claim_kind, payload, src_event_ids, cid)
967
+ > ↓
968
+ > RoleSubscriptionTable[(src_role, claim_kind)]
969
+ > → set(consumer roles)
970
+ > ↓
971
+ > bounded RoleInbox (dedup by payload_cid,
972
+ > overflow accounted, wrong_role rejected)
973
+ > ↓
974
+ > hash-chained HandoffLog
975
+ > (SHA-256 over (prev_chain_hash, handoff
976
+ > fields); tamper / truncation detector)
977
+ > ↓
978
+ > per-(src_role, to_role, claim_kind)
979
+ > DeliveryAccount counters for the benchmark
980
+ > (Phase-31 is additive: the layer sits alongside routing and
981
+ > ingestion; teams that do not need typed handoffs can ignore
982
+ > it. The handoff layer lifts load-bearing content into routing
983
+ > headers so downstream roles can subscribe by claim-kind — the
984
+ > mechanism by which the Phase-29 "routing-by-type cannot rescue
985
+ > the aggregator" observation is resolved for general teams.)
986
+ > (Phase-32 adds a controlled noise wrapper
987
+ > `core/extractor_noise.noisy_extractor` that sits between any
988
+ > extractor and the router to exercise Theorem P32-2's
989
+ > graceful-degradation regimes; production runs use identity
990
+ > noise, the Phase-32 sweep uses non-trivial parameters.)
991
+ >
992
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
993
+ > Phase-35 dynamic-coordination layer (strictly above P31 layer)
994
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
995
+ > Auditor detects contested candidates in its RoleInbox
996
+ > ↓
997
+ > open_thread(issue_kind, frozen(members),
998
+ > candidate_claims, max_rounds,
999
+ > max_replies_per_member,
1000
+ > quorum, witness_token_cap)
1001
+ > ↓
1002
+ > member roles post typed ThreadReply messages:
1003
+ > {INDEPENDENT_ROOT, DOWNSTREAM_SYMPTOM,
1004
+ > UNCERTAIN, AGREE, DISAGREE, DEFER_TO}
1005
+ > ↓
1006
+ > close_thread → ThreadResolution:
1007
+ > {SINGLE_INDEPENDENT_ROOT, QUORUM_AGREE,
1008
+ > CONFLICT, NO_CONSENSUS, TIMEOUT}
1009
+ > ↓
1010
+ > emit(CLAIM_THREAD_RESOLUTION, payload="kind=...
1011
+ > winner=role/kind losers=r/k,...")
1012
+ > ↓ (through unchanged HandoffRouter)
1013
+ > RoleInbox(auditor) — single public output
1014
+ > (Phase-35 is strictly additive: thread-internal events live in
1015
+ > the existing HandoffLog but no inbox subscribes to the
1016
+ > THREAD:* internal claim kinds; non-member roles see zero
1017
+ > thread traffic. Bounded-context invariant extends with an
1018
+ > additive T·R_max·W per role per round — Theorem P35-2.)
1019
+ >
1020
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
1021
+ > Phase-26 runtime-calibration observer (additive, off-path)
1022
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
1023
+ > Source bytes ┄┄┄→ instrumented execution probes
1024
+ > (monkeypatched subprocess / filesystem /
1025
+ > network APIs; sys.settrace for cycles)
1026
+ > ↓
1027
+ > per-predicate RuntimeObservation:
1028
+ > runtime_flag, n_runs, n_triggered,
1029
+ > witnesses, decidable, applicable
1030
+ > ↓
1031
+ > calibration summary: FP, FN, fp_rate,
1032
+ > fn_rate, per-family breakdown
1033
+ > (source bytes and analyzer flags flow in; the runtime observer
1034
+ > reports a second truth value per predicate; the planner's
1035
+ > direct-exact path is unchanged.)
1036
+ >
1037
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
1038
+ > Phase-27 corpus-scale runtime-calibration observer (additive)
1039
+ > ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
1040
+ > Real corpus ┄┄┄→ CorpusFunctionCandidate per qname:
1041
+ > {ready_no_args | ready_typed | ready_curated
1042
+ > | unsupported_*} (AST + inspect.signature
1043
+ > + SafeRecipeRegistry lookup)
1044
+ > ↓
1045
+ > InvocationRecipe per ready candidate:
1046
+ > (no_args | typed from fuzz pool | curated)
1047
+ > ↓
1048
+ > sandbox + entry-and-budget tracer:
1049
+ > sys.settrace counts enter_count on
1050
+ > target code object; time.monotonic()
1051
+ > check every line event; sentinel on
1052
+ > budget expiry.
1053
+ > ↓
1054
+ > per-predicate CorpusObservation:
1055
+ > runtime_flag, n_runs, n_triggered,
1056
+ > n_entered, n_timeout, witnesses,
1057
+ > recipe_kind, applicable, entered, timeout
1058
+ > ↓
1059
+ > coverage account: per-status buckets +
1060
+ > ready_fraction, calibrated_fraction;
1061
+ > per-predicate metrics restricted to
1062
+ > entered=True.
1063
+ > ```
1064
+ >
1065
+ > See `vision_mvp/RESULTS_PHASE19.md`, `RESULTS_PHASE20.md`,
1066
+ > `RESULTS_PHASE21.md`, `RESULTS_PHASE22.md`, `RESULTS_PHASE23.md`,
1067
+ > `RESULTS_PHASE24.md`, and `RESULTS_PHASE25.md` for the cumulative
1068
+ > evidence: an exact byte-store + bounded-context worker beats
1069
+ > summarise-then-pool on long-document needle questions; hybrid
1070
+ > retrieval + structural multi-hop expansion close most of the
1071
+ > remaining recall gap; a typed operator pipeline answers
1072
+ > aggregation queries the retrieval layer cannot reach (91 % vs 64
1073
+ > % on synthetic aggregation, beating even oracle on that slice);
1074
+ > on a real Python codebase the direct-exact path achieves **7/7
1075
+ > correct with zero LLM calls and zero prompt chars**, while
1076
+ > retrieval-only conditions score **0/7** because aggregation is
1077
+ > structurally unreachable by top-k retrieval; across **six real
1078
+ > Python corpora** direct-exact scores **65/65 (100 %, σ = 0)** on
1079
+ > the structural battery, **44 / 44 (100 %, σ = 0)** on the
1080
+ > Phase-24 intraprocedural semantic battery, and **50 / 50 (100 %, σ =
1081
+ > 0)** on the Phase-25 interprocedural semantic battery with zero
1082
+ > LLM calls; retrieval-mediated paths average **19.7 % (σ = 17.6)**
1083
+ > on structural aggregation, **49.6 % (σ = 15.8)** on Phase-24
1084
+ > semantic, and **38.0 % (σ = 23.1)** on Phase-25 interprocedural.
1085
+ > **The exact slice now covers syntactic code structure AND
1086
+ > conservative intraprocedural-semantic code properties AND
1087
+ > conservative interprocedural-semantic code properties — the last
1088
+ > including transitive effect propagation over a local call graph
1089
+ > and exact SCC-based recursion-cycle detection.** The CASR spec
1090
+ > below is unchanged for the routing/trigger layers.
1091
+
1092
+ ---
1093
+
1094
+ ---
1095
+
1096
+ ## Design Principles
1097
+
1098
+ 1. **Routing decisions must not require reading message content.** The Bloom filter operates on event type metadata (O(1)), not event bodies. Reading content for routing decisions would negate the efficiency gains.
1099
+
1100
+ 2. **Scale assignments are declarative, set at agent instantiation.** Scales do not change mid-task. Dynamic scale inference is a research question deferred to Phase 3.
1101
+
1102
+ 3. **World-model updates are asynchronous.** The surprise filter does not block message delivery. Updates to M_i happen in a background process.
1103
+
1104
+ 4. **No information is destroyed, only routed.** The event log is append-only. Any agent can replay the full event history if needed. CASR is a filter on delivery, not on storage.
1105
+
1106
+ 5. **Fail open.** When uncertain (Bloom filter positive hit, world model not yet trained), deliver the message. Over-delivery is safer than under-delivery.
1107
+
1108
+ ---
1109
+
1110
+ ## Agent Interface
1111
+
1112
+ Every agent in a CASR-enabled team exposes this interface at instantiation:
1113
+
1114
+ ```python
1115
+ @dataclass
1116
+ class AgentConfig:
1117
+ agent_id: str
1118
+ role: str # Human-readable role name
1119
+ task_description: str # Current task at instantiation
1120
+ scale: int # 0=Token, 1=Statement, 2=Function, 3=Module, 4=System
1121
+ distortion_budget: float # Acceptable task-error probability increase (0.0 to 1.0)
1122
+ causal_footprint: BloomFilter # Pre-computed relevance filter for this role
1123
+ world_model: Optional[GenerativeModel] # None until trained in Phase 2
1124
+ surprise_threshold: float # τᵢ — KL threshold for transmission (0.0 disables filter)
1125
+ ```
1126
+
1127
+ **Scale semantics:**
1128
+
1129
+ | Scale Value | Granularity | Example Events Visible | Example Roles |
1130
+ |-------------|-------------|----------------------|---------------|
1131
+ | 0 | Token | Every token, syntax error, formatting diff | Linter, formatter, syntax checker |
1132
+ | 1 | Statement | Individual tool calls, single code lines, test results | Code writer, unit tester, file editor |
1133
+ | 2 | Function | Function completions, subtask results, local test pass/fail | Subagent, debugger, function-level reviewer |
1134
+ | 3 | Module | Subsystem changes, integration test results, cross-function state | Orchestrator, module-level planner |
1135
+ | 4 | System | Architectural decisions, goal completions, global constraints | Meta-orchestrator, project planner |
1136
+
1137
+ **Distortion budget:** Expressed as the maximum acceptable probability of the agent taking a suboptimal action due to missing context. Conservative agents (planners) use a low budget (~0.01). Monitoring agents (checking for catastrophic failures only) use a high budget (~0.20).
1138
+
1139
+ ---
1140
+
1141
+ ## The Scale Projection Operators
1142
+
1143
+ For each scale s, the projection operator P_s maps a full event to its representation at scale s.
1144
+
1145
+ **Required property (composability):**
1146
+ ```
1147
+ P_{s1}(P_{s2}(e)) = P_{max(s1,s2)}(e) for all events e
1148
+ ```
1149
+
1150
+ Applying two projections in sequence gives the coarser projection. This ensures consistency across the hierarchy.
1151
+
1152
+ **Fixed-point events** (preserved at all scales, P_s(e) = e for all s):
1153
+ - Task goal specification messages
1154
+ - Hard constraint declarations
1155
+ - Error/failure events (any unhandled exception or task failure)
1156
+ - Final output/completion events
1157
+
1158
+ **Projection implementations by scale transition:**
1159
+
1160
+ ```
1161
+ scale 0 → 1: Aggregate consecutive tokens into statement-level summaries.
1162
+ Discard whitespace, formatting, comments.
1163
+ Preserve: variable names, control flow, function calls.
1164
+
1165
+ scale 1 → 2: Aggregate statements into function-level summaries.
1166
+ Discard: intermediate variable states, loop iterations.
1167
+ Preserve: function signature, return value, side effects, errors.
1168
+
1169
+ scale 2 → 3: Aggregate function results into module-level summaries.
1170
+ Discard: internal function logic.
1171
+ Preserve: module interface changes, integration test results, exported state.
1172
+
1173
+ scale 3 → 4: Aggregate module changes into system-level summaries.
1174
+ Discard: implementation details.
1175
+ Preserve: architectural decisions, constraint violations, goal progress.
1176
+ ```
1177
+
1178
+ **Implementation note:** In the MVP, these projections are implemented as LLM calls with structured output schemas. In Phase 2, they can be replaced with fine-tuned smaller models for efficiency.
1179
+
1180
+ ---
1181
+
1182
+ ## Message Bus Architecture
1183
+
1184
+ The central component is an event-sourced message bus. All agents are publishers and subscribers.
1185
+
1186
+ ```
1187
+ ┌─────────────────────────────────────────────────────────┐
1188
+ │ EVENT BUS │
1189
+ │ │
1190
+ │ ┌──────────────┐ ┌──────────────────────────────┐ │
1191
+ │ │ Event Log │ │ Subscriber Registry │ │
1192
+ │ │ (append-only)│ │ agent_id → AgentConfig │ │
1193
+ │ └──────────────┘ └──────────────────────────────┘ │
1194
+ │ │
1195
+ │ On new event e published by agent aⱼ: │
1196
+ │ For each subscriber aᵢ: │
1197
+ │ 1. B_i(e.type) → if "definitely not": skip │
1198
+ │ 2. P_{sᵢ}(e) → compute scale projection │
1199
+ │ 3. δᵢ(e) = KL(M_i.predict() || e.embedding) │
1200
+ │ if δ < τᵢ and M_i is trained: skip │
1201
+ │ 4. Deliver P_{sᵢ}(e) to aᵢ's context queue │
1202
+ └─────────────────────────────────────────────────────────┘
1203
+ ```
1204
+
1205
+ **Event schema:**
1206
+
1207
+ ```python
1208
+ @dataclass
1209
+ class Event:
1210
+ event_id: str # UUID
1211
+ event_type: str # Enumerated type (tool_call, message, state_change, error, goal_update)
1212
+ sender_id: str # Sending agent
1213
+ timestamp: float # Unix timestamp
1214
+ scale_level: int # Scale of the originating agent
1215
+ body: dict # Full event content (not read during routing decision)
1216
+ embedding: np.ndarray # Precomputed embedding for world model comparison
1217
+ is_fixed_point: bool # If True, delivered to all agents unmodified
1218
+ ```
1219
+
1220
+ **Delivery guarantee:** At-least-once delivery. Events that pass all three CASR stages are queued for delivery. If an agent's queue is full (context window filling), the bus falls back to delivering only fixed-point events until the agent processes its queue.
1221
+
1222
+ ---
1223
+
1224
+ ## Bloom Filter Specification
1225
+
1226
+ **Construction (offline, per agent role):**
1227
+
1228
+ ```
1229
+ Input: Set of (event_type, is_relevant) pairs for this role
1230
+ Output: BloomFilter with target false positive rate p = 0.01
1231
+
1232
+ p = false_positive_rate = 0.01 (1% of irrelevant events pass the filter)
1233
+ n = number of event types in the system
1234
+ m = -n * ln(p) / (ln(2))^2 (filter size in bits)
1235
+ k = round((m/n) * ln(2)) (number of hash functions; must be an integer ≥ 1)
1236
+ ```
1237
+
1238
+ **At runtime:**
1239
+ ```
1240
+ query(event_type) → {DEFINITELY_NOT_RELEVANT, POSSIBLY_RELEVANT}
1241
+ ```
1242
+
1243
+ If DEFINITELY_NOT_RELEVANT: drop the event without reading its body.
1244
+ If POSSIBLY_RELEVANT: proceed to scale projection.
1245
+
1246
+ **Staleness mitigation:** Bloom filters are rebuilt at each task phase transition (e.g., when the orchestrator changes the global task state). Between transitions, the filter is immutable.
1247
+
1248
+ **Conservative initialization:** Before any empirical data is collected, initialize the Bloom filter to include all event types (100% pass rate). Refine using empirical footprint estimation once data is available.
1249
+
1250
+ ---
1251
+
1252
+ ## World Model Specification
1253
+
1254
+ The world model M_i for agent aᵢ is a lightweight model that predicts the next event's embedding given the agent's current context:
1255
+
1256
+ ```
1257
+ M_i : (current_context, recent_events) → predicted_event_embedding
1258
+ ```
1259
+
1260
+ **Stage 2 implementation (Phase 2+):**
1261
+ - Small transformer (≤7B parameters) or frozen large model with a fine-tuned prediction head
1262
+ - Input: last K events in aᵢ's context, projected to scale sᵢ
1263
+ - Output: predicted embedding of next event in aᵢ's context
1264
+ - Training: minimize L2 distance between predicted and actual event embeddings
1265
+
1266
+ **Surprise computation:**
1267
+ ```python
1268
+ def surprise(M_i, event_e):
1269
+ predicted = M_i.predict(current_context)
1270
+ actual = event_e.embedding
1271
+ return kl_divergence(predicted, actual)
1272
+ # or simpler: cosine_distance(predicted, actual)
1273
+ ```
1274
+
1275
+ **World model disabled (MVP):** In the MVP, M_i is not trained. Set τᵢ = 0 to disable the surprise filter, so that every event that passes the Bloom filter is delivered. The surprise filter is enabled incrementally in Phase 2.
1276
+
1277
+ ---
1278
+
1279
+ ## Failure Modes and Mitigations
1280
+
1281
+ | Failure Mode | Cause | Detection | Mitigation |
1282
+ |---|---|---|---|
1283
+ | Missing critical context | Bloom filter false negative (impossible by construction) | N/A | None needed — Bloom filters have no false negatives |
1284
+ | Context starvation | τᵢ too high, world model over-predicts | Agent produces incorrect output despite low context | Decrease τᵢ or trigger full-sync |
1285
+ | Bloom filter staleness | New event type introduced after filter construction | Agent fails to respond to new event types | Rebuild filters at phase transitions; default-include unknown event types |
1286
+ | World model drift | Team behavior diverges from training distribution | Surprise distribution shifts systematically | Periodic re-training of M_i on recent event logs |
1287
+ | Scale mismatch | Event from scale-0 agent delivered to scale-4 agent without projection | Scale-4 agent context fills with low-level detail | Scale projection is mandatory for all cross-scale delivery |
1288
+ | Orchestrator overload | All N workers complete simultaneously, flood orchestrator | Orchestrator queue depth spikes | Rate-limit delivery to orchestrator; batch completions within a time window |
1289
+
1290
+ **Full-state synchronization:** Every K rounds (K is a hyperparameter, default 50), each agent receives the unfiltered projection of all current state at its scale, bypassing all CASR filters. This corrects accumulated errors from stale Bloom filters and miscalibrated world models. K should be set to the expected task-phase length.
1291
+
1292
+ ---
1293
+
1294
+ ## Event Type Registry
1295
+
1296
+ The event type registry is a centralized list of all event types and their default scale assignments. It is the source of truth for Bloom filter construction.
1297
+
1298
+ ```
1299
+ Core event types:
1300
+
1301
+ TOOL_CALL scale=1 (statement level by default)
1302
+ TOOL_RESULT scale=1
1303
+ FILE_EDIT scale=1
1304
+ FILE_CREATE scale=2 (function/module level)
1305
+ TEST_RUN scale=2
1306
+ TEST_RESULT scale=2
1307
+ FUNCTION_COMPLETE scale=2
1308
+ MODULE_COMPLETE scale=3
1309
+ TASK_GOAL_UPDATE scale=4, is_fixed_point=True
1310
+ HARD_CONSTRAINT scale=4, is_fixed_point=True
1311
+ ERROR_UNHANDLED scale=4, is_fixed_point=True (always delivered to all agents)
1312
+ TASK_COMPLETE scale=4, is_fixed_point=True
1313
+ AGENT_SPAWN scale=3
1314
+ AGENT_TERMINATE scale=3
1315
+ MESSAGE_AGENT scale=2 (default; overridden by sender scale)
1316
+ ```
1317
+
1318
+ **Custom event types:** Teams can register domain-specific event types with explicit scale assignments and relevance mappings per role.
1319
+
1320
+ ---
1321
+
1322
+ ## Scaling Characteristics
1323
+
1324
+ | Team Size | History Depth | Naive Context (tokens) | CASR Context (tokens) | Reduction |
1325
+ |-----------|--------------|----------------------|----------------------|-----------|
1326
+ | 5 agents | 50 rounds | ~12,500 | ~2,500 | 5x |
1327
+ | 10 agents | 100 rounds | ~100,000 | ~6,600 | 15x |
1328
+ | 20 agents | 200 rounds | ~800,000 | ~14,600 | 55x |
1329
+ | 50 agents | 500 rounds | ~12,500,000 | ~46,000 | 272x |
1330
+
1331
+ *Estimates based on O(H·log(N)) vs O(N·H²) scaling, with k=50 tokens per event, branching factor b=5.*
1332
+
1333
+ These figures are theoretical estimates. Empirical validation is the primary goal of Phase 1 (MVP).
1334
+
1335
+ ---
1336
+
1337
+ ## Interface with Existing Frameworks
1338
+
1339
+ CASR is designed as a drop-in message bus layer for existing multi-agent frameworks.
1340
+
1341
+ **AutoGen integration:** Replace AutoGen's GroupChat or nested conversation patterns with the CASR event bus. Agent-to-agent messages become events; the bus handles routing.
1342
+
1343
+ **LangGraph integration:** Add a CASR routing layer to each graph edge. Before a LangGraph node receives its input state, run the state update through the CASR pipeline.
1344
+
1345
+ **CrewAI integration:** Intercept the task context assembly step. Instead of assembling full context for each agent, assemble CASR-filtered context.
1346
+
1347
+ The goal is not to replace these frameworks but to add principled context routing as a layer beneath their agent orchestration logic.
1348
+
1349
+
1350
+ ---
1351
+
1352
+ ## Phase-45 Product Surface (operator entrypoint)
1353
+
1354
+ Phase 45 added a thin orchestration surface on top of the Phase
1355
+ 31..44 stack at `vision_mvp/product/`:
1356
+
1357
+ - `vision_mvp/product/profiles.py` — six stable, versioned
1358
+ profiles (`local_smoke`, `bundled_57`, `bundled_57_mock_sweep`,
1359
+ `aspen_mac1_coder`, `aspen_mac2_frontier`, `public_jsonl`).
1360
+ Schema: `phase45.profile.v1`.
1361
+ - `vision_mvp/product/runner.py` — `run_profile(...)` composes
1362
+ readiness → sweep → report. Readiness is a hard gate unless
1363
+ overridden (Theorem P45-2). Real-LLM sweeps are *recorded* as
1364
+ a launch command rather than forked from inside the runner.
1365
+ - `vision_mvp/product/report.py` — summary renderer;
1366
+ reusable on any stored `product_report.json`.
1367
+ - One command:
1368
+ `python3 -m vision_mvp.product --profile <name> --out-dir <d>`
1369
+
1370
+ The product surface adds no new substrate semantics; see
1371
+ `vision_mvp/RESULTS_PHASE45.md` and
1372
+ `docs/context_zero_master_plan.md` §9 for the Finished-Product
1373
+ Checklist and release criteria.
1374
+
1375
+
1376
+ ---
1377
+
1378
+ ## Phase-46 Boundary Surface (external-exercise readiness)
1379
+
1380
+ Phase 46 adds a boundary layer between the Phase-45 product
1381
+ surface and the outside world:
1382
+
1383
+ - `vision_mvp/product/import_data.py` — `audit_jsonl(...)`:
1384
+ schema classification (native / hermetic / ambiguous /
1385
+ unusable), duplicate-id detection, decode / non-object /
1386
+ empty-bank failure modes, delegated Theorem-P44-3 readiness.
1387
+ CLI exit codes distinguish *missing file* (2) from *blocker*
1388
+ (1) from *clean* (0).
1389
+ - `vision_mvp/product/ci_gate.py` — `evaluate_report(...)` +
1390
+ `aggregate(...)`: five-check CI verdict over one or more
1391
+ `product_report.json` files. Threshold knobs for readiness
1392
+ fraction and per-cell pass@1; profile-whitelist support;
1393
+ machine-readable `phase46.ci_verdict.v1`.
1394
+ - Frontier-model slot: `aspen_mac1_coder_70b` profile +
1395
+ `profiles.model_availability()` declarative check. Runner
1396
+ attaches `model_metadata` to recorded launches so downstream
1397
+ consumers can distinguish *slot_pending_availability* from
1398
+ *assumed_resident* without probing Ollama.
1399
+
1400
+ The boundary layer does not change any programme-internal
1401
+ semantics; see `vision_mvp/RESULTS_PHASE46.md` and
1402
+ `docs/context_zero_master_plan.md` §9.9 for the endogenous /
1403
+ exogenous split.
1404
+
1405
+
1406
+ ---
1407
+
1408
+ ## Stable-vs-Experimental Boundary (SDK v3.29 / W28)
1409
+
1410
+ As of SDK v3.29 the CoordPy public surface is split into **stable**
1411
+ and **experimental** tiers, named explicitly in
1412
+ `vision_mvp/coordpy/__init__.py`:
1413
+
1414
+ * **Stable surface** (everything in `__all__` *not* in
1415
+ `__experimental__`): the run boundary (`RunSpec`, `run`),
1416
+ capsule primitives (`ContextCapsule`, `CapsuleLedger`,
1417
+ `CapsuleView`, lifecycle audit), provenance, the LLM backend
1418
+ abstraction (`LLMBackend`, `OllamaBackend`,
1419
+ `MLXDistributedBackend`), the team coordination ledger primitives
1420
+ (`capsule_team_handoff`, `capsule_role_view`, `capsule_team_decision`,
1421
+ `T_INVARIANTS`), and the layered API (`CoordPySimpleAPI`,
1422
+ `CoordPyBuilderAPI`, `CoordPyAdvancedAPI`). The W3 capsule contract,
1423
+ the W4 team-lifecycle audit, and the run-boundary product
1424
+ runtime contract are all in the stable surface and are subject
1425
+ to semantic-version compatibility within the 0.5.x line.
1426
+ * **Experimental surface** (`vision_mvp.coordpy.__experimental__`):
1427
+ the dense-control / multi-agent-coordination research line —
1428
+ W22 latent digest, W23 cross-cell delta, W24 session compaction,
1429
+ W25 shared fanout, W26 chain-persisted fanout, W27 multi-chain
1430
+ pivot, W28 ensemble-verified ratification. These symbols may
1431
+ evolve between minor versions; external callers should pin a
1432
+ specific SDK version (`__version__` / `SDK_VERSION`) and watch
1433
+ the CHANGELOG for breaking changes.
1434
+
1435
+ The split is *additive* on the v3.28 surface — every prior
1436
+ exported symbol remains exported; the `__experimental__` tuple
1437
+ is a *marker*, not a removal. External callers depending only on
1438
+ the stable surface should see no behavioural change crossing the
1439
+ v3.28 → v3.29 boundary.
1440
+
1441
+ The stability of the stable surface is mechanically asserted by
1442
+ `test_coordpy_public_api.py`; the experimental surface is asserted
1443
+ by the W22..W28 phase tests.