coordpy-ai 0.5.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coordpy_ai-0.5.16/ARCHITECTURE.md +1443 -0
- coordpy_ai-0.5.16/CHANGELOG.md +4013 -0
- coordpy_ai-0.5.16/CITATION.cff +21 -0
- coordpy_ai-0.5.16/CONTRIBUTING.md +212 -0
- coordpy_ai-0.5.16/LICENSE +21 -0
- coordpy_ai-0.5.16/MANIFEST.in +39 -0
- coordpy_ai-0.5.16/PKG-INFO +258 -0
- coordpy_ai-0.5.16/README.md +200 -0
- coordpy_ai-0.5.16/RELEASING.md +206 -0
- coordpy_ai-0.5.16/SECURITY.md +38 -0
- coordpy_ai-0.5.16/coordpy/__init__.py +1560 -0
- coordpy_ai-0.5.16/coordpy/__main__.py +8 -0
- coordpy_ai-0.5.16/coordpy/_cli.py +408 -0
- coordpy_ai-0.5.16/coordpy/_internal/__init__.py +9 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/__init__.py +4 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/adaptive_sub.py +524 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/agent_keys.py +154 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/code_harness.py +171 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/dynamic_comm.py +992 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/extractor_noise.py +682 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/llm_client.py +106 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/role_handoff.py +643 -0
- coordpy_ai-0.5.16/coordpy/_internal/core/task_board.py +119 -0
- coordpy_ai-0.5.16/coordpy/_internal/experiments/__init__.py +4 -0
- coordpy_ai-0.5.16/coordpy/_internal/experiments/phase44_public_readiness.py +381 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/__init__.py +19 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/__main__.py +4 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/ci_gate.py +279 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/import_data.py +276 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/profiles.py +254 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/report.py +92 -0
- coordpy_ai-0.5.16/coordpy/_internal/product/runner.py +904 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/__init__.py +1 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/code_review.py +148 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/collaborative_build.py +243 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/collaborative_module.py +311 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/compliance_review.py +1337 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/consensus.py +93 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/contested_incident.py +1993 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/corpus_registry.py +250 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/corpus_runtime_recipes.py +323 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/data/_build_swe_lite_bank.py +1975 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/data/swe_lite_style_bank.jsonl +57 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/data/swe_real_shape_mini.jsonl +6 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/distributed_summary.py +194 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/drifting_consensus.py +88 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/executable_snippets.py +542 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/incident_triage.py +1538 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/library_v2.py +724 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/llm_consensus.py +93 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/long_corpus.py +241 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/needle_corpus.py +504 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/nested_contested_incident.py +1287 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/numeric_ledger.py +622 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/protocol_codesign.py +620 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/protocolkit_36.py +792 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/python_corpus.py +745 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/quant_strategy.py +226 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/security_escalation.py +1430 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_bench_bridge.py +1951 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_loop_harness.py +594 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_patch_parser.py +873 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_raw_capture.py +534 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_sandbox.py +732 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/swe_semantic_taxonomy.py +652 -0
- coordpy_ai-0.5.16/coordpy/_internal/tasks/task_scale_swe.py +1014 -0
- coordpy_ai-0.5.16/coordpy/_version.py +11 -0
- coordpy_ai-0.5.16/coordpy/agents.py +348 -0
- coordpy_ai-0.5.16/coordpy/api_layers.py +376 -0
- coordpy_ai-0.5.16/coordpy/capsule.py +1985 -0
- coordpy_ai-0.5.16/coordpy/capsule_decoder.py +610 -0
- coordpy_ai-0.5.16/coordpy/capsule_decoder_relational.py +466 -0
- coordpy_ai-0.5.16/coordpy/capsule_decoder_v2.py +1221 -0
- coordpy_ai-0.5.16/coordpy/capsule_policy.py +585 -0
- coordpy_ai-0.5.16/coordpy/capsule_policy_bundle.py +595 -0
- coordpy_ai-0.5.16/coordpy/capsule_runtime.py +1212 -0
- coordpy_ai-0.5.16/coordpy/config.py +134 -0
- coordpy_ai-0.5.16/coordpy/extensions/__init__.py +59 -0
- coordpy_ai-0.5.16/coordpy/extensions/examples/__init__.py +8 -0
- coordpy_ai-0.5.16/coordpy/extensions/examples/jsonl_report_sink.py +64 -0
- coordpy_ai-0.5.16/coordpy/extensions/registry.py +86 -0
- coordpy_ai-0.5.16/coordpy/extensions/report_sink.py +141 -0
- coordpy_ai-0.5.16/coordpy/extensions/sandbox.py +121 -0
- coordpy_ai-0.5.16/coordpy/extensions/taskbank.py +126 -0
- coordpy_ai-0.5.16/coordpy/integrated_synthesis.py +1224 -0
- coordpy_ai-0.5.16/coordpy/lifecycle_audit.py +658 -0
- coordpy_ai-0.5.16/coordpy/llm_backend.py +438 -0
- coordpy_ai-0.5.16/coordpy/provenance.py +157 -0
- coordpy_ai-0.5.16/coordpy/py.typed +0 -0
- coordpy_ai-0.5.16/coordpy/role_invariant_synthesis.py +1198 -0
- coordpy_ai-0.5.16/coordpy/run.py +131 -0
- coordpy_ai-0.5.16/coordpy/runtime.py +860 -0
- coordpy_ai-0.5.16/coordpy/synthetic_llm.py +271 -0
- coordpy_ai-0.5.16/coordpy/team_coord.py +32022 -0
- coordpy_ai-0.5.16/coordpy/team_policy.py +391 -0
- coordpy_ai-0.5.16/coordpy_ai.egg-info/PKG-INFO +258 -0
- coordpy_ai-0.5.16/coordpy_ai.egg-info/SOURCES.txt +104 -0
- coordpy_ai-0.5.16/coordpy_ai.egg-info/dependency_links.txt +1 -0
- coordpy_ai-0.5.16/coordpy_ai.egg-info/entry_points.txt +5 -0
- coordpy_ai-0.5.16/coordpy_ai.egg-info/requires.txt +32 -0
- coordpy_ai-0.5.16/coordpy_ai.egg-info/top_level.txt +1 -0
- coordpy_ai-0.5.16/examples/agent_team.py +44 -0
- coordpy_ai-0.5.16/examples/build_with_coordpy.py +173 -0
- coordpy_ai-0.5.16/pyproject.toml +198 -0
- coordpy_ai-0.5.16/setup.cfg +4 -0
- coordpy_ai-0.5.16/tests/test_smoke_full.py +318 -0
|
@@ -0,0 +1,1443 @@
|
|
|
1
|
+
# Context Zero — Reference Architecture
|
|
2
|
+
|
|
3
|
+
**CoordPy** is the shipped **context-capsule runtime** produced by the
|
|
4
|
+
**Context Zero** research programme. Every piece of context that
|
|
5
|
+
crosses a role boundary, a layer boundary, or a run boundary in CoordPy
|
|
6
|
+
is a typed, content-addressed, lifecycle-bounded, budget-bounded,
|
|
7
|
+
provenance-carrying **capsule** — never a raw prompt string. As of
|
|
8
|
+
SDK v3.3 (April 2026), capsules drive execution at the run boundary,
|
|
9
|
+
inside the inner sweep loop, AND on the parser axis (PARSE_OUTCOME
|
|
10
|
+
capsule sealed before every PATCH_PROPOSAL — Theorem W3-39). A
|
|
11
|
+
runtime-checkable lifecycle audit mechanically verifies eight
|
|
12
|
+
invariants L-1..L-8 over every finished run (Theorem W3-40), and
|
|
13
|
+
opt-in deterministic-mode replay (`RunSpec(deterministic=True)`)
|
|
14
|
+
collapses the full capsule DAG byte-for-byte across runs of the same
|
|
15
|
+
logical input (Theorem W3-41). Meta-artefacts have a formally-defined
|
|
16
|
+
detached-witness boundary (Theorem W3-36). Substantive on-disk
|
|
17
|
+
artefacts are content-addressed at write time and re-verifiable at
|
|
18
|
+
audit time. This document is the programme's architectural
|
|
19
|
+
reference: it covers the full substrate (routing, exact memory,
|
|
20
|
+
retrieval, planner, runtime calibration, typed handoffs) and the
|
|
21
|
+
CoordPy product surface built on top of it, now *centred* on the
|
|
22
|
+
Capsule Contract and *driven* by it. For a one-pass orientation,
|
|
23
|
+
start with [`docs/START_HERE.md`](docs/START_HERE.md). For canonical
|
|
24
|
+
research status see
|
|
25
|
+
[`docs/RESEARCH_STATUS.md`](docs/RESEARCH_STATUS.md); for the
|
|
26
|
+
canonical theorem registry see
|
|
27
|
+
[`docs/THEOREM_REGISTRY.md`](docs/THEOREM_REGISTRY.md); for the
|
|
28
|
+
do-not-overstate rule book see
|
|
29
|
+
[`docs/HOW_NOT_TO_OVERSTATE.md`](docs/HOW_NOT_TO_OVERSTATE.md).
|
|
30
|
+
|
|
31
|
+
## The Capsule Contract (SDK v3 centre of gravity)
|
|
32
|
+
|
|
33
|
+
CoordPy's durable top-level description is:
|
|
34
|
+
|
|
35
|
+
> **CoordPy is a context-capsule runtime.** Every inter-role,
|
|
36
|
+
> inter-layer, and inter-run artefact satisfies a six-invariant
|
|
37
|
+
> contract:
|
|
38
|
+
>
|
|
39
|
+
> **C1 Identity.** Stable content-address (SHA-256) over
|
|
40
|
+
> `(kind, payload, budget, parents)`.
|
|
41
|
+
> **C2 Typed claim.** Closed vocabulary of `CapsuleKind`.
|
|
42
|
+
> **C3 Lifecycle.** `PROPOSED → ADMITTED → SEALED` (+ optional
|
|
43
|
+
> `RETIRED`); illegal transitions are refused.
|
|
44
|
+
> **C4 Budget.** Explicit `CapsuleBudget` checked at admit
|
|
45
|
+
> time.
|
|
46
|
+
> **C5 Provenance.** Parents must be in the ledger; the ledger
|
|
47
|
+
> keeps a hash chain so any retroactive
|
|
48
|
+
> insert breaks `verify_chain()`.
|
|
49
|
+
> **C6 Frozen.** A sealed capsule's CID is fixed for all
|
|
50
|
+
> time.
|
|
51
|
+
|
|
52
|
+
The Phase-19 `Handle`, Phase-31 `TypedHandoff`, Phase-35
|
|
53
|
+
`ThreadResolution`, Phase-36 `AdaptiveEdge`, every `SweepSpec` /
|
|
54
|
+
sweep-cell, every `ARTIFACT` on disk, and the `RUN_REPORT` itself
|
|
55
|
+
are all capsule-shaped. The `CapsuleLedger` is their shared
|
|
56
|
+
append-only, hash-chained container. The `RUN_REPORT` capsule's CID
|
|
57
|
+
is the durable identifier for a CoordPy run — send someone that CID
|
|
58
|
+
plus `product_report.json` and they can reproduce every upstream
|
|
59
|
+
capsule, verify the chain end-to-end, and know the bytes haven't
|
|
60
|
+
drifted.
|
|
61
|
+
|
|
62
|
+
Reference implementation: `vision_mvp/coordpy/capsule.py`. Theory note:
|
|
63
|
+
[`docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE.md).
|
|
64
|
+
Contract tests: `vision_mvp/tests/test_coordpy_capsules.py`
|
|
65
|
+
(invariants C1..C6 individually + end-to-end).
|
|
66
|
+
|
|
67
|
+
### Capsule-native execution (SDK v3.1)
|
|
68
|
+
|
|
69
|
+
The Capsule Contract above (C1..C6) describes *what a capsule is*.
|
|
70
|
+
SDK v3.1 adds the **execution-contract** layer: capsules drive
|
|
71
|
+
runtime, not just describe it.
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
(sealed in flight)
|
|
75
|
+
start_run seal_readiness seal_sweep_spec seal_sweep_cell seal_provenance seal_run_report
|
|
76
|
+
│ │ │ │ │ │
|
|
77
|
+
▼ ▼ ▼ ▼ ▼ ▼
|
|
78
|
+
PROFILE → READINESS_CHECK → SWEEP_SPEC → SWEEP_CELL → PROVENANCE → RUN_REPORT
|
|
79
|
+
↑ ↑
|
|
80
|
+
│ │
|
|
81
|
+
seal_and_write_artifact seal_and_write_artifact
|
|
82
|
+
(sweep_result.json) (provenance.json,
|
|
83
|
+
readiness_verdict.json)
|
|
84
|
+
(every substantive artefact gets an
|
|
85
|
+
ARTIFACT capsule whose payload SHA-256
|
|
86
|
+
is verified against the on-disk file's
|
|
87
|
+
bytes by re-read.)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
A stage that fails leaves a typed entry in the runtime's *in-flight
|
|
91
|
+
register* that never reaches the ledger. Downstream stages refuse
|
|
92
|
+
to seal because the parent CID is missing (Capsule Contract C5).
|
|
93
|
+
The capsule layer is therefore the runtime's typed execution
|
|
94
|
+
contract for the run-boundary stages (W3-32, W3-35).
|
|
95
|
+
|
|
96
|
+
### Intra-cell capsule-native + detached witness (SDK v3.2)
|
|
97
|
+
|
|
98
|
+
SDK v3.2 extends the capsule-native slice past the cell boundary.
|
|
99
|
+
Inside every sweep cell, each (task, strategy) parse→apply→test
|
|
100
|
+
transition seals two more capsules in flight:
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
SWEEP_SPEC
|
|
104
|
+
├── SWEEP_CELL_1 ··· SWEEP_CELL_n
|
|
105
|
+
├── PATCH_PROPOSAL_1 (parent: SWEEP_SPEC)
|
|
106
|
+
│ └── TEST_VERDICT_1 (parent: PATCH_PROPOSAL_1)
|
|
107
|
+
├── PATCH_PROPOSAL_2
|
|
108
|
+
│ └── TEST_VERDICT_2
|
|
109
|
+
└── ...
|
|
110
|
+
|
|
111
|
+
(post-fixed-point, secondary ledger)
|
|
112
|
+
RUN_REPORT
|
|
113
|
+
└── (cross-ref) META_MANIFEST
|
|
114
|
+
meta_artifacts:
|
|
115
|
+
product_report.json SHA
|
|
116
|
+
capsule_view.json SHA
|
|
117
|
+
product_summary.txt SHA
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The lifecycle ordering ``patch → verdict`` is enforced at the
|
|
121
|
+
type level (Theorem W3-32-extended). The meta-artefact set is
|
|
122
|
+
formally a *circularity slice* (Theorem W3-36 — no extension of
|
|
123
|
+
the primary ledger can authenticate a file whose bytes encode
|
|
124
|
+
the rendered view), so the META_MANIFEST sits in a *secondary*
|
|
125
|
+
ledger and is the one-hop trust unit beyond the primary view.
|
|
126
|
+
``coordpy-capsule verify`` (v3.2) recomputes the chain from
|
|
127
|
+
on-disk header bytes (Theorem W3-37) and re-hashes every
|
|
128
|
+
ARTIFACT and meta-artefact at audit time (Theorem W3-38).
|
|
129
|
+
|
|
130
|
+
Reference implementation:
|
|
131
|
+
`vision_mvp/coordpy/capsule_runtime.py::CapsuleNativeRunContext`
|
|
132
|
+
(``seal_patch_proposal`` / ``seal_test_verdict`` /
|
|
133
|
+
``seal_meta_manifest``); hooks plumbed through
|
|
134
|
+
`vision_mvp/tasks/swe_sandbox.py::run_swe_loop_sandboxed`. Theory
|
|
135
|
+
notes:
|
|
136
|
+
[`docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE_NATIVE.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_CAPSULE_NATIVE.md)
|
|
137
|
+
(W3-32..W3-35) and
|
|
138
|
+
[`docs/archive/coordpy-milestones/RESULTS_COORDPY_INTRA_CELL.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_INTRA_CELL.md)
|
|
139
|
+
(W3-32-extended / W3-36 / W3-37 / W3-38). Contract tests:
|
|
140
|
+
`vision_mvp/tests/test_coordpy_capsule_native.py` (16 tests, v3.1)
|
|
141
|
+
and `vision_mvp/tests/test_coordpy_capsule_native_intra_cell.py`
|
|
142
|
+
(16 tests, v3.2).
|
|
143
|
+
|
|
144
|
+
The post-hoc `build_report_ledger` adapter is retained for third
|
|
145
|
+
parties who fold finished `product_report` dicts (no runtime
|
|
146
|
+
context available); the two paths produce CID-equivalent ledgers
|
|
147
|
+
for the spine kinds (Theorem W3-34, preserved under the v3.2
|
|
148
|
+
intra-cell extension because intra-cell capsules are siblings of
|
|
149
|
+
the spine, not modifications of it).
|
|
150
|
+
|
|
151
|
+
### Sub-intra-cell parse-outcome + lifecycle audit + determinism (SDK v3.3)
|
|
152
|
+
|
|
153
|
+
SDK v3.3 extends the discipline one further structural layer with a
|
|
154
|
+
PARSE_OUTCOME capsule sealed *before* every PATCH_PROPOSAL:
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
SWEEP_SPEC
|
|
158
|
+
├── SWEEP_CELL_1 ··· SWEEP_CELL_n
|
|
159
|
+
├── PARSE_OUTCOME_1 (parent: SWEEP_SPEC)
|
|
160
|
+
│ └── PATCH_PROPOSAL_1
|
|
161
|
+
│ (parents: SWEEP_SPEC + PARSE_OUTCOME_1)
|
|
162
|
+
│ └── TEST_VERDICT_1
|
|
163
|
+
├── PARSE_OUTCOME_2
|
|
164
|
+
│ └── PATCH_PROPOSAL_2
|
|
165
|
+
│ └── TEST_VERDICT_2
|
|
166
|
+
└── ...
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
The parser's structured outcome — `ok` boolean, closed-vocabulary
|
|
170
|
+
`failure_kind`, `recovery` label, substitutions count, bounded
|
|
171
|
+
detail — becomes a typed witness on the capsule DAG. The parse →
|
|
172
|
+
patch → verdict chain is enforced at the type level (Theorem W3-39).
|
|
173
|
+
|
|
174
|
+
The **lifecycle audit** (`vision_mvp/coordpy/lifecycle_audit.py`)
|
|
175
|
+
mechanically verifies eight invariants L-1..L-8 over a finished
|
|
176
|
+
ledger:
|
|
177
|
+
|
|
178
|
+
* L-1 No orphan capsules.
|
|
179
|
+
* L-2 PATCH_PROPOSAL parents include SWEEP_SPEC.
|
|
180
|
+
* L-3 TEST_VERDICT parent is exactly one sealed PATCH_PROPOSAL.
|
|
181
|
+
* L-4 PARSE_OUTCOME parent is exactly SWEEP_SPEC.
|
|
182
|
+
* L-5 SWEEP_CELL parent is exactly SWEEP_SPEC.
|
|
183
|
+
* L-6 PATCH_PROPOSAL ↔ TEST_VERDICT ↔ PARSE_OUTCOME coordinates
|
|
184
|
+
are equinumerous.
|
|
185
|
+
* L-7 PATCH_PROPOSAL coordinates match its PARSE_OUTCOME parent's.
|
|
186
|
+
* L-8 TEST_VERDICT is sealed strictly after its PATCH_PROPOSAL.
|
|
187
|
+
|
|
188
|
+
The audit returns OK / BAD / EMPTY plus typed counterexamples. It is
|
|
189
|
+
runnable from a `CapsuleNativeRunContext` (in-process) or from an
|
|
190
|
+
on-disk `capsule_view.json` alone (forensic). Theorem W3-40 anchors
|
|
191
|
+
the audit's soundness.
|
|
192
|
+
|
|
193
|
+
**Deterministic-mode replay** (`RunSpec(deterministic=True)`) strips
|
|
194
|
+
per-run / host-local / wall-clock fields from the
|
|
195
|
+
PROVENANCE / READINESS_CHECK / RUN_REPORT capsule payloads and the
|
|
196
|
+
ARTIFACT capsule paths so two runs of the same deterministic
|
|
197
|
+
profile (mock mode, `in_process` / `subprocess` sandbox, frozen
|
|
198
|
+
JSONL) produce byte-identical full-DAG CIDs and chain head
|
|
199
|
+
(Theorem W3-41). On-disk `product_report.json` still records
|
|
200
|
+
wall-clock fields for forensic context — the determinism is on
|
|
201
|
+
the capsule graph, not on wall clock.
|
|
202
|
+
|
|
203
|
+
Reference implementation:
|
|
204
|
+
`vision_mvp/coordpy/capsule_runtime.py::CapsuleNativeRunContext.seal_parse_outcome`,
|
|
205
|
+
`vision_mvp/coordpy/lifecycle_audit.py`,
|
|
206
|
+
`vision_mvp/product/runner.py::_canonicalise_for_determinism`.
|
|
207
|
+
Theory note:
|
|
208
|
+
[`docs/archive/coordpy-milestones/RESULTS_COORDPY_DEEP_INTRA_CELL.md`](docs/archive/coordpy-milestones/RESULTS_COORDPY_DEEP_INTRA_CELL.md).
|
|
209
|
+
Contract tests: `vision_mvp/tests/test_coordpy_capsule_native_deeper.py`
|
|
210
|
+
(18 tests).
|
|
211
|
+
|
|
212
|
+
### How capsules relate to the older CASR / substrate / handoff work
|
|
213
|
+
|
|
214
|
+
| Older primitive | Phase | Capsule kind it instantiates |
|
|
215
|
+
|--- |--- |--- |
|
|
216
|
+
| `context_ledger.Handle` | 19 | `HANDLE` |
|
|
217
|
+
| `role_handoff.TypedHandoff` | 31 | `HANDOFF` |
|
|
218
|
+
| `dynamic_comm.ThreadResolution` | 35 | `THREAD_RESOLUTION` |
|
|
219
|
+
| `adaptive_sub.AdaptiveEdge` | 36 | `ADAPTIVE_EDGE` |
|
|
220
|
+
| `coordpy.runtime.SweepSpec` | — | `SWEEP_SPEC` |
|
|
221
|
+
| per-cell sweep report (`coordpy.sweep.v2`) | — | `SWEEP_CELL` |
|
|
222
|
+
| `phase44_public_readiness` verdict | 44 | `READINESS_CHECK` |
|
|
223
|
+
| `coordpy.provenance.v1` manifest | — | `PROVENANCE` |
|
|
224
|
+
| on-disk `product_report.json` etc. | — | `ARTIFACT` |
|
|
225
|
+
| resolved profile dict | — | `PROFILE` |
|
|
226
|
+
| the run itself | — | `RUN_REPORT` |
|
|
227
|
+
|
|
228
|
+
The older primitives are **byte-for-byte unchanged**. The capsule
|
|
229
|
+
layer names the contract they already satisfied, lifts them under
|
|
230
|
+
one ledger, and makes that ledger the SDK's new public centre. None
|
|
231
|
+
of this is retrofitted cryptography: the hash-chaining that
|
|
232
|
+
`HandoffLog` already did (Phase 31), the content-addressing that
|
|
233
|
+
`MerkleDAG` / `ContextLedger.put` already did (Phase 19), and the
|
|
234
|
+
provenance manifest that every run already carried are the existing
|
|
235
|
+
evidence; SDK v3 recognises that they were instances of one shared
|
|
236
|
+
thing.
|
|
237
|
+
|
|
238
|
+
> **Naming.** `Context Zero` is the research programme; `CoordPy` is the first
|
|
239
|
+
> finished product produced by it. The original substrate proposal — **CASR**
|
|
240
|
+
> (Causal-Abstraction Scale-Renormalized Routing) — lives in
|
|
241
|
+
> `vision_mvp.core.*` as research-grade code and grounds CoordPy's O(log N)
|
|
242
|
+
> bounded-context claim (Theorem 3 in
|
|
243
|
+
> [`docs/archive/pre-coordpy-theory/PROOFS.md`](docs/archive/pre-coordpy-theory/PROOFS.md)).
|
|
244
|
+
> The programme's
|
|
245
|
+
> phase-by-phase diary lives in `vision_mvp/RESULTS_PHASE*.md`; the CoordPy SDK boundary
|
|
246
|
+
> lives under `vision_mvp/coordpy/` and is the stable public contract.
|
|
247
|
+
>
|
|
248
|
+
> **CoordPy SDK boundary (Slice 1 + v3 + v3.1).** The stable public
|
|
249
|
+
> surface is: `RunSpec` (with `capsule_native: bool = True`),
|
|
250
|
+
> `run`, `CoordPyConfig`, `profiles`, `report`, `ci_gate`,
|
|
251
|
+
> `import_data`, `build_manifest`, the capsule primitives
|
|
252
|
+
> (`ContextCapsule`, `CapsuleLedger`, `CapsuleView`,
|
|
253
|
+
> `build_report_ledger`, every `capsule_from_*` adapter),
|
|
254
|
+
> the capsule-native runtime symbols (`CapsuleNativeRunContext`,
|
|
255
|
+
> `seal_and_write_artifact`, `ContentAddressMismatch`,
|
|
256
|
+
> `CONSTRUCTION_IN_FLIGHT`, `CONSTRUCTION_POST_HOC`), and the schema
|
|
257
|
+
> constants (`coordpy.provenance.v1`, `phase45.product_report.v2`,
|
|
258
|
+
> `coordpy.capsule_view.v1`, `phase46.ci_verdict.v1`,
|
|
259
|
+
> `phase46.import_audit.v1`). See the **Stability matrix** in
|
|
260
|
+
> `README.md` and in `docs/context_zero_master_plan.md` for the
|
|
261
|
+
> durable classification of every layer (CoordPy SDK · capsule
|
|
262
|
+
> primitives · capsule-native runtime · core substrate · legacy
|
|
263
|
+
> product path · plugin/extension system · unified runtime ·
|
|
264
|
+
> Docker sandbox · research shards). Anything not on the SDK
|
|
265
|
+
> surface is research-grade or boundary/next-slice and may change
|
|
266
|
+
> without notice.
|
|
267
|
+
|
|
268
|
+
> **How to read the rest of this file.** The phase-by-phase
|
|
269
|
+
> callouts immediately below (Phases 26 → 44) are a *historical
|
|
270
|
+
> incremental record* of how the substrate was built up. They are
|
|
271
|
+
> kept verbatim for provenance; each claim is anchored to a
|
|
272
|
+
> `RESULTS_PHASE*.md` note and to tests. If you want the durable
|
|
273
|
+
> architecture, skip the phase callouts and read the layered
|
|
274
|
+
> substrate diagram further down (five substrate layers + render
|
|
275
|
+
> mode + runtime calibration + typed-handoff team layer), then § 3
|
|
276
|
+
> ("Architecture of the solution") in
|
|
277
|
+
> [`docs/context_zero_master_plan.md`](docs/context_zero_master_plan.md).
|
|
278
|
+
> For the CoordPy product surface specifically, see § 10 of the master
|
|
279
|
+
> plan and [`docs/START_HERE.md`](docs/START_HERE.md).
|
|
280
|
+
|
|
281
|
+
> **Architecture as of Phase 27: five substrate layers + a render
|
|
282
|
+
> mode + a snippet-scale runtime-calibration observer (Phase 26) +
|
|
283
|
+
> a *corpus-scale* runtime-calibration observer (Phase 27). Conservative
|
|
284
|
+
> intraprocedural + interprocedural semantic code analysis sits in
|
|
285
|
+
> the ingestion layer; the new runtime-calibration layer observes
|
|
286
|
+
> instrumented execution of the same code against a per-predicate
|
|
287
|
+
> probe set and reports the analyzer-vs-runtime divergence matrix.**
|
|
288
|
+
> The original CASR spec (below) covers the *routing* and *trigger*
|
|
289
|
+
> layers. Phases 19–21 added three more layers — *exact external
|
|
290
|
+
> memory*, *retrieval*, and *computation/planning* — that handle
|
|
291
|
+
> the content and aggregation sides of context. Phase 22 generalised
|
|
292
|
+
> the substrate to real Python codebases (AST-derived typed
|
|
293
|
+
> metadata) and introduced the **direct-exact** render path that
|
|
294
|
+
> bypasses the LLM when the planner has the answer. Phase 23
|
|
295
|
+
> validated the Phase-22 result across six real Python corpora
|
|
296
|
+
> (research / utility / test / CLI-framework / stdlib) with a
|
|
297
|
+
> reusable multi-corpus registry. Phase 24 extended the
|
|
298
|
+
> direct-exact guarantee from syntactic structure to conservative
|
|
299
|
+
> *intraprocedural* static-semantic predicates (`may_raise`,
|
|
300
|
+
> `is_recursive`, `may_write_global`,
|
|
301
|
+
> `calls_subprocess`/`filesystem`/`network`), computed from the AST
|
|
302
|
+
> by `core/code_semantics`; direct-exact scored 44 / 44 (100 %, σ
|
|
303
|
+
> = 0) on the semantic battery across six corpora. Phase 25
|
|
304
|
+
> extended the exact slice to conservative *interprocedural*
|
|
305
|
+
> semantic predicates — transitive closures over a local call graph
|
|
306
|
+
> plus Tarjan-SCC recursion-cycle detection — via
|
|
307
|
+
> `core/code_interproc`; direct-exact scored **50 / 50 (100 %, σ =
|
|
308
|
+
> 0)** on the Phase-25 interprocedural battery across the same six
|
|
309
|
+
> corpora with zero LLM calls and zero prompt chars. **Phase 26
|
|
310
|
+
> introduces a separate truth axis — *runtime-truth calibration* —
|
|
311
|
+
> via instrumented probes that observe how a function actually
|
|
312
|
+
> behaves when executed. The runtime layer is ADDITIVE: it does
|
|
313
|
+
> NOT replace the analyzer or planner; it sits alongside them as
|
|
314
|
+
> an observer that reports analyzer-vs-runtime divergence per
|
|
315
|
+
> predicate. On a 21-snippet executable corpus spanning 8 families,
|
|
316
|
+
> the analyzer agrees with runtime observation on 123 / 126
|
|
317
|
+
> (97.6 %) applicable measurements; every divergence lands on a
|
|
318
|
+
> Phase-24 pre-documented boundary condition. Analyzer-gold
|
|
319
|
+
> exactness and runtime-truth calibration are formalised as
|
|
320
|
+
> independent axes (Theorem P26-1); the direct-exact planner's
|
|
321
|
+
> 126 / 126 round-trip to the analyzer demonstrates the substrate
|
|
322
|
+
> guarantee is independent of analyzer calibration.**
|
|
323
|
+
>
|
|
324
|
+
> **Phase 28 extends the runtime-calibration axis along two
|
|
325
|
+
> orthogonal dimensions: (a) runtime calibration is run over the
|
|
326
|
+
> full local Phase-23 corpus set (`vision-core`, `vision-tasks`,
|
|
327
|
+
> `vision-tests`, `vision-experiments`) with coverage reported
|
|
328
|
+
> as a first-class cross-corpus variable (`ready_fraction` ranges
|
|
329
|
+
> from 2.9 % to 80.2 %), and (b) the analyzer's `may_raise` axis
|
|
330
|
+
> is split — the Phase-24 contract is preserved unchanged as
|
|
331
|
+
> `may_raise_explicit` (sound, FN = 0 across all four corpora),
|
|
332
|
+
> and a new conservative sound-over-precision predicate
|
|
333
|
+
> `may_raise_implicit` is added for implicit-raise propagation
|
|
334
|
+
> from builtin operations (soundness: FN = 1 / 116
|
|
335
|
+
> runtime-positives on the pooled entered slice). The substrate layer is
|
|
336
|
+
> unchanged — Phase 28 touches the analyzer (`code_semantics`,
|
|
337
|
+
> `code_interproc`), the runtime observer (`code_runtime_calibration`,
|
|
338
|
+
> `code_corpus_runtime`), and adds the benchmark
|
|
339
|
+
> `phase28_multi_corpus_runtime_calibration`. See Theorems
|
|
340
|
+
> P28-1..P28-4 in `RESULTS_PHASE28.md`.**
|
|
341
|
+
>
|
|
342
|
+
> **Phase 29 adds two coupled-but-independent pieces. First, a
|
|
343
|
+
> task-scale causal-relevance harness (`tasks/task_scale_swe` +
|
|
344
|
+
> `experiments/phase29_task_scale_falsifiability`) that runs the
|
|
345
|
+
> routing / substrate stack over a multi-role SWE-style task
|
|
346
|
+
> distribution and measures, per (task, role, event), whether the
|
|
347
|
+
> event is *causally relevant* under an analyzer-derived oracle.
|
|
348
|
+
> On 80 queries / 5 718 events across four corpora, the pooled
|
|
349
|
+
> aggregator-role causal-relevance fraction under naive broadcast
|
|
350
|
+
> is **4.54 %**; the substrate collapses aggregator context by
|
|
351
|
+
> **1 007×** at **100 %** correctness on matched tasks. This is
|
|
352
|
+
> the first task-scale test of the core thesis; falsifiability
|
|
353
|
+
> decision on the ROADMAP gate: **CONFIRMED** (Theorems P29-1 /
|
|
354
|
+
> P29-2 / P29-3 / P29-4). Second, a conservative method-instance
|
|
355
|
+
> auto-construction recipe (extends `code_corpus_runtime`):
|
|
356
|
+
> methods on safely-zero-arg-constructable classes (no custom
|
|
357
|
+
> `__init__`, or `__init__` with only self + defaulted params,
|
|
358
|
+
> or `@dataclass`-all-defaulted) promote to a new `ready_method`
|
|
359
|
+
> status; the probe constructs the instance under the Phase-26
|
|
360
|
+
> sandbox + Phase-27 budget tracer. Runtime `ready_fraction` on
|
|
361
|
+
> `vision-tests` lifts 2.9 % → 98.8 %; pooled entered slice grows
|
|
362
|
+
> 4.83× (306 → 1 477) with `may_raise_explicit` FN preserved at 0
|
|
363
|
+
> and construct-failed < 1 % (Theorem P29-5). The substrate layer
|
|
364
|
+
> is unchanged; Phase 29 touches `code_corpus_runtime` (method
|
|
365
|
+
> coverage) and adds the task-scale harness. See Theorems
|
|
366
|
+
> P29-1..P29-8 in `RESULTS_PHASE29.md`.**
|
|
367
|
+
>
|
|
368
|
+
> **Phase 31 adds a new substrate layer on the *team-communication*
|
|
369
|
+
> axis — typed, content-addressed, role-scoped handoffs between
|
|
370
|
+
> agents — and ships the programme's first *non-code* task-scale
|
|
371
|
+
> benchmark. The new module (`core/role_handoff.py`) provides
|
|
372
|
+
> `TypedHandoff`, `RoleSubscriptionTable`, bounded `RoleInbox`,
|
|
373
|
+
> hash-chained `HandoffLog`, per-(source_role, to_role,
|
|
374
|
+
> claim_kind) `DeliveryAccount`, and a `HandoffRouter`. The layer
|
|
375
|
+
> sits one level above the Phase-1/29 role-keyed Bloom routing: it
|
|
376
|
+
> routes by *claim kind* (e.g. `SLOW_QUERY_OBSERVED`,
|
|
377
|
+
> `DISK_FILL_CRITICAL`), so downstream roles can subscribe to
|
|
378
|
+
> load-bearing content without reading the payload. The companion
|
|
379
|
+
> benchmark (`tasks/incident_triage`) runs a five-role operational
|
|
380
|
+
> incident-triage team across five scenario kinds and four
|
|
381
|
+
> delivery strategies; substrate prompt size is **flat at 196
|
|
382
|
+
> tokens** across distractor densities k ∈ {6, 20, 60, 120}
|
|
383
|
+
> (event-stream 40 → 440 events), while naive collapses from 100 %
|
|
384
|
+
> → 20 % at k=120 under truncation. Theorems P31-1..P31-5 + two
|
|
385
|
+
> conjectures formalise the role-conditioned relevance
|
|
386
|
+
> factorisation, communication-sparsity lower bound,
|
|
387
|
+
> bounded-context upper bound, correctness preservation under subscription
|
|
388
|
+
> coverage, and a provable separation from any single-agent
|
|
389
|
+
> compression of the event stream (P31-5). See Theorems
|
|
390
|
+
> P31-1..P31-5 in `RESULTS_PHASE31.md`.**
|
|
391
|
+
>
|
|
392
|
+
> **Phase 39 adds a multi-role SWE-bench-style bridge
|
|
393
|
+
> *strictly above* the Phase-31 typed-handoff substrate
|
|
394
|
+
> and ships the first real-LLM data point on the
|
|
395
|
+
> Phase-38 prompt-variant pipeline:
|
|
396
|
+
> (a) `tasks/swe_bench_bridge` — a `SWEBenchStyleTask`
|
|
397
|
+
> schema that mirrors the public SWE-bench instance shape
|
|
398
|
+
> (`instance_id`, `repo`, `base_commit`,
|
|
399
|
+
> `problem_statement`, `gold_patch`, `test_source`); a
|
|
400
|
+
> four-instance hand-authored `MiniSWEBank` whose patches
|
|
401
|
+
> are line-anchored substitutions and whose hidden tests
|
|
402
|
+
> run in a fresh `exec` namespace (no shell, no
|
|
403
|
+
> subprocess, no network); a four-role team
|
|
404
|
+
> (`issue_reader` / `code_searcher` / `patch_generator`
|
|
405
|
+
> / `test_runner`) wired through the unchanged Phase-31
|
|
406
|
+
> `HandoffRouter`; a `SWEBenchAdapter.from_dict` shim
|
|
407
|
+
> documenting the schema mapping for a future
|
|
408
|
+
> real-SWE-bench loader. **Theorem P39-3** (substrate
|
|
409
|
+
> bounded-context preservation) — the patch_generator's
|
|
410
|
+
> prompt size is independent of `n_distractors` (842
|
|
411
|
+
> chars at every distractor count) while naive grows
|
|
412
|
+
> from 949 → 1936; **Theorem P39-4** (schema
|
|
413
|
+
> mappability) — the gap to public SWE-bench is
|
|
414
|
+
> adapter-shaped, not architectural.
|
|
415
|
+
> (b) `experiments/phase39_swe_bridge` — a runnable
|
|
416
|
+
> driver supporting `--mode mock` (deterministic oracle
|
|
417
|
+
> generator; sub-second) and `--mode real` (Ollama LLM
|
|
418
|
+
> patch generator).
|
|
419
|
+
> (c) `experiments/phase39_frontier_substrate` — a
|
|
420
|
+
> bounded cross-family sweep on Phase-31 incident triage
|
|
421
|
+
> across `llama3.1:8b`, `gemma2:9b`, `qwen2.5-coder:7b`.
|
|
422
|
+
> (d) Real-LLM data point on the Phase-38 prompt
|
|
423
|
+
> calibration pipeline (the existing
|
|
424
|
+
> `phase38_prompt_calibration --mode real` driver).
|
|
425
|
+
> **Theorem P39-1**: on `qwen2.5:0.5b`, four of five
|
|
426
|
+
> Phase-38 variants reproduce the Phase-37 default
|
|
427
|
+
> distribution to within ±0 calls; the bias is
|
|
428
|
+
> *model-shaped, not prompt-shaped* on this size class.
|
|
429
|
+
> **Theorem P39-2** (regime taxonomy): every team-
|
|
430
|
+
> shaped task admits a *communication-bounded* vs
|
|
431
|
+
> *transcription-bounded* decomposition; the substrate
|
|
432
|
+
> is the gating constraint only when the synthesis
|
|
433
|
+
> layer is order-preserving on the typed bundle. No
|
|
434
|
+
> Phase-31 through Phase-38 primitive is modified. See
|
|
435
|
+
> `RESULTS_PHASE39.md`.**
|
|
436
|
+
>
|
|
437
|
+
> **Phase 43 adds a semantic-failure taxonomy layer
|
|
438
|
+
> *strictly above* the Phase-42 parser-compliance layer,
|
|
439
|
+
> a public-style loader self-test, a frontier-model run
|
|
440
|
+
> (``qwen3.5:35b`` 36B-MoE on the ASPEN cluster), and one
|
|
441
|
+
> byte-safe trailing-delimiter pattern added to the Phase-42
|
|
442
|
+
> ``_strip_trailing_prose`` list.** Four coupled additions,
|
|
443
|
+
> all *strictly above* the Phase-42 layer (every Phase-42
|
|
444
|
+
> default preserves Phase-42 byte-for-byte):
|
|
445
|
+
> (a) ``vision_mvp/tasks/swe_semantic_taxonomy.py`` (NEW) —
|
|
446
|
+
> nine-label closed vocabulary (``SEM_OK`` / ``SEM_PARSE_FAIL``
|
|
447
|
+
> / ``SEM_WRONG_EDIT_SITE`` / ``SEM_RIGHT_SITE_WRONG_LOGIC``
|
|
448
|
+
> / ``SEM_INCOMPLETE_MULTI_HUNK`` / ``SEM_TEST_OVERFIT`` /
|
|
449
|
+
> ``SEM_STRUCTURAL_SEMANTIC_INERT`` / ``SEM_SYNTAX_INVALID``
|
|
450
|
+
> / ``SEM_NO_MATCH_RESIDUAL``) with a pure deterministic
|
|
451
|
+
> classifier and ``SemanticCounter`` aggregator. Sits
|
|
452
|
+
> strictly above the Phase-42 parser-compliance counter in
|
|
453
|
+
> the analysis stack.
|
|
454
|
+
> (b) ``vision_mvp/experiments/phase43_frontier_headroom.py``
|
|
455
|
+
> (NEW) — Phase-43 analysis driver. Ingests Phase-42-shape
|
|
456
|
+
> artifacts, re-derives per-cell semantic labels, emits
|
|
457
|
+
> cross-model summary JSON. Includes
|
|
458
|
+
> ``verify_public_style_loader`` that round-trips every
|
|
459
|
+
> bank instance through the loader + strict matcher under
|
|
460
|
+
> the oracle (57/57 saturation on the bundled bank).
|
|
461
|
+
> (c) ``vision_mvp/core/llm_client.py`` (EXTENDED) —
|
|
462
|
+
> ``LLMClient(think=…)`` threads Ollama's ``/api/generate``
|
|
463
|
+
> ``think`` field for Qwen3-class thinking models so their
|
|
464
|
+
> output budget is not consumed by internal reasoning.
|
|
465
|
+
> Default ``None`` preserves Phase-42 byte-for-byte.
|
|
466
|
+
> (d) ``vision_mvp/tasks/swe_patch_parser.py`` (one-pattern
|
|
467
|
+
> regression fix) — ``_PROSE_TAILS`` gains one pattern
|
|
468
|
+
> ``\n\s*<{2,4}\s*\Z`` that strips partial / full trailing
|
|
469
|
+
> delimiters (``<<``, ``<<<``, ``<<<<``). Surfaced by the
|
|
470
|
+
> ``qwen3.5:35b`` cluster run's unclosed_new failure shape.
|
|
471
|
+
> Byte-safe under Theorem P42-2.
|
|
472
|
+
>
|
|
473
|
+
> **Phase 43 theory**: Theorem P43-1 (bounded-context
|
|
474
|
+
> preservation on the external-validity bank — substrate
|
|
475
|
+
> 205.9 tokens flat across the full
|
|
476
|
+
> parser × matcher × distractor cross product); Theorem
|
|
477
|
+
> P43-2 (post-parser-recovery semantic residue is
|
|
478
|
+
> structurally classifiable — nine-label taxonomy is total,
|
|
479
|
+
> exhaustive, deterministic); Theorem P43-3 (semantic-ceiling
|
|
480
|
+
> separation on coder-finetuned models at N ≥ 50 —
|
|
481
|
+
> substrate-vs-naive gap is 0 pp on every measured
|
|
482
|
+
> coder-finetuned model, per-strategy failure-mix
|
|
483
|
+
> histograms are byte-identical, and the dominant residue
|
|
484
|
+
> label is ``SEM_WRONG_EDIT_SITE`` on coder-finetuned
|
|
485
|
+
> models vs ``SEM_SYNTAX_INVALID`` on general-purpose
|
|
486
|
+
> models of matched parameter class). Four conjectures
|
|
487
|
+
> (C43-1..C43-4). The programme's durable substrate claim
|
|
488
|
+
> is now unambiguous: *bounded active context per role*, not
|
|
489
|
+
> pass@1 lift. See ``RESULTS_PHASE43.md``.
|
|
490
|
+
>
|
|
491
|
+
> **Phase 44 adds raw-text residue capture, a refined semantic
|
|
492
|
+
> taxonomy (v2 classifier), and a validated public-SWE-bench-
|
|
493
|
+
> Lite drop-in readiness pipeline — *strictly above* the
|
|
494
|
+
> Phase-43 analysis layer.** Four coupled additions, all
|
|
495
|
+
> strictly additive (every Phase-43 default preserves
|
|
496
|
+
> Phase-43 byte-for-byte):
|
|
497
|
+
> (a) ``vision_mvp/tasks/swe_raw_capture.py`` (NEW) —
|
|
498
|
+
> ``RawCaptureRecord`` / ``RawCaptureStore`` with schema
|
|
499
|
+
> version ``phase44.v1``. Each record persists the raw LLM
|
|
500
|
+
> bytes + SHA-256, the ``ParseOutcome`` dict, the proposed
|
|
501
|
+
> substitutions, the applied substitutions after the matcher,
|
|
502
|
+
> the patched-source SHA-256, and the downstream verdict.
|
|
503
|
+
> ``make_capturing_generator`` wraps a bridge generator or a
|
|
504
|
+
> fresh ``llm_call`` and plumbs raw text into the store
|
|
505
|
+
> while preserving the Phase-42 LLM-output cache discipline.
|
|
506
|
+
> (b) ``vision_mvp/tasks/swe_semantic_taxonomy.py``
|
|
507
|
+
> (EXTENDED) — five new sub-labels
|
|
508
|
+
> (``SEM_RIGHT_FILE_WRONG_SPAN``, ``SEM_RIGHT_SPAN_WRONG_LOGIC``,
|
|
509
|
+
> ``SEM_PARTIAL_MULTI_HUNK_SUCCESS``,
|
|
510
|
+
> ``SEM_NARROW_FIX_TEST_OVERFIT``, ``SEM_STRUCTURAL_VALID_INERT``)
|
|
511
|
+
> partition the Phase-43 coarse buckets when raw bytes are
|
|
512
|
+
> available. ``classify_semantic_outcome_v2`` subsumes the v1
|
|
513
|
+
> classifier on sentinel inputs (Theorem P44-2).
|
|
514
|
+
> ``REFINEMENT_MAP`` is reflexive so the sentinel path
|
|
515
|
+
> remains a legal v2 classification.
|
|
516
|
+
> (c) ``vision_mvp/experiments/phase44_semantic_residue.py``
|
|
517
|
+
> (NEW) — sweep mode runs the Phase-42-shape experiment with
|
|
518
|
+
> raw capture on; analyse-only mode consumes (parent,
|
|
519
|
+
> capture) pairs and emits a ``phase44.summary.v1`` JSON
|
|
520
|
+
> with per-cell coarse + refined counters and a
|
|
521
|
+
> ``coarse_to_refined_partition`` audit.
|
|
522
|
+
> (d) ``vision_mvp/experiments/phase44_public_readiness.py``
|
|
523
|
+
> (NEW) — five-check CI-gate validator (schema / adapter /
|
|
524
|
+
> parser / matcher / test_runner) on any local JSONL.
|
|
525
|
+
> Emits ``{"ready": true, "n": 57, ...}`` on the bundled
|
|
526
|
+
> bank in ~5 s wall (Theorem P44-3).
|
|
527
|
+
>
|
|
528
|
+
> **Phase 44 theory**: Theorem P44-1 (raw capture is a
|
|
529
|
+
> lossless projection of pipeline state); Theorem P44-2
|
|
530
|
+
> (refined classifier is monotone on sentinel inputs —
|
|
531
|
+
> backwards-compatibility with Phase 43 is a theorem, not an
|
|
532
|
+
> aspiration); Theorem P44-3 (public-readiness saturates on
|
|
533
|
+
> the bundled bank at external-validity scale — the
|
|
534
|
+
> externalisation gap is now purely data-availability).
|
|
535
|
+
> Four conjectures (C44-1..C44-4) frame the sharper
|
|
536
|
+
> residue-composition questions raw capture makes
|
|
537
|
+
> measurable. See ``RESULTS_PHASE44.md``.
|
|
538
|
+
>
|
|
539
|
+
> **Phase 42 adds the parser-compliance attribution layer
|
|
540
|
+
> on top of the Phase-41 matcher axis and grows the
|
|
541
|
+
> SWE-bench-Lite-style bank past the ≥ 50-instance
|
|
542
|
+
> external-validity threshold.** Five coupled additions,
|
|
543
|
+
> all *strictly above* the Phase-41 layer (every Phase-41
|
|
544
|
+
> default preserves Phase-41 byte-for-byte):
|
|
545
|
+
> (a) `tasks/swe_patch_parser` (NEW) — a
|
|
546
|
+
> `parse_patch_block(text, mode, unified_diff_parser)`
|
|
547
|
+
> entry point with three modes (`PARSER_STRICT` = Phase-41
|
|
548
|
+
> baseline; `PARSER_ROBUST` = Phase-42 default with five
|
|
549
|
+
> named recovery heuristics; `PARSER_UNIFIED` = diff-only),
|
|
550
|
+
> a closed ten-label failure taxonomy (`PARSE_OK`,
|
|
551
|
+
> `PARSE_EMPTY_OUTPUT`, `PARSE_NO_BLOCK`,
|
|
552
|
+
> `PARSE_UNCLOSED_NEW`, `PARSE_UNCLOSED_OLD`,
|
|
553
|
+
> `PARSE_MALFORMED_DIFF`, `PARSE_EMPTY_PATCH`,
|
|
554
|
+
> `PARSE_MULTI_BLOCK`, `PARSE_PROSE_ONLY`,
|
|
555
|
+
> `PARSE_FENCED_ONLY`), and a six-label recovery enum
|
|
556
|
+
> (`RECOVERY_NONE`, `RECOVERY_CLOSED_AT_EOS`,
|
|
557
|
+
> `RECOVERY_FENCED_CODE`, `RECOVERY_LABEL_PREFIX`,
|
|
558
|
+
> `RECOVERY_UNIFIED_DIFF`, `RECOVERY_LOOSE_DELIM`).
|
|
559
|
+
> `ParserComplianceCounter` exposes `compliance_rate` /
|
|
560
|
+
> `raw_compliance_rate` / `recovery_lift` per cell.
|
|
561
|
+
> (b) `tasks/swe_bench_bridge` (EXTENDED) —
|
|
562
|
+
> `llm_patch_generator(..., parser_mode=…,
|
|
563
|
+
> parser_counter=…, prompt_style=…)` routes the parser axis
|
|
564
|
+
> from the bridge boundary; `None` preserves the Phase-41
|
|
565
|
+
> regex byte-for-byte. `build_patch_generator_prompt(…,
|
|
566
|
+
> prompt_style="block" | "unified_diff")` opts into a
|
|
567
|
+
> unified-diff output contract. Re-exports
|
|
568
|
+
> `parse_patch_block` / `ParseOutcome` /
|
|
569
|
+
> `ParserComplianceCounter` so one import gives the
|
|
570
|
+
> caller the full Phase-42 surface.
|
|
571
|
+
> (c) `tasks/data/swe_lite_style_bank.jsonl`
|
|
572
|
+
> (REGENERATED) — the Phase-41 28-instance bank grown
|
|
573
|
+
> with 29 new instances covering string manipulation,
|
|
574
|
+
> numeric guards, sequence construction, dict helpers,
|
|
575
|
+
> recursion/iteration, exception handling, set algebra,
|
|
576
|
+
> class state transitions (`StopLight` multi-hunk,
|
|
577
|
+
> `Stack.pop`), binary search off-by-one, graph walk
|
|
578
|
+
> reachability, and default argument correction. Every
|
|
579
|
+
> new instance validated via the same oracle-round-trip
|
|
580
|
+
> precondition as Phase 41.
|
|
581
|
+
> (d) `core/llm_client` (EXTENDED) —
|
|
582
|
+
> `LLMClient(base_url=None)` plumbs the ASPEN cluster
|
|
583
|
+
> endpoints (macbook-1 `http://192.168.12.191:11434`,
|
|
584
|
+
> macbook-2 `http://192.168.12.248:11434`); default
|
|
585
|
+
> `None` preserves the Phase-41 localhost semantics.
|
|
586
|
+
> (e) `experiments/phase42_parser_sweep` (NEW) — sweeps
|
|
587
|
+
> `(parser_mode × apply_mode × n_distractors)` with an
|
|
588
|
+
> LLM-output cache keyed per
|
|
589
|
+
> `(instance_id, strategy_proxy, n_distractors,
|
|
590
|
+
> prompt_style)` so the parser-mode axis re-parses
|
|
591
|
+
> cached text; emits the per-strategy
|
|
592
|
+
> `{recovered, regressed, unchanged_pass,
|
|
593
|
+
> unchanged_fail}` set delta between strict and each
|
|
594
|
+
> non-strict parser. **Theorem P42-1** (parser-compliance
|
|
595
|
+
> attribution: `Δ pass@1 = |R_recovered_parser| −
|
|
596
|
+
> |R_regressed_parser|` under every matcher × strategy ×
|
|
597
|
+
> distractor cell; promotes Conjecture C41-5 to theorem).
|
|
598
|
+
> **Theorem P42-2** (parser recovery cannot produce a
|
|
599
|
+
> false pass — byte-provenance argument). **Theorem
|
|
600
|
+
> P42-3** (robust parser dominates on format-
|
|
601
|
+
> noncompliant generators). Combined with Theorem P41-3
|
|
602
|
+
> and Theorem P39-2, the programme now has a
|
|
603
|
+
> **three-axis attribution surface**
|
|
604
|
+
> (parser × matcher × substrate). Phase-42 mock
|
|
605
|
+
> reproduces Theorem P41-1 on the 57-instance bank
|
|
606
|
+
> (substrate prompt 205.9 tokens flat, naive 197 → 527,
|
|
607
|
+
> 1 368 sandboxed measurements in 122 s). See
|
|
608
|
+
> `RESULTS_PHASE42.md`.
|
|
609
|
+
>
|
|
610
|
+
> **Phase 41 moves the Phase-40 real SWE loop to first
|
|
611
|
+
> larger-N data with a two-axis attribution surface.**
|
|
612
|
+
> Three coupled additions, all *strictly above* the
|
|
613
|
+
> Phase-40 layer (every Phase-40 artifact reruns
|
|
614
|
+
> byte-for-byte under the Phase-41 defaults):
|
|
615
|
+
> (a) `tasks/data/swe_lite_style_bank.jsonl` (NEW) —
|
|
616
|
+
> a 28-instance SWE-bench-Lite-shape JSONL bank
|
|
617
|
+
> (~4.7× the Phase-40 mini bank) covering a disciplined
|
|
618
|
+
> spectrum of edit shapes: operator-typo, off-by-one,
|
|
619
|
+
> wrong-branch, seed-wrong, aggregate-missing, mutation-
|
|
620
|
+
> vs-copy, multi-hunk (one class touches two methods),
|
|
621
|
+
> parity-partition, slice-direction, index-return,
|
|
622
|
+
> polarity-flipped, empty-guard, type-conversion,
|
|
623
|
+
> unicode edge, ambiguous comparator. A bank-builder
|
|
624
|
+
> (`_build_swe_lite_bank.py`) round-trips every instance
|
|
625
|
+
> through `parse_unified_diff + apply_patch +
|
|
626
|
+
> run_patched_test` before writing; refuses to register
|
|
627
|
+
> any instance whose diff doesn't parse, whose OLD blocks
|
|
628
|
+
> aren't uniquely anchored, or whose oracle-patched
|
|
629
|
+
> source doesn't pass the hidden test. The JSONL is the
|
|
630
|
+
> reproducibility precondition: Phase-41 evaluation runs
|
|
631
|
+
> offline in seconds.
|
|
632
|
+
> (b) `tasks/swe_bench_bridge` + `tasks/swe_sandbox`
|
|
633
|
+
> (EXTENDED) — `apply_patch` accepts an `apply_mode`
|
|
634
|
+
> kwarg ∈ {`strict` (default, Phase-40 byte-exact),
|
|
635
|
+
> `lstrip` (leading-whitespace drift tolerance),
|
|
636
|
+
> `ws_collapse` (internal-whitespace drift),
|
|
637
|
+
> `line_anchored` (trailing-whitespace drift)}. All three
|
|
638
|
+
> permissive modes retain **unique-match discipline**
|
|
639
|
+
> (a normalised OLD that appears more than once in the
|
|
640
|
+
> normalised source is rejected as `old_ambiguous`).
|
|
641
|
+
> `apply_mode` is threaded through `run_swe_loop`,
|
|
642
|
+
> every `Sandbox.run(...)` backend, and
|
|
643
|
+
> `run_swe_loop_sandboxed`; `SWEReport.config` records
|
|
644
|
+
> it for audit.
|
|
645
|
+
> (c) `experiments/phase41_swe_lite_sweep` (NEW) — the
|
|
646
|
+
> attribution-aware driver. Caches each LLM call per
|
|
647
|
+
> `(instance_id, strategy, n_distractors)` so permissive
|
|
648
|
+
> cells reuse strict cells' proposals (no extra LLM
|
|
649
|
+
> wall time on the matcher axis); emits a per-strategy
|
|
650
|
+
> `{recovered, regressed, unchanged_pass,
|
|
651
|
+
> unchanged_fail}` set delta between each permissive
|
|
652
|
+
> mode and the strict baseline. **Theorem P41-1**
|
|
653
|
+
> (bounded-context preservation at 4.7× scale —
|
|
654
|
+
> substrate 746.4 chars flat, naive 806.8 → 2 125.8
|
|
655
|
+
> across `n_distractors ∈ {0, 6, 12, 24}` on 672
|
|
656
|
+
> sandboxed measurements). **Theorem P41-2** (oracle-
|
|
657
|
+
> ceiling is matcher-mode-invariant — permissive
|
|
658
|
+
> matching subtracts no correctness from a byte-exact
|
|
659
|
+
> generator). **Theorem P41-3** (matcher-permissiveness
|
|
660
|
+
> attribution decomposition: `Δ pass@1 = |R_recovered|
|
|
661
|
+
> − |R_regressed|`). Combined with Theorem P39-2, the
|
|
662
|
+
> programme now has a **two-axis attribution surface**
|
|
663
|
+
> for any real SWE loop — substrate delivery × matcher
|
|
664
|
+
> precision. Real-LLM sweeps on `qwen2.5-coder:7b`
|
|
665
|
+
> (28 instances) and `gemma2:9b` (subset) populate the
|
|
666
|
+
> attribution tables. See `RESULTS_PHASE41.md`.
|
|
667
|
+
>
|
|
668
|
+
> **Phase 40 makes the Phase-39 SWE bridge a real
|
|
669
|
+
> external task loop.** Three coupled additions, all
|
|
670
|
+
> *strictly above* the Phase-39 schema layer:
|
|
671
|
+
> (a) `tasks/swe_bench_bridge` extension —
|
|
672
|
+
> `parse_unified_diff` (a tolerant `git diff` parser),
|
|
673
|
+
> `SWEBenchAdapter.from_swe_bench_dict` (the real-shape
|
|
674
|
+
> adapter that derives `buggy_function` from the diff
|
|
675
|
+
> hunk and promotes a `test_patch` into a runnable
|
|
676
|
+
> `test_source`), `load_jsonl_bank` (hermetic JSONL
|
|
677
|
+
> loader with per-instance file namespacing), and a
|
|
678
|
+
> bundled six-instance JSONL artifact
|
|
679
|
+
> (`vision_mvp/tasks/data/swe_real_shape_mini.jsonl`).
|
|
680
|
+
> (b) `tasks/swe_sandbox` (NEW) — a `Sandbox` protocol
|
|
681
|
+
> with three backends: `InProcessSandbox` (Phase-39
|
|
682
|
+
> wrapped), `SubprocessSandbox` (new — wall-clock
|
|
683
|
+
> timeout, tempdir cwd, sanitised env, JSON outcome
|
|
684
|
+
> protocol so test-level vs sandbox-level failures are
|
|
685
|
+
> attributable), `DockerSandbox` (new — optional;
|
|
686
|
+
> `--network=none --read-only` rootfs, `tmpfs /work`,
|
|
687
|
+
> `--stop-timeout`). `select_sandbox("auto")` picks
|
|
688
|
+
> Docker → subprocess → in-process by availability;
|
|
689
|
+
> `run_swe_loop_sandboxed` is the sandbox-aware
|
|
690
|
+
> substrate runner.
|
|
691
|
+
> (c) `experiments/phase40_real_swe_bridge` (NEW) — the
|
|
692
|
+
> end-to-end driver that composes loader + substrate +
|
|
693
|
+
> sandbox + (optional) real LLM patch generator. Mock
|
|
694
|
+
> run: 72 sandboxed measurements, pass@1 = 1.000 on
|
|
695
|
+
> every (strategy, distractor) cell. Real-LLM runs:
|
|
696
|
+
> qwen2.5:0.5b (transcription-bounded, every cell hits
|
|
697
|
+
> patch_no_match) and qwen2.5-coder:7b (5/6 under
|
|
698
|
+
> naive/routing, 4/6 under substrate — honest variance
|
|
699
|
+
> at small N inside the P39-2 transcription-bounded
|
|
700
|
+
> regime). **Theorem P40-1** (unidiff round-trip),
|
|
701
|
+
> **Theorem P40-2** (real-shape substrate bounded-
|
|
702
|
+
> context preservation — substrate prompt 813 chars
|
|
703
|
+
> across n_distractors ∈ {0, 6, 12, 24}; naive grows
|
|
704
|
+
> 826 → 2 145), **Theorem P40-3** (sandbox-boundary
|
|
705
|
+
> preservation — InProcessSandbox and SubprocessSandbox
|
|
706
|
+
> deliver pass@1 = 1.000 on the oracle ceiling on the
|
|
707
|
+
> mini bank and the real-shape JSONL bank). The
|
|
708
|
+
> external-validity gap to public SWE-bench is now
|
|
709
|
+
> *empirical*, not infrastructural. See
|
|
710
|
+
> `RESULTS_PHASE40.md`.
|
|
711
|
+
>
|
|
712
|
+
> **Phase 38 extends the coordination-primitive layer with
|
|
713
|
+
> four composition-level additions that close the two-layer
|
|
714
|
+
> ensemble, minimum-primitive-ablation, and prompt-variant
|
|
715
|
+
> frontier items named by Phase 37's conjectures:
|
|
716
|
+
> (a) `core/two_layer_ensemble` — a
|
|
717
|
+
> `PathUnionCausalityExtractor` with three combiner modes
|
|
718
|
+
> (`path_dual_agree` / `path_union_root` / `path_verified`)
|
|
719
|
+
> that sits strictly above any per-path noise wrapper, plus
|
|
720
|
+
> a `TwoLayerDefense` descriptor record. Theorem P38-2
|
|
721
|
+
> shows that `path_union_root` closes the Phase-37
|
|
722
|
+
> `adv_drop_root` cell where every reply-axis ensemble
|
|
723
|
+
> alone is powerless. (b) `core/extractor_adversary` —
|
|
724
|
+
> a `DropGoldClaimExtractor` adversarial layer-1 wrapper,
|
|
725
|
+
> a deterministic `NarrativeSecondaryExtractor` that
|
|
726
|
+
> catches dropped claims via service-tag matching, and a
|
|
727
|
+
> `UnionClaimExtractor` bridging the two. Theorem P38-1:
|
|
728
|
+
> the composition
|
|
729
|
+
> `UnionClaimExtractor ∘ EnsembleReplier(MODE_DUAL_AGREE)`
|
|
730
|
+
> is the unique configuration that recovers the joint
|
|
731
|
+
> layer-1 + layer-2 attack on the Phase-35 bank.
|
|
732
|
+
> (c) `core/primitive_ablation` — feature-flagged
|
|
733
|
+
> `AblatedFeatures` and thread runners
|
|
734
|
+
> (`run_ablated_thread_contested`, `run_ablated_thread_nested`) that toggle each
|
|
735
|
+
> of {`typed_vocab`, `bounded_witness`,
|
|
736
|
+
> `terminating_resolution`, `round_aware_state`,
|
|
737
|
+
> `frozen_membership`}. Theorem P38-3 presents the
|
|
738
|
+
> ablation-table falsifier for Phase-37 Conjecture C37-4.
|
|
739
|
+
> (d) `core/prompt_variants` — five surgical prompt
|
|
740
|
+
> variants (default, contrastive, few_shot, rubric,
|
|
741
|
+
> forced_order) + a `build_thread_reply_prompt_variant`
|
|
742
|
+
> dispatcher + a `VariantLLMThreadReplier` wrapper. Every
|
|
743
|
+
> variant preserves the Phase-36 typed-reply contract
|
|
744
|
+
> (allowed kinds, witness cap, UNCERTAIN fallback). A
|
|
745
|
+
> sibling `core/two_layer_ensemble` addition —
|
|
745
|
+
> `TwoLayerDefense` — is a descriptor record that records which
|
|
747
|
+
> layers are active for reporting. One surgical addition
|
|
748
|
+
> to `tasks/contested_incident`: an optional
|
|
749
|
+
> `claim_extractor` parameter on the handoff-protocol
|
|
750
|
+
> runners so Phase-38 layer-1 adversaries compose without
|
|
751
|
+
> modifying the Phase-35 decoder. No Phase-31 through
|
|
752
|
+
> Phase-37 primitive is modified. See `RESULTS_PHASE38.md`.**
|
|
753
|
+
>
|
|
754
|
+
> **Phase 37 extends the coordination-primitive layer with
|
|
755
|
+
> three composition-level additions, strictly above the
|
|
756
|
+
> Phase-36 reply primitives:
|
|
757
|
+
> (a) `core/reply_calibration` — a `CalibratingReplier` that
|
|
758
|
+
> wraps any `LLMThreadReplier` with a per-call oracle
|
|
759
|
+
> comparator and records every call into a 9-bucket
|
|
760
|
+
> correctness taxonomy (correct / malformed / oov / six
|
|
761
|
+
> semantic confusions) plus an orthogonal
|
|
762
|
+
> `witness_truncated` counter (Theorem P37-1: real LLMs
|
|
763
|
+
> produce 100 % well-formed JSON but 90 % semantic
|
|
764
|
+
> mislabel — the Phase-36 synthetic `malformed_prob` knob
|
|
765
|
+
> is a near-useless surrogate). (b) `core/reply_ensemble`
|
|
766
|
+
> — three pluggable ensemble modes (`dual_agree` AND-gated
|
|
767
|
+
> parallel; `primary_fallback` chatty-primary + fallback;
|
|
768
|
+
> `verified` primary + deterministic verifier), all
|
|
769
|
+
> matching the `LLMThreadReplier` shape so they drop into
|
|
770
|
+
> `causality_extractor_from_replier`. Theorems P37-2
|
|
771
|
+
> (biased-primary recovery), P37-3 (syntactic-noise
|
|
772
|
+
> recovery), P37-4 (structural limit — ensembles cannot
|
|
773
|
+
> recover extractor-output-level noise applied below
|
|
774
|
+
> them). (c) `tasks/nested_contested_incident` — a harder
|
|
775
|
+
> task family where round-1 replies are insufficient; a
|
|
776
|
+
> two-round thread harness (`run_nested_two_round_thread`)
|
|
777
|
+
> and a two-round adaptive-sub harness
|
|
778
|
+
> (`run_nested_two_round_adaptive_sub`) that uses a new
|
|
779
|
+
> `CLAIM_COORDINATION_BRIEFING` kind for inter-round
|
|
780
|
+
> auditor-to-producer briefings. Theorem P37-5: accuracy
|
|
781
|
+
> equivalence EXTENDS to nested contests at 0 pp gap, but
|
|
782
|
+
> the thread uses 0 inter-round briefings while
|
|
783
|
+
> adaptive_sub_2r uses 18 — a structural-complexity
|
|
784
|
+
> separation beneath the accuracy equivalence. No Phase-35
|
|
785
|
+
> or Phase-36 primitive is modified. See
|
|
786
|
+
> `RESULTS_PHASE37.md`.**
|
|
787
|
+
>
|
|
788
|
+
> **Phase 36 extends the dynamic-coordination layer with three
|
|
789
|
+
> sibling modules at the coordination-primitive layer (above
|
|
790
|
+
> `core/role_handoff`, parallel to `core/dynamic_comm`):
|
|
791
|
+
> (a) `core/reply_noise` — parameterised Bernoulli drop /
|
|
792
|
+
> mislabel wrappers and an adversarial reply wrapper targeting
|
|
793
|
+
> the gold `INDEPENDENT_ROOT` reply on a per-scenario budget
|
|
794
|
+
> (Theorems P36-1 graceful i.i.d. degradation and P36-2
|
|
795
|
+
> targeted-adversarial collapse). (b) `core/llm_thread_replier`
|
|
796
|
+
> — an `LLMThreadReplier` that drives a narrow, bounded LLM
|
|
797
|
+
> call per (producer, candidate) and returns a typed reply
|
|
798
|
+
> filtered against the Phase-35 reply-kind enum (Theorem P36-3
|
|
799
|
+
> LLM-replier substitutivity). (c) `core/adaptive_sub` — a
|
|
800
|
+
> bounded, TTL-expiring subscription-edit primitive
|
|
801
|
+
> (`AdaptiveSubscriptionTable` + `AdaptiveSubRouter` +
|
|
802
|
+
> `AdaptiveEdge`) offered as a serious comparison point to the
|
|
803
|
+
> Phase-35 escalation thread (Theorem P36-4 empirical
|
|
804
|
+
> equivalence). On the Phase-35 contested bank × the Phase-36
|
|
805
|
+
> noise × k × seed grid (96 paired measurements), the
|
|
806
|
+
> dynamic-thread vs adaptive-sub accuracy gap is 0.000 pp at
|
|
807
|
+
> every cell; token overhead is +12 %. The Phase-35 primitive
|
|
808
|
+
> is unchanged byte-for-byte. See `RESULTS_PHASE36.md`.**
|
|
809
|
+
>
|
|
810
|
+
> **Phase 35 adds a single new substrate layer strictly above
|
|
811
|
+
> Phase 31's typed-handoff layer and strictly below any
|
|
812
|
+
> unrestricted group-chat layer: the *escalation thread*
|
|
813
|
+
> (`core/dynamic_comm.EscalationThread` +
|
|
814
|
+
> `ThreadReply` + `ThreadResolution` + `DynamicCommRouter`). A
|
|
815
|
+
> thread has a frozen member set, a typed `issue_kind`
|
|
816
|
+
> (`RESOLVE_ROOT_CAUSE_CONFLICT` / `RESOLVE_SEVERITY_CONFLICT` /
|
|
817
|
+
> `RESOLVE_VERDICT_QUORUM` / `CONFIRM_CLAIM`), a bounded tuple
|
|
818
|
+
> of candidate claims, and three bounded budgets: `max_rounds`,
|
|
819
|
+
> `max_replies_per_member`, `witness_token_cap`. Member roles
|
|
820
|
+
> post typed replies from a small enumerated vocabulary
|
|
821
|
+
> (`INDEPENDENT_ROOT` / `DOWNSTREAM_SYMPTOM` / `UNCERTAIN` /
|
|
822
|
+
> `AGREE` / `DISAGREE` / `DEFER_TO`); the thread closes on
|
|
823
|
+
> quorum-on-agree, max-round exhaustion, or explicit opener
|
|
824
|
+
> close. The thread's single public output is a
|
|
825
|
+
> `CLAIM_THREAD_RESOLUTION` handoff routed through the
|
|
826
|
+
> unchanged Phase-31 `HandoffRouter`; thread-internal events
|
|
827
|
+
> (`THREAD:OPEN` / `THREAD:REPLY` / `THREAD:CLOSE`) are hash-
|
|
828
|
+
> chained in the existing `HandoffLog` for audit but never
|
|
829
|
+
> enter non-member inboxes (Theorem P35-4). Bounded-context is
|
|
830
|
+
> preserved with an additive `T·R_max·W` term per role per round
|
|
831
|
+
> (Theorem P35-2), independent of |X|. The companion benchmark
|
|
832
|
+
> (`tasks/contested_incident`) runs a 6-scenario bank — 4
|
|
833
|
+
> contested root-cause pairs where static priority is
|
|
834
|
+
> inverted — showing the dynamic strategy at 100 % contested
|
|
835
|
+
> accuracy (flat at 246 tokens) vs static handoffs at 0 %
|
|
836
|
+
> contested accuracy (Theorem P35-1 separation). See Theorems
|
|
837
|
+
> P35-1..P35-4 + Conjectures C35-5, C35-6 in
|
|
838
|
+
> `RESULTS_PHASE35.md`.**
|
|
839
|
+
>
|
|
840
|
+
> **Phase 34 extends Arc 8 with (a) per-role-adaptive calibration
|
|
841
|
+
> (`core/extractor_calibration.per_role_audit_summary` +
|
|
842
|
+
> `core/extractor_noise.PerRoleNoiseConfig` +
|
|
843
|
+
> `per_role_noisy_extractor`): the pooled quadruple is now
|
|
844
|
+
> decomposed into per-role (δ̂_k, ε̂_k, μ̂_k, π̂_k) with a
|
|
845
|
+
> *limiting-role* argmax; on Phase-34's mock benchmark the per-role
|
|
846
|
+
> drop-rate spread is ≥ 0.33 across all three domains, confirming
|
|
847
|
+
> Conjecture C33-3's "pooled i.i.d. hides structure" on every
|
|
848
|
+
> domain; (b) an adversarial extractor wrapper
|
|
849
|
+
> (`core/extractor_noise.adversarial_extractor`) with three target
|
|
850
|
+
> modes — load-bearing claim drop with priority ordering, role
|
|
851
|
+
> silencing, severity-escalation injection — that provably beats
|
|
852
|
+
> i.i.d. at matched nominal budget (Theorem P34-2: at budget = 1 on
|
|
853
|
+
> all three domains the adversary collapses substrate accuracy to
|
|
854
|
+
> 0 % while matched i.i.d. preserves 20 %–80 %, gap +47 pp pooled);
|
|
855
|
+
> (c) the programme's first meaningful regex + LLM ensemble result
|
|
856
|
+
> (`core/ensemble_extractor.UnionExtractor`) on a compliance
|
|
857
|
+
> *mixed* bank (5 canonical + 5 narrative scenarios where regex and
|
|
858
|
+
> LLM have genuinely complementary coverage): regex 50 % / LLM 0 % /
|
|
859
|
+
> ensemble 100 % at pooled δ_u = 0.00 ≤ δ_r · δ_l = 0.188 —
|
|
860
|
+
> Conjecture C33-4 promoted to Theorem P34-3; (d) three theorems
|
|
861
|
+
> (P34-1 role-limited accuracy; P34-2 adversarial-vs-iid separation;
|
|
862
|
+
> P34-3 ensemble union lower bound) and two conjectures (C34-4
|
|
863
|
+
> typed-handoff ensemble-vs-adversary; C34-5 per-role replay as
|
|
864
|
+
> tighter predictor than pooled). The substrate primitive
|
|
865
|
+
> (`core/role_handoff`) remains byte-unchanged. See
|
|
866
|
+
> `RESULTS_PHASE34.md`.**
|
|
867
|
+
>
|
|
868
|
+
> **Phase 33 extends Arc 8 with (a) an LLM-driven extractor path
|
|
869
|
+
> (`core/llm_extractor`) — a drop-in replacement for any
|
|
870
|
+
> Phase-31/32 regex extractor that calls a
|
|
871
|
+
> ``Callable[[str], str]`` LLM per (role, scenario) boundary,
|
|
872
|
+
> parses the reply into typed ``(kind, payload, evids)`` tuples,
|
|
873
|
+
> and filters against ``known_kinds_by_role`` so the substrate's
|
|
874
|
+
> type-safety invariants are preserved under hallucination — the
|
|
875
|
+
> substrate primitive (`core/role_handoff`) is unchanged
|
|
876
|
+
> byte-for-byte; (b) a real-vs-synthetic noise calibration layer
|
|
877
|
+
> (`core/extractor_calibration`) that measures the empirical
|
|
878
|
+
> ``(δ̂ drop, ε̂ spurious, μ̂ mislabel, π̂ payload-corrupt)``
|
|
879
|
+
> quadruple against a gold causal chain and maps it to the
|
|
880
|
+
> closest Phase-32 synthetic sweep point — ``qwen2.5:0.5b`` on
|
|
881
|
+
> compliance review is 0.70 / 0.12 / 0.40 / 0.60, Phase-32
|
|
882
|
+
> closest-match predicts substrate accuracy / recall / precision
|
|
883
|
+
> within max-abs-gap 0.10 ⇒ verdict "approximates"; (c) a *third*
|
|
884
|
+
> non-code domain — security-audit escalation
|
|
885
|
+
> (`tasks/security_escalation`) — with a five-role cast (SOC /
|
|
886
|
+
> IR / threat intel / data steward / CISO), 15 claim kinds, and a
|
|
887
|
+
> novel **max-ordinal severity + claim-set classification**
|
|
888
|
+
> decoder (structurally distinct from Phase 31 priority-order and
|
|
889
|
+
> Phase 32 monotone-verdict shapes). Substrate flat at 242
|
|
890
|
+
> tokens / 100 % accuracy across k ∈ {6, 20, 60, 120}; naive
|
|
891
|
+
> collapses 100 % → 20 % at k = 120 under truncation; (d) three
|
|
892
|
+
> theorems (P33-1 LLM-extractor subsumption under the Phase-32
|
|
893
|
+
> sweep; P33-2 cross-domain correctness at K = 3; P33-3
|
|
894
|
+
> two-regime bound on max-ordinal decoders) and two conjectures
|
|
895
|
+
> (C33-3 role-heterogeneous noise; C33-4 ensemble-extractor
|
|
896
|
+
> composition). See `RESULTS_PHASE33.md`.**
|
|
897
|
+
>
|
|
898
|
+
> **Phase 32 extends Arc 8 with (a) a second non-code domain —
|
|
899
|
+
> vendor-onboarding compliance review (`tasks/compliance_review`)
|
|
900
|
+
> with a distinct role cast (legal / security / privacy / finance
|
|
901
|
+
> / compliance officer) and a priority-monotone-verdict + strict-
|
|
902
|
+
> set-flags decoder — that confirms the substrate's behaviour is
|
|
903
|
+
> domain-agnostic (substrate flat at 171 tokens / 100 % accuracy
|
|
904
|
+
> across k ∈ {6, 20, 60, 120}, same signature as Phase 31); (b) a
|
|
905
|
+
> parameterised extractor-noise module (`core/extractor_noise`)
|
|
906
|
+
> with five noise axes (drop / spurious / mislabel /
|
|
907
|
+
> payload_corrupt / seed) and a 96-point controlled sweep across
|
|
908
|
+
> both domains, confirming the Theorem-P32-2 two-regime
|
|
909
|
+
> graceful-degradation bound; (c) Theorem P32-1 (cross-domain
|
|
910
|
+
> correctness preservation), Theorem P32-2 (noisy-extractor
|
|
911
|
+
> graceful degradation, promoting C31-7 to theorem in the monotone
|
|
912
|
+
> regime), Theorem P32-3 (token-bound preservation under bounded
|
|
913
|
+
> noise — the inbox capacity is the regulariser); and (d) a
|
|
914
|
+
> frontier-model spot check with `qwen2.5-coder:7b` on both
|
|
915
|
+
> non-code benchmarks at k = 6. See Theorems P32-1..P32-3 +
|
|
916
|
+
> Conjectures C32-4, C32-5 in `RESULTS_PHASE32.md`.**
|
|
917
|
+
>
|
|
918
|
+
> **Phase 27 extends the runtime-calibration axis from the curated
|
|
919
|
+
> 21-snippet corpus to REAL CORPUS FUNCTIONS. The Phase-27 observer
|
|
920
|
+
> classifies every function in a corpus into a callability state
|
|
921
|
+
> (`ready_no_args` / `ready_typed` / `ready_curated` or one of
|
|
922
|
+
> several `unsupported_*` states), synthesises recipe-compatible
|
|
923
|
+
> arguments via a `SafeRecipeRegistry`, and runs the Phase-26 probes
|
|
924
|
+
> with additional `sys.settrace`-based entry detection and per-call
|
|
925
|
+
> wall-time budgeting. On `vision-core` (~791 functions) the ready
|
|
926
|
+
> slice is ~35.7 %; the remaining ~64 % is structurally unprobeable
|
|
927
|
+
> under the default recipe strategy (methods without auto-
|
|
928
|
+
> constructed instances, variadic args, generators, async, untyped
|
|
929
|
+
> positional params). Theorem P27-1 formalises this as a strict
|
|
930
|
+
> inclusion $F_R \subsetneq F_A$; Theorem P27-2 shows corpus-scale
|
|
931
|
+
> runtime coverage is witness-availability-bounded, not planner-
|
|
932
|
+
> exactness-bounded — the planner round-trip remains 100 % on
|
|
933
|
+
> every predicate across every corpus.** The full
|
|
934
|
+
> architecture composes as:
|
|
935
|
+
>
|
|
936
|
+
> ```
|
|
937
|
+
> Routing (who talks to whom; O(log N)) — lossy by design
|
|
938
|
+
> ↓
|
|
939
|
+
> Trigger (when to refine) — lossy by design
|
|
940
|
+
> ↓
|
|
941
|
+
> Exact external memory (Merkle DAG) — LOSSLESS, content-addressed
|
|
942
|
+
> ↓ ┌─ text chunks (Phases 19–21)
|
|
943
|
+
> ↓ ├─ source files + AST metadata (Phase 22)
|
|
944
|
+
> ↓ ├─ source files + AST structural metadata
|
|
945
|
+
> ↓ │ + conservative intraprocedural metadata (Phase 24)
|
|
946
|
+
> ↓ └─ source files + AST structural metadata
|
|
947
|
+
> ↓ + conservative intraprocedural metadata
|
|
948
|
+
> ↓ + conservative INTERPROCEDURAL metadata (Phase 25)
|
|
949
|
+
> Retrieval (dense + lexical RRF + multi-hop) — lossy in ranking, never in content
|
|
950
|
+
> ↓
|
|
951
|
+
> Computation / planning (typed operators + planner) — LOSSLESS, deterministic
|
|
952
|
+
> ↓ ┌─ structural patterns (count / list / top / join)
|
|
953
|
+
> ↓ ├─ intraprocedural patterns (may_raise / recursive / io) [P24]
|
|
954
|
+
> ↓ └─ INTERPROCEDURAL patterns (trans_may_raise /
|
|
955
|
+
> ↓ participates_in_cycle /
|
|
956
|
+
> ↓ trans_calls_* / unresolved) [P25]
|
|
957
|
+
> Render: { wrap_llm | direct } — direct path: zero LLM, zero prompt
|
|
958
|
+
> ↓
|
|
959
|
+
> Bounded active context fed to the LLM (only when
|
|
960
|
+
> the wrap path or retrieval fallback is used) — bytes are exact slices of memory
|
|
961
|
+
>
|
|
962
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
963
|
+
> Phase-31 typed-handoff substrate (cross-role content channel)
|
|
964
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
965
|
+
> role A's events → role A's extractor → TypedHandoff
|
|
966
|
+
> (claim_kind, payload, src_event_ids, cid)
|
|
967
|
+
> ↓
|
|
968
|
+
> RoleSubscriptionTable[(src_role, claim_kind)]
|
|
969
|
+
> → set(consumer roles)
|
|
970
|
+
> ↓
|
|
971
|
+
> bounded RoleInbox (dedup by payload_cid,
|
|
972
|
+
> overflow accounted, wrong_role rejected)
|
|
973
|
+
> ↓
|
|
974
|
+
> hash-chained HandoffLog
|
|
975
|
+
> (SHA-256 over (prev_chain_hash, handoff
|
|
976
|
+
> fields); tamper / truncation detector)
|
|
977
|
+
> ↓
|
|
978
|
+
> per-(src_role, to_role, claim_kind)
|
|
979
|
+
> DeliveryAccount counters for the benchmark
|
|
980
|
+
> (Phase-31 is additive: the layer sits alongside routing and
|
|
981
|
+
> ingestion; teams that do not need typed handoffs can ignore
|
|
982
|
+
> it. The handoff layer lifts load-bearing content into routing
|
|
983
|
+
> headers so downstream roles can subscribe by claim-kind — the
|
|
984
|
+
> mechanism by which the Phase-29 "routing-by-type cannot rescue
|
|
985
|
+
> the aggregator" observation is resolved for general teams.)
|
|
986
|
+
> (Phase-32 adds a controlled noise wrapper
|
|
987
|
+
> `core/extractor_noise.noisy_extractor` that sits between any
|
|
988
|
+
> extractor and the router to exercise Theorem P32-2's
|
|
989
|
+
> graceful-degradation regimes; production runs use identity
|
|
990
|
+
> noise, the Phase-32 sweep uses non-trivial parameters.)
|
|
991
|
+
>
|
|
992
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
993
|
+
> Phase-35 dynamic-coordination layer (strictly above P31 layer)
|
|
994
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
995
|
+
> Auditor detects contested candidates in its RoleInbox
|
|
996
|
+
> ↓
|
|
997
|
+
> open_thread(issue_kind, frozen(members),
|
|
998
|
+
> candidate_claims, max_rounds,
|
|
999
|
+
> max_replies_per_member,
|
|
1000
|
+
> quorum, witness_token_cap)
|
|
1001
|
+
> ↓
|
|
1002
|
+
> member roles post typed ThreadReply messages:
|
|
1003
|
+
> {INDEPENDENT_ROOT, DOWNSTREAM_SYMPTOM,
|
|
1004
|
+
> UNCERTAIN, AGREE, DISAGREE, DEFER_TO}
|
|
1005
|
+
> ↓
|
|
1006
|
+
> close_thread → ThreadResolution:
|
|
1007
|
+
> {SINGLE_INDEPENDENT_ROOT, QUORUM_AGREE,
|
|
1008
|
+
> CONFLICT, NO_CONSENSUS, TIMEOUT}
|
|
1009
|
+
> ↓
|
|
1010
|
+
> emit(CLAIM_THREAD_RESOLUTION, payload="kind=...
|
|
1011
|
+
> winner=role/kind losers=r/k,...")
|
|
1012
|
+
> ↓ (through unchanged HandoffRouter)
|
|
1013
|
+
> RoleInbox(auditor) — single public output
|
|
1014
|
+
> (Phase-35 is strictly additive: thread-internal events live in
|
|
1015
|
+
> the existing HandoffLog but no inbox subscribes to the
|
|
1016
|
+
> THREAD:* internal claim kinds; non-member roles see zero
|
|
1017
|
+
> thread traffic. Bounded-context invariant extends with an
|
|
1018
|
+
> additive T·R_max·W per role per round — Theorem P35-2.)
|
|
1019
|
+
>
|
|
1020
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
1021
|
+
> Phase-26 runtime-calibration observer (additive, off-path)
|
|
1022
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
1023
|
+
> Source bytes ┄┄┄→ instrumented execution probes
|
|
1024
|
+
> (monkeypatched subprocess / filesystem /
|
|
1025
|
+
> network APIs; sys.settrace for cycles)
|
|
1026
|
+
> ↓
|
|
1027
|
+
> per-predicate RuntimeObservation:
|
|
1028
|
+
> runtime_flag, n_runs, n_triggered,
|
|
1029
|
+
> witnesses, decidable, applicable
|
|
1030
|
+
> ↓
|
|
1031
|
+
> calibration summary: FP, FN, fp_rate,
|
|
1032
|
+
> fn_rate, per-family breakdown
|
|
1033
|
+
> (source bytes and analyzer flags flow in; the runtime observer
|
|
1034
|
+
> reports a second truth value per predicate; the planner's
|
|
1035
|
+
> direct-exact path is unchanged.)
|
|
1036
|
+
>
|
|
1037
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
1038
|
+
> Phase-27 corpus-scale runtime-calibration observer (additive)
|
|
1039
|
+
> ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
|
1040
|
+
> Real corpus ┄┄┄→ CorpusFunctionCandidate per qname:
|
|
1041
|
+
> {ready_no_args | ready_typed | ready_curated
|
|
1042
|
+
> | unsupported_*} (AST + inspect.signature
|
|
1043
|
+
> + SafeRecipeRegistry lookup)
|
|
1044
|
+
> ↓
|
|
1045
|
+
> InvocationRecipe per ready candidate:
|
|
1046
|
+
> (no_args | typed from fuzz pool | curated)
|
|
1047
|
+
> ↓
|
|
1048
|
+
> sandbox + entry-and-budget tracer:
|
|
1049
|
+
> sys.settrace counts enter_count on
|
|
1050
|
+
> target code object; time.monotonic()
|
|
1051
|
+
> check every line event; sentinel on
|
|
1052
|
+
> budget expiry.
|
|
1053
|
+
> ↓
|
|
1054
|
+
> per-predicate CorpusObservation:
|
|
1055
|
+
> runtime_flag, n_runs, n_triggered,
|
|
1056
|
+
> n_entered, n_timeout, witnesses,
|
|
1057
|
+
> recipe_kind, applicable, entered, timeout
|
|
1058
|
+
> ↓
|
|
1059
|
+
> coverage account: per-status buckets +
|
|
1060
|
+
> ready_fraction, calibrated_fraction;
|
|
1061
|
+
> per-predicate metrics restricted to
|
|
1062
|
+
> entered=True.
|
|
1063
|
+
> ```
|
|
1064
|
+
>
|
|
1065
|
+
> See `vision_mvp/RESULTS_PHASE19.md`, `RESULTS_PHASE20.md`,
|
|
1066
|
+
> `RESULTS_PHASE21.md`, `RESULTS_PHASE22.md`, `RESULTS_PHASE23.md`,
|
|
1067
|
+
> `RESULTS_PHASE24.md`, and `RESULTS_PHASE25.md` for the cumulative
|
|
1068
|
+
> evidence: an exact byte-store + bounded-context worker beats
|
|
1069
|
+
> summarise-then-pool on long-document needle questions; hybrid
|
|
1070
|
+
> retrieval + structural multi-hop expansion close most of the
|
|
1071
|
+
> remaining recall gap; a typed operator pipeline answers
|
|
1072
|
+
> aggregation queries the retrieval layer cannot reach (91 % vs 64
|
|
1073
|
+
> % on synthetic aggregation, beating even oracle on that slice);
|
|
1074
|
+
> on a real Python codebase the direct-exact path achieves **7/7
|
|
1075
|
+
> correct with zero LLM calls and zero prompt chars**, while
|
|
1076
|
+
> retrieval-only conditions score **0/7** because aggregation is
|
|
1077
|
+
> structurally unreachable by top-k retrieval; across **six real
|
|
1078
|
+
> Python corpora** direct-exact scores **65/65 (100 %, σ = 0)** on
|
|
1079
|
+
> the structural battery, **44 / 44 (100 %, σ = 0)** on the Phase-
|
|
1080
|
+
> 24 intraprocedural semantic battery, and **50 / 50 (100 %, σ =
|
|
1081
|
+
> 0)** on the Phase-25 interprocedural semantic battery with zero
|
|
1082
|
+
> LLM calls; retrieval-mediated paths average **19.7 % (σ = 17.6)**
|
|
1083
|
+
> on structural aggregation, **49.6 % (σ = 15.8)** on Phase-24
|
|
1084
|
+
> semantic, and **38.0 % (σ = 23.1)** on Phase-25 interprocedural.
|
|
1085
|
+
> **The exact slice now covers syntactic code structure AND
|
|
1086
|
+
> conservative intraprocedural-semantic code properties AND
|
|
1087
|
+
> conservative interprocedural-semantic code properties — the last
|
|
1088
|
+
> including transitive effect propagation over a local call graph
|
|
1089
|
+
> and exact SCC-based recursion-cycle detection.** The CASR spec
|
|
1090
|
+
> below is unchanged for the routing/trigger layers.
|
|
1091
|
+
|
|
1092
|
+
---
|
|
1093
|
+
|
|
1094
|
+
---
|
|
1095
|
+
|
|
1096
|
+
## Design Principles
|
|
1097
|
+
|
|
1098
|
+
1. **Routing decisions must not require reading message content.** The Bloom filter operates on event type metadata (O(1)), not event bodies. Reading content for routing decisions would negate the efficiency gains.
|
|
1099
|
+
|
|
1100
|
+
2. **Scale assignments are declarative, set at agent instantiation.** Scales do not change mid-task. Dynamic scale inference is a research question deferred to Phase 3.
|
|
1101
|
+
|
|
1102
|
+
3. **World model updates are asynchronous.** The surprise filter does not block message delivery. Updates to M_i happen in a background process.
|
|
1103
|
+
|
|
1104
|
+
4. **No information is destroyed, only routed.** The event log is append-only. Any agent can replay the full event history if needed. CASR is a filter on delivery, not on storage.
|
|
1105
|
+
|
|
1106
|
+
5. **Fail open.** When uncertain (Bloom filter positive hit, world model not yet trained), deliver the message. Over-delivery is safer than under-delivery.
|
|
1107
|
+
|
|
1108
|
+
---
|
|
1109
|
+
|
|
1110
|
+
## Agent Interface
|
|
1111
|
+
|
|
1112
|
+
Every agent in a CASR-enabled team exposes this interface at instantiation:
|
|
1113
|
+
|
|
1114
|
+
```python
|
|
1115
|
+
@dataclass
|
|
1116
|
+
class AgentConfig:
|
|
1117
|
+
agent_id: str
|
|
1118
|
+
role: str # Human-readable role name
|
|
1119
|
+
task_description: str # Current task at instantiation
|
|
1120
|
+
scale: int # 0=Token, 1=Statement, 2=Function, 3=Module, 4=System
|
|
1121
|
+
distortion_budget: float # Acceptable task-error probability increase (0.0 to 1.0)
|
|
1122
|
+
causal_footprint: BloomFilter # Pre-computed relevance filter for this role
|
|
1123
|
+
world_model: Optional[GenerativeModel] # None until trained in Phase 2
|
|
1124
|
+
surprise_threshold: float # τᵢ — KL threshold for transmission (0.0 disables filter)
|
|
1125
|
+
```
|
|
1126
|
+
|
|
1127
|
+
**Scale semantics:**
|
|
1128
|
+
|
|
1129
|
+
| Scale Value | Granularity | Example Events Visible | Example Roles |
|
|
1130
|
+
|-------------|-------------|----------------------|---------------|
|
|
1131
|
+
| 0 | Token | Every token, syntax error, formatting diff | Linter, formatter, syntax checker |
|
|
1132
|
+
| 1 | Statement | Individual tool calls, single code lines, test results | Code writer, unit tester, file editor |
|
|
1133
|
+
| 2 | Function | Function completions, subtask results, local test pass/fail | Subagent, debugger, function-level reviewer |
|
|
1134
|
+
| 3 | Module | Subsystem changes, integration test results, cross-function state | Orchestrator, module-level planner |
|
|
1135
|
+
| 4 | System | Architectural decisions, goal completions, global constraints | Meta-orchestrator, project planner |
|
|
1136
|
+
|
|
1137
|
+
**Distortion budget:** Expressed as the maximum acceptable probability that the agent takes a suboptimal action due to missing context. Conservative agents (planners) use a low budget (~0.01). Monitoring agents (checking for catastrophic failures only) use a high budget (~0.20).
|
|
1138
|
+
|
|
1139
|
+
---
|
|
1140
|
+
|
|
1141
|
+
## The Scale Projection Operators
|
|
1142
|
+
|
|
1143
|
+
For each scale s, the projection operator P_s maps a full event to its representation at scale s.
|
|
1144
|
+
|
|
1145
|
+
**Required property (composability):**
|
|
1146
|
+
```
|
|
1147
|
+
P_{s1}(P_{s2}(e)) = P_{max(s1,s2)}(e) for all events e
|
|
1148
|
+
```
|
|
1149
|
+
|
|
1150
|
+
Applying two projections in sequence gives the coarser projection. This ensures consistency across the hierarchy.
|
|
1151
|
+
|
|
1152
|
+
**Fixed-point events** (preserved at all scales, P_s(e) = e for all s):
|
|
1153
|
+
- Task goal specification messages
|
|
1154
|
+
- Hard constraint declarations
|
|
1155
|
+
- Error/failure events (any unhandled exception or task failure)
|
|
1156
|
+
- Final output/completion events
|
|
1157
|
+
|
|
1158
|
+
**Projection implementations by scale transition:**
|
|
1159
|
+
|
|
1160
|
+
```
|
|
1161
|
+
scale 0 → 1: Aggregate consecutive tokens into statement-level summaries.
|
|
1162
|
+
Discard whitespace, formatting, comments.
|
|
1163
|
+
Preserve: variable names, control flow, function calls.
|
|
1164
|
+
|
|
1165
|
+
scale 1 → 2: Aggregate statements into function-level summaries.
|
|
1166
|
+
Discard: intermediate variable states, loop iterations.
|
|
1167
|
+
Preserve: function signature, return value, side effects, errors.
|
|
1168
|
+
|
|
1169
|
+
scale 2 → 3: Aggregate function results into module-level summaries.
|
|
1170
|
+
Discard: internal function logic.
|
|
1171
|
+
Preserve: module interface changes, integration test results, exported state.
|
|
1172
|
+
|
|
1173
|
+
scale 3 → 4: Aggregate module changes into system-level summaries.
|
|
1174
|
+
Discard: implementation details.
|
|
1175
|
+
Preserve: architectural decisions, constraint violations, goal progress.
|
|
1176
|
+
```
|
|
1177
|
+
|
|
1178
|
+
**Implementation note:** In the MVP, these projections are implemented as LLM calls with structured output schemas. In Phase 2, they can be replaced with fine-tuned smaller models for efficiency.
|
|
1179
|
+
|
|
1180
|
+
---
|
|
1181
|
+
|
|
1182
|
+
## Message Bus Architecture
|
|
1183
|
+
|
|
1184
|
+
The central component is an event-sourced message bus. All agents are publishers and subscribers.
|
|
1185
|
+
|
|
1186
|
+
```
|
|
1187
|
+
┌─────────────────────────────────────────────────────────┐
|
|
1188
|
+
│ EVENT BUS │
|
|
1189
|
+
│ │
|
|
1190
|
+
│ ┌──────────────┐ ┌──────────────────────────────┐ │
|
|
1191
|
+
│ │ Event Log │ │ Subscriber Registry │ │
|
|
1192
|
+
│ │ (append-only)│ │ agent_id → AgentConfig │ │
|
|
1193
|
+
│ └──────────────┘ └──────────────────────────────┘ │
|
|
1194
|
+
│ │
|
|
1195
|
+
│ On new event e published by agent aⱼ: │
|
|
1196
|
+
│ For each subscriber aᵢ: │
|
|
1197
|
+
│ 1. B_i(e.type) → if "definitely not": skip │
|
|
1198
|
+
│ 2. P_{sᵢ}(e) → compute scale projection │
|
|
1199
|
+
│ 3. δᵢ(e) = KL(M_i.predict() || e.embedding) │
|
|
1200
|
+
│ if δᵢ(e) < τᵢ and M_i is trained: skip │
|
|
1201
|
+
│ 4. Deliver P_{sᵢ}(e) to aᵢ's context queue │
|
|
1202
|
+
└─────────────────────────────────────────────────────────┘
|
|
1203
|
+
```
|
|
1204
|
+
|
|
1205
|
+
**Event schema:**
|
|
1206
|
+
|
|
1207
|
+
```python
|
|
1208
|
+
@dataclass
|
|
1209
|
+
class Event:
|
|
1210
|
+
event_id: str # UUID
|
|
1211
|
+
event_type: str # Enumerated type (tool_call, message, state_change, error, goal_update)
|
|
1212
|
+
sender_id: str # Sending agent
|
|
1213
|
+
timestamp: float # Unix timestamp
|
|
1214
|
+
scale_level: int # Scale of the originating agent
|
|
1215
|
+
body: dict # Full event content (not read during routing decision)
|
|
1216
|
+
embedding: np.ndarray # Precomputed embedding for world model comparison
|
|
1217
|
+
is_fixed_point: bool # If True, delivered to all agents unmodified
|
|
1218
|
+
```
|
|
1219
|
+
|
|
1220
|
+
**Delivery guarantee:** At-least-once delivery. Events that pass all three CASR stages are queued for delivery. If an agent's queue is full (context window filling), the bus falls back to delivering only fixed-point events until the agent processes its queue.
|
|
1221
|
+
|
|
1222
|
+
---
|
|
1223
|
+
|
|
1224
|
+
## Bloom Filter Specification
|
|
1225
|
+
|
|
1226
|
+
**Construction (offline, per agent role):**
|
|
1227
|
+
|
|
1228
|
+
```
|
|
1229
|
+
Input: Set of (event_type, is_relevant) pairs for this role
|
|
1230
|
+
Output: BloomFilter with target false positive rate p = 0.01
|
|
1231
|
+
|
|
1232
|
+
false_positive_rate = 0.01 (1% of irrelevant events pass the filter)
|
|
1233
|
+
n = number of event types in the system
|
|
1234
|
+
m = -n * ln(p) / (ln(2))^2 (filter size in bits)
|
|
1235
|
+
k = (m/n) * ln(2) (number of hash functions)
|
|
1236
|
+
```
|
|
1237
|
+
|
|
1238
|
+
**At runtime:**
|
|
1239
|
+
```
|
|
1240
|
+
query(event_type) → {DEFINITELY_NOT_RELEVANT, POSSIBLY_RELEVANT}
|
|
1241
|
+
```
|
|
1242
|
+
|
|
1243
|
+
If DEFINITELY_NOT_RELEVANT: drop the event without reading its body.
|
|
1244
|
+
If POSSIBLY_RELEVANT: proceed to scale projection.
|
|
1245
|
+
|
|
1246
|
+
**Staleness mitigation:** Bloom filters are rebuilt at each task phase transition (e.g., when the orchestrator changes the global task state). Between transitions, the filter is immutable.
|
|
1247
|
+
|
|
1248
|
+
**Conservative initialization:** Before any empirical data is collected, initialize the Bloom filter to include all event types (100% pass rate). Refine using empirical footprint estimation once data is available.
|
|
1249
|
+
|
|
1250
|
+
---
|
|
1251
|
+
|
|
1252
|
+
## World Model Specification
|
|
1253
|
+
|
|
1254
|
+
The world model M_i for agent aᵢ is a lightweight model that predicts the next event's embedding given the agent's current context:
|
|
1255
|
+
|
|
1256
|
+
```
|
|
1257
|
+
M_i : (current_context, recent_events) → predicted_event_embedding
|
|
1258
|
+
```
|
|
1259
|
+
|
|
1260
|
+
**Stage 2 implementation (Phase 2+):**
|
|
1261
|
+
- Small transformer (≤7B parameters) or frozen large model with a fine-tuned prediction head
|
|
1262
|
+
- Input: last K events in aᵢ's context, projected to scale sᵢ
|
|
1263
|
+
- Output: predicted embedding of next event in aᵢ's context
|
|
1264
|
+
- Training: minimize L2 distance between predicted and actual event embeddings
|
|
1265
|
+
|
|
1266
|
+
**Surprise computation:**
|
|
1267
|
+
```python
|
|
1268
|
+
def surprise(M_i, event_e):
|
|
1269
|
+
predicted = M_i.predict(current_context)
|
|
1270
|
+
actual = event_e.embedding
|
|
1271
|
+
return kl_divergence(predicted, actual)
|
|
1272
|
+
# or simpler: cosine_distance(predicted, actual)
|
|
1273
|
+
```
|
|
1274
|
+
|
|
1275
|
+
**World model disabled (MVP):** In the MVP, M_i is not trained. Set τᵢ = 0, which delivers all events that pass the Bloom filter. The surprise filter is enabled incrementally in Phase 2.
|
|
1276
|
+
|
|
1277
|
+
---
|
|
1278
|
+
|
|
1279
|
+
## Failure Modes and Mitigations
|
|
1280
|
+
|
|
1281
|
+
| Failure Mode | Cause | Detection | Mitigation |
|
|
1282
|
+
|---|---|---|---|
|
|
1283
|
+
| Missing critical context | Bloom filter false negative (impossible by construction) | N/A | None needed — Bloom filters have no false negatives |
|
|
1284
|
+
| Context starvation | τᵢ too high, world model over-predicts | Agent produces incorrect output while receiving little context | Decrease τᵢ or trigger full-sync |
|
|
1285
|
+
| Bloom filter staleness | New event type introduced after filter construction | Agent fails to respond to new event types | Rebuild filters at phase transitions; default-include unknown event types |
|
|
1286
|
+
| World model drift | Team behavior diverges from training distribution | Surprise distribution shifts systematically | Periodic re-training of M_i on recent event logs |
|
|
1287
|
+
| Scale mismatch | Event from scale-0 agent delivered to scale-4 agent without projection | Scale-4 agent context fills with low-level detail | Scale projection is mandatory for all cross-scale delivery |
|
|
1288
|
+
| Orchestrator overload | All N workers complete simultaneously, flood orchestrator | Orchestrator queue depth spikes | Rate-limit delivery to orchestrator; batch completions within a time window |
|
|
1289
|
+
|
|
1290
|
+
**Full-state synchronization:** Every K rounds (K is a hyperparameter, default 50), each agent receives the unfiltered projection of all current state at its scale, bypassing all CASR filters. This corrects accumulated errors from stale Bloom filters and miscalibrated world models. K should be set to the expected task-phase length.
|
|
1291
|
+
|
|
1292
|
+
---
|
|
1293
|
+
|
|
1294
|
+
## Event Type Registry
|
|
1295
|
+
|
|
1296
|
+
A centralized registry of all event types and their default scale assignments. This is the source of truth for Bloom filter construction.
|
|
1297
|
+
|
|
1298
|
+
```
|
|
1299
|
+
Core event types:
|
|
1300
|
+
|
|
1301
|
+
TOOL_CALL scale=1 (statement level by default)
|
|
1302
|
+
TOOL_RESULT scale=1
|
|
1303
|
+
FILE_EDIT scale=1
|
|
1304
|
+
FILE_CREATE scale=2 (function/module level)
|
|
1305
|
+
TEST_RUN scale=2
|
|
1306
|
+
TEST_RESULT scale=2
|
|
1307
|
+
FUNCTION_COMPLETE scale=2
|
|
1308
|
+
MODULE_COMPLETE scale=3
|
|
1309
|
+
TASK_GOAL_UPDATE scale=4, is_fixed_point=True
|
|
1310
|
+
HARD_CONSTRAINT scale=4, is_fixed_point=True
|
|
1311
|
+
ERROR_UNHANDLED scale=4, is_fixed_point=True (always delivered to all agents)
|
|
1312
|
+
TASK_COMPLETE scale=4, is_fixed_point=True
|
|
1313
|
+
AGENT_SPAWN scale=3
|
|
1314
|
+
AGENT_TERMINATE scale=3
|
|
1315
|
+
MESSAGE_AGENT scale=2 (default; overridden by sender scale)
|
|
1316
|
+
```
|
|
1317
|
+
|
|
1318
|
+
**Custom event types:** Teams can register domain-specific event types with explicit scale assignments and relevance mappings per role.
|
|
1319
|
+
|
|
1320
|
+
---
|
|
1321
|
+
|
|
1322
|
+
## Scaling Characteristics
|
|
1323
|
+
|
|
1324
|
+
| Team Size | History Depth | Naive Context (tokens) | CASR Context (tokens) | Reduction |
|
|
1325
|
+
|-----------|--------------|----------------------|----------------------|-----------|
|
|
1326
|
+
| 5 agents | 50 rounds | ~12,500 | ~2,500 | 5x |
|
|
1327
|
+
| 10 agents | 100 rounds | ~100,000 | ~6,600 | 15x |
|
|
1328
|
+
| 20 agents | 200 rounds | ~800,000 | ~14,600 | 55x |
|
|
1329
|
+
| 50 agents | 500 rounds | ~12,500,000 | ~46,000 | 272x |
|
|
1330
|
+
|
|
1331
|
+
*Estimates based on O(H·log(N)) scaling for CASR vs O(N·H²) for the naive baseline, with k=50 tokens per event and branching factor b=5.*
|
|
1332
|
+
|
|
1333
|
+
These figures are theoretical estimates. Empirical validation is the primary goal of Phase 1 (MVP).
|
|
1334
|
+
|
|
1335
|
+
---
|
|
1336
|
+
|
|
1337
|
+
## Interface with Existing Frameworks
|
|
1338
|
+
|
|
1339
|
+
CASR is designed as a drop-in message bus layer for existing multi-agent frameworks.
|
|
1340
|
+
|
|
1341
|
+
**AutoGen integration:** Replace AutoGen's GroupChat or nested conversation patterns with the CASR event bus. Agent-to-agent messages become events; the bus handles routing.
|
|
1342
|
+
|
|
1343
|
+
**LangGraph integration:** Add a CASR routing layer to each graph edge. Before a LangGraph node receives its input state, run the state update through the CASR pipeline.
|
|
1344
|
+
|
|
1345
|
+
**CrewAI integration:** Intercept the task context assembly step. Instead of assembling full context for each agent, assemble CASR-filtered context.
|
|
1346
|
+
|
|
1347
|
+
The goal is not to replace these frameworks but to add principled context routing as a layer beneath their agent orchestration logic.
|
|
1348
|
+
|
|
1349
|
+
|
|
1350
|
+
---
|
|
1351
|
+
|
|
1352
|
+
## Phase-45 Product Surface (operator entrypoint)
|
|
1353
|
+
|
|
1354
|
+
Phase 45 added a thin orchestration surface on top of the Phase
|
|
1355
|
+
31..44 stack at `vision_mvp/product/`:
|
|
1356
|
+
|
|
1357
|
+
- `vision_mvp/product/profiles.py` — six stable, versioned
|
|
1358
|
+
profiles (`local_smoke`, `bundled_57`, `bundled_57_mock_sweep`,
|
|
1359
|
+
`aspen_mac1_coder`, `aspen_mac2_frontier`, `public_jsonl`).
|
|
1360
|
+
Schema: `phase45.profile.v1`.
|
|
1361
|
+
- `vision_mvp/product/runner.py` — `run_profile(...)` composes
|
|
1362
|
+
readiness → sweep → report. Readiness is a hard gate unless
|
|
1363
|
+
overridden (Theorem P45-2). Real-LLM sweeps are *recorded* as
|
|
1364
|
+
a launch command rather than forked from inside the runner.
|
|
1365
|
+
- `vision_mvp/product/report.py` — summary renderer;
|
|
1366
|
+
reusable on any stored `product_report.json`.
|
|
1367
|
+
- One command:
|
|
1368
|
+
`python3 -m vision_mvp.product --profile <name> --out-dir <d>`
|
|
1369
|
+
|
|
1370
|
+
The product surface adds no new substrate semantics; see
|
|
1371
|
+
`vision_mvp/RESULTS_PHASE45.md` and
|
|
1372
|
+
`docs/context_zero_master_plan.md` §9 for the Finished-Product
|
|
1373
|
+
Checklist and release criteria.
|
|
1374
|
+
|
|
1375
|
+
|
|
1376
|
+
---
|
|
1377
|
+
|
|
1378
|
+
## Phase-46 Boundary Surface (external-exercise readiness)
|
|
1379
|
+
|
|
1380
|
+
Phase 46 adds a boundary layer between the Phase-45 product
|
|
1381
|
+
surface and the outside world:
|
|
1382
|
+
|
|
1383
|
+
- `vision_mvp/product/import_data.py` — `audit_jsonl(...)`:
|
|
1384
|
+
schema classification (native / hermetic / ambiguous /
|
|
1385
|
+
unusable), duplicate-id detection, decode / non-object /
|
|
1386
|
+
empty-bank failure modes, delegated Theorem-P44-3 readiness.
|
|
1387
|
+
CLI exit codes distinguish *missing file* (2) from *blocker*
|
|
1388
|
+
(1) from *clean* (0).
|
|
1389
|
+
- `vision_mvp/product/ci_gate.py` — `evaluate_report(...)` +
|
|
1390
|
+
`aggregate(...)`: five-check CI verdict over one or more
|
|
1391
|
+
`product_report.json` files. Threshold knobs for readiness
|
|
1392
|
+
fraction and per-cell pass@1; profile-whitelist support;
|
|
1393
|
+
machine-readable `phase46.ci_verdict.v1`.
|
|
1394
|
+
- Frontier-model slot: `aspen_mac1_coder_70b` profile +
|
|
1395
|
+
`profiles.model_availability()` declarative check. Runner
|
|
1396
|
+
attaches `model_metadata` to recorded launches so downstream
|
|
1397
|
+
consumers can distinguish *slot_pending_availability* from
|
|
1398
|
+
*assumed_resident* without probing Ollama.
|
|
1399
|
+
|
|
1400
|
+
The boundary layer does not change any programme-internal
|
|
1401
|
+
semantics; see `vision_mvp/RESULTS_PHASE46.md` and
|
|
1402
|
+
`docs/context_zero_master_plan.md` §9.9 for the endogenous /
|
|
1403
|
+
exogenous split.
|
|
1404
|
+
|
|
1405
|
+
|
|
1406
|
+
---
|
|
1407
|
+
|
|
1408
|
+
## Stable-vs-Experimental Boundary (SDK v3.29 / W28)
|
|
1409
|
+
|
|
1410
|
+
As of SDK v3.29 the CoordPy public surface is split into **stable**
|
|
1411
|
+
and **experimental** tiers, named explicitly in
|
|
1412
|
+
`vision_mvp/coordpy/__init__.py`:
|
|
1413
|
+
|
|
1414
|
+
* **Stable surface** (everything in `__all__` *not* in
|
|
1415
|
+
`__experimental__`): the run boundary (`RunSpec`, `run`),
|
|
1416
|
+
capsule primitives (`ContextCapsule`, `CapsuleLedger`,
|
|
1417
|
+
`CapsuleView`, lifecycle audit), provenance, the LLM backend
|
|
1418
|
+
abstraction (`LLMBackend`, `OllamaBackend`,
|
|
1419
|
+
`MLXDistributedBackend`), the team coordination ledger primitives
|
|
1420
|
+
(`capsule_team_handoff`, `capsule_role_view`, `capsule_team_decision`,
|
|
1421
|
+
`T_INVARIANTS`), and the layered API (`CoordPySimpleAPI`,
|
|
1422
|
+
`CoordPyBuilderAPI`, `CoordPyAdvancedAPI`). The W3 capsule contract,
|
|
1423
|
+
the W4 team-lifecycle audit, and the run-boundary product
|
|
1424
|
+
runtime contract are all in the stable surface and are subject
|
|
1425
|
+
to semantic-version compatibility within the 0.5.x line.
|
|
1426
|
+
* **Experimental surface** (`vision_mvp.coordpy.__experimental__`):
|
|
1427
|
+
the dense-control / multi-agent-coordination research line —
|
|
1428
|
+
W22 latent digest, W23 cross-cell delta, W24 session compaction,
|
|
1429
|
+
W25 shared fanout, W26 chain-persisted fanout, W27 multi-chain
|
|
1430
|
+
pivot, W28 ensemble-verified ratification. These symbols may
|
|
1431
|
+
evolve between minor versions; external callers should pin a
|
|
1432
|
+
specific SDK version (`__version__` / `SDK_VERSION`) and watch
|
|
1433
|
+
the CHANGELOG for breaking changes.
|
|
1434
|
+
|
|
1435
|
+
The split is *additive* on the v3.28 surface — every prior
|
|
1436
|
+
exported symbol remains exported; the `__experimental__` tuple
|
|
1437
|
+
is a *marker*, not a removal. External callers depending only on
|
|
1438
|
+
the stable surface should see no behavioural change crossing the
|
|
1439
|
+
v3.28 → v3.29 boundary.
|
|
1440
|
+
|
|
1441
|
+
The stability of the stable surface is mechanically asserted by
|
|
1442
|
+
`test_coordpy_public_api.py`; the experimental surface is asserted
|
|
1443
|
+
by the W22..W28 phase tests.
|