extodan-agentsync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentsync/__init__.py +17 -0
- agentsync/__main__.py +55 -0
- agentsync/demo/langgraph_demo.py +232 -0
- agentsync/harness.py +252 -0
- agentsync/models.py +251 -0
- agentsync/py.typed +0 -0
- agentsync/repro.py +146 -0
- agentsync/store.py +223 -0
- agentsync/strategies/__init__.py +43 -0
- agentsync/strategies/crdt.py +275 -0
- agentsync/strategies/lww.py +115 -0
- agentsync/strategies/transactional.py +209 -0
- agentsync/table.py +54 -0
- agentsync/workloads/__init__.py +102 -0
- extodan_agentsync-0.1.0.dist-info/METADATA +197 -0
- extodan_agentsync-0.1.0.dist-info/RECORD +19 -0
- extodan_agentsync-0.1.0.dist-info/WHEEL +4 -0
- extodan_agentsync-0.1.0.dist-info/entry_points.txt +2 -0
- extodan_agentsync-0.1.0.dist-info/licenses/LICENSE +21 -0
agentsync/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Agent-State Sync Engine — benchmark-first.
|
|
2
|
+
|
|
3
|
+
The thesis under test: for multi-agent MERGEABLE shared state (notes, context,
|
|
4
|
+
structured docs), an event-graph CRDT (eg-walker family, here via Loro) gives
|
|
5
|
+
deterministic coordinator-free convergence with full write attribution at
|
|
6
|
+
ZERO model calls — beating naive last-write-wins (silent corruption) and
|
|
7
|
+
LLM-mediated transactional control (costs inference per conflict).
|
|
8
|
+
|
|
9
|
+
The benchmark is the product. See README for the thesis, build order, and
|
|
10
|
+
results.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
# Keep in sync with the distribution metadata: this wheel is released as 0.1.0.
__version__ = "0.1.0"
|
|
14
|
+
|
|
15
|
+
from .store import SyncedStore
|
|
16
|
+
|
|
17
|
+
__all__ = ["SyncedStore"]
|
agentsync/__main__.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""`python -m agentsync` / `make bench` entry point.
|
|
2
|
+
|
|
3
|
+
Runs every available strategy against every available workload, prints the
comparison table, and exits nonzero if a strategy that is expected to pass
(crdt, transactional) produces a FAIL row — so the benchmark doubles as a
regression gate: e.g. CRDT suddenly losing a write fails the run. LWW rows are
expected to FAIL (that is the known corruption) and do not affect the exit
code.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
from .harness import run_one
|
|
15
|
+
from .strategies import make_strategy
|
|
16
|
+
from .table import render_json, render_table
|
|
17
|
+
from .workloads import all_workloads
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Which strategies to run. lww was wired up first (per the FIRST STEP
# directive); transactional and crdt were added in later build steps, and all
# three now run.
|
|
23
|
+
_AVAILABLE_STRATEGIES = ["lww", "transactional", "crdt"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def main() -> int:
    """Run the full workload × strategy matrix and gate on regressions.

    Every workload is run against every name in ``_AVAILABLE_STRATEGIES``
    (workloads in the outer loop), the comparison table is printed, and the
    exit status is derived from the rows.

    Returns:
        1 if any row of an expected-pass strategy has verdict ``"FAIL"``,
        otherwise 0.
    """
    results = [
        run_one(strategy_name, workload, make_strategy)
        for workload in all_workloads()
        for strategy_name in _AVAILABLE_STRATEGIES
    ]

    print()
    print(render_table(results))
    print()

    # A FAIL verdict describes the STRATEGY, not the harness. LWW failing is
    # the demonstrated baseline corruption, so it never affects the exit code.
    # Only the strategies listed here are required to pass every workload.
    expected_pass = {"crdt", "transactional"}
    regressions = []
    for row in results:
        if row.strategy in expected_pass and row.verdict == "FAIL":
            regressions.append(row)

    if not regressions:
        print("✓ benchmark complete; LWW failures above are the demonstrated baseline.")
        return 0

    print(f"⚠ {len(regressions)} expected-pass strategy regressed — see notes.")
    return 1
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
if __name__ == "__main__":
|
|
55
|
+
sys.exit(main())
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""LangGraph demo — the asset to send, not plumbing to polish.
|
|
2
|
+
|
|
3
|
+
The 60-second pitch: watch LWW silently corrupt an agent's shared state, then
|
|
4
|
+
watch CRDT not. The SAME two-agent graph runs twice — once with an LWW store
|
|
5
|
+
backend, once with a CRDT store backend — and the demo prints exactly what each
|
|
6
|
+
agent wrote, what survived the merge, and (for CRDT) what got escalated.
|
|
7
|
+
|
|
8
|
+
Why each agent writes to its OWN replica then syncs (rather than both writing
|
|
9
|
+
one shared dict): that's how real multi-agent shared memory actually works.
|
|
10
|
+
Each subagent edits a local copy of the context and the sync layer reconciles.
|
|
11
|
+
langgraph 1.x raises ``InvalidUpdateError`` if two parallel nodes write the same
|
|
12
|
+
state key with no reducer — so we bypass state for the shared memory itself and
|
|
13
|
+
use the graph purely to model the *concurrency* (two agents acting at once),
|
|
14
|
+
which is the honest mapping.
|
|
15
|
+
|
|
16
|
+
Demo runs two scenarios back to back:
|
|
17
|
+
|
|
18
|
+
* ``clean_merge`` — both agents add distinct findings to shared notes/tags.
|
|
19
|
+
LWW drops one agent's findings; CRDT keeps both. Convergence-without-loss.
|
|
20
|
+
* ``conflict`` — both agents set the same scalar to different values.
|
|
21
|
+
LWW silently picks one; CRDT escalates with both contenders attributed.
|
|
22
|
+
|
|
23
|
+
The point is visual: same agents, same writes, opposite outcomes by backend.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from dataclasses import dataclass
|
|
29
|
+
from typing import Callable
|
|
30
|
+
|
|
31
|
+
from langgraph.graph import END, START, StateGraph
|
|
32
|
+
from typing_extensions import TypedDict
|
|
33
|
+
|
|
34
|
+
from ..models import FieldKind, MergeStrategy, Write
|
|
35
|
+
from ..strategies import make_strategy
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GraphState(TypedDict):
    """Per-agent scratch passed through the graph. The shared memory itself
    lives in the Store (one replica per agent), NOT in graph state — otherwise
    langgraph's reducer guard would intercept the very concurrency we want to
    measure."""
    # Seeded by _run_scenario with the first agent id. The node functions built
    # by _agent_node return an empty update and never read this key.
    agent_id: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class StoreHandle:
    """An agent's private view of shared memory.

    Bundles the agent's own replica with a counter used to number its writes.
    Writes land only on this replica; reconciling replicas with each other is
    done by the caller after the agents have run.
    """

    # Identifier stamped onto every Write issued through this handle.
    agent_id: str
    # The strategy instance that receives this agent's writes.
    replica: MergeStrategy
    # op_id for the next write; advanced after each successful apply.
    _op: int = 0

    def write(self, field: str, value, kind: FieldKind = FieldKind.scalar) -> None:
        """Apply one write to this agent's replica and advance the op counter.

        Args:
            field: Name of the field being written.
            value: Value to write.
            kind: Field kind carried on the Write; defaults to a scalar.
        """
        record = Write(
            agent_id=self.agent_id,
            op_id=self._op,
            field=field,
            value=value,
            kind=kind,
        )
        self.replica.apply(record)
        # Only reached when apply() did not raise, so a failed write does not
        # consume an op id.
        self._op += 1
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _agent_node(agent_id: str, field: str, value, kind: FieldKind):
    """Return a graph node function that performs one write for ``agent_id``.

    The agent id, field, value and kind are captured by the closure when the
    node is built. The incoming graph state is ignored: the write goes to the
    handle registered under ``agent_id`` in ``_HANDLES``, which is looked up
    when the node runs.

    Returns:
        A callable taking the graph state and returning an empty state update.
    """

    def node(state: GraphState) -> dict:
        # Look the handle up at call time so the node uses whatever handle is
        # registered for this agent when the graph is invoked.
        _HANDLES[agent_id].write(field, value, kind)
        return {}

    return node
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Module-level registry so node closures can find their handle without it being
|
|
89
|
+
# serializable graph state. Reset per-run by _run_scenario.
|
|
90
|
+
_HANDLES: dict[str, StoreHandle] = {}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _make_handles(strategy_name: str, agent_ids: list[str]) -> None:
    """Rebuild the module-level handle registry for a new run.

    Empties ``_HANDLES`` and registers, for each agent id, a fresh
    ``StoreHandle`` with its own replica created by ``make_strategy``.
    """
    _HANDLES.clear()
    for agent in agent_ids:
        replica = make_strategy(strategy_name)
        _HANDLES[agent] = StoreHandle(agent_id=agent, replica=replica)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _run_scenario(strategy_name: str, agent_work: dict[str, list[tuple]]) -> dict:
    """Drive one backend through the graph and report the merged outcome.

    Args:
        strategy_name: Backend name passed on to ``_make_handles``.
        agent_work: Maps agent_id -> list of ``(field, value, kind)`` writes.

    Returns:
        A dict with ``final_state`` (the first agent's finalized replica
        state), ``escalations`` (a list of ``{"field", "contenders"}`` dicts
        read from the first replica's ``_escalations`` attribute, empty if the
        attribute is missing or falsy) and ``strategy`` (the backend name).
    """
    agent_ids = list(agent_work.keys())
    _make_handles(strategy_name, agent_ids)

    # Phase 1 — build the graph. Every agent gets its own chain hanging off
    # START: one node per write, linked in order, with the tail wired to END.
    graph = StateGraph(GraphState)
    for aid, work_items in agent_work.items():
        tail = START
        for step, (field, value, kind) in enumerate(work_items):
            name = f"{aid}_{step}"
            graph.add_node(name, _agent_node(aid, field, value, kind))
            graph.add_edge(tail, name)
            tail = name
        graph.add_edge(tail, END)

    # The initial state only carries the first agent id. The nodes do not read
    # it; each one finds its handle through the id bound into its closure.
    graph.compile().invoke({"agent_id": agent_ids[0]})

    # Phase 2 — sync. Full mesh: every replica imports every other replica's
    # exported state, in registry order.
    handles = [_HANDLES[aid] for aid in agent_ids]
    for receiver in handles:
        for sender in handles:
            if sender is not receiver:
                receiver.replica.import_state(sender.replica.export_state())

    # Read out from the first agent's replica.
    primary = handles[0].replica
    final = primary.finalized_state()
    pending = getattr(primary, "_escalations", None)
    if pending:
        escalations = [
            {
                "field": esc.field,
                "contenders": [
                    {"agent": c.agent_id, "op": c.op_id, "value": c.value}
                    for c in esc.contenders
                ],
            }
            for esc in pending
        ]
    else:
        escalations = []
    return {"final_state": final, "escalations": escalations, "strategy": strategy_name}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# The two scenarios. Mirrored from the benchmark workloads so the demo and the
|
|
154
|
+
# table tell the same story.
|
|
155
|
+
# ---------------------------------------------------------------------------
|
|
156
|
+
|
|
157
|
+
_CLEAN_MERGE = {
|
|
158
|
+
"researcher": [
|
|
159
|
+
("project", "AgentSync", FieldKind.scalar),
|
|
160
|
+
("findings", "found 3 CRDT papers", FieldKind.append_text),
|
|
161
|
+
("tags", {"crdt", "agents"}, FieldKind.grow_set),
|
|
162
|
+
],
|
|
163
|
+
"writer": [
|
|
164
|
+
("findings", "drafted intro section", FieldKind.append_text),
|
|
165
|
+
("tags", {"benchmark"}, FieldKind.grow_set),
|
|
166
|
+
],
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
_CONFLICT = {
|
|
170
|
+
"researcher": [("status", "draft", FieldKind.scalar)],
|
|
171
|
+
"writer": [("status", "published", FieldKind.scalar)],
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _expected_clean() -> set:
    """Return the tag set a lossless merge of ``_CLEAN_MERGE`` should contain.

    A new set is built on every call, so callers may mutate the result.
    """
    expected_tags = {"agents", "benchmark", "crdt"}
    return expected_tags
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def main() -> int:
    """Print the demo: each scenario under both backends, then a verdict.

    For every scenario the agents' writes are listed, then the scenario is run
    once per backend and the merged state and any escalations are printed. The
    closing VERDICT section re-runs the scenarios and prints a tag comparison
    for the clean merge and the ``status`` outcome for the conflict.

    Returns:
        Always 0.
    """
    heavy_rule = "=" * 72
    light_rule = "-" * 72
    backends = ("lww", "crdt")

    print()
    print(heavy_rule)
    print("AGENTSYNC LANGGRAPH DEMO — same graph, two shared-memory backends")
    print(heavy_rule)

    scenarios = [
        ("SCENARIO 1 — clean merge (mergeable concurrent writes)", _CLEAN_MERGE),
        ("SCENARIO 2 — semantic conflict (same scalar, different values)", _CONFLICT),
    ]
    for label, work in scenarios:
        print()
        print(light_rule)
        print(label)
        print(light_rule)
        for aid, writes in work.items():
            # Show only (field, value); the field kind is not printed.
            contributions = [(f, v) for f, v, _ in writes]
            print(f" {aid} writes: {contributions}")

        for strat in backends:
            res = _run_scenario(strat, work)
            print(f"\n [{strat}] merged state: {res['final_state']}")
            if not res["escalations"]:
                print(f" [{strat}] no escalations")
                continue
            print(f" [{strat}] ESCALATED (flagged, not auto-resolved):")
            for esc in res["escalations"]:
                print(f" field={esc['field']!r} contenders={esc['contenders']}")

    # Verdict section: make the corruption-vs-convergence contrast explicit.
    print()
    print(heavy_rule)
    print("VERDICT")
    print(heavy_rule)

    lww_clean = _run_scenario("lww", _CLEAN_MERGE)
    crdt_clean = _run_scenario("crdt", _CLEAN_MERGE)
    lww_tags = set(lww_clean["final_state"].get("tags", []))
    crdt_tags = set(crdt_clean["final_state"].get("tags", []))
    expected = _expected_clean()
    print(f" clean merge — expected tags: {sorted(expected)}")
    for display, kept in (("LWW", lww_tags), ("CRDT", crdt_tags)):
        if kept == expected:
            status = "OK"
        else:
            status = "LOST " + str(sorted(expected - kept))
        print(f" {display} kept tags: {sorted(kept)} -> {status}")

    crdt_conflict = _run_scenario("crdt", _CONFLICT)
    lww_conflict = _run_scenario("lww", _CONFLICT)
    print("\n semantic conflict on 'status':")
    lww_status = lww_conflict["final_state"].get("status")
    lww_flagged = bool(lww_conflict["escalations"])
    print(f" LWW -> status={lww_status!r}, escalated: {lww_flagged} (silently picked)")
    crdt_status = crdt_conflict["final_state"].get("status")
    crdt_flagged = bool(crdt_conflict["escalations"])
    print(f" CRDT -> status={crdt_status!r}, escalated: {crdt_flagged} (flagged for review)")
    print()
    return 0
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
if __name__ == "__main__":
|
|
232
|
+
raise SystemExit(main())
|
agentsync/harness.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""The three-way benchmark harness — the core deliverable.
|
|
2
|
+
|
|
3
|
+
For each workload × strategy:
|
|
4
|
+
|
|
5
|
+
1. Create one strategy replica per agent in the workload.
|
|
6
|
+
2. Apply that agent's local writes to its own replica (the "concurrent" phase —
|
|
7
|
+
replicas never see each other here).
|
|
8
|
+
3. Merge every replica into every other (full-mesh import/export), so each
|
|
9
|
+
replica ends up holding the union. This is the sync phase.
|
|
10
|
+
4. Measure: convergence (all replicas identical?), writes lost vs. seen,
|
|
11
|
+
attribution completeness, model calls, wall-clock, peak memory.
|
|
12
|
+
5. Score a PASS/FAIL verdict from the workload's expectation.
|
|
13
|
+
|
|
14
|
+
Strategies are pluggable via :data:`STRATEGIES`; the harness never branches on
|
|
15
|
+
strategy identity, so adding a fourth is a one-line registry change.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import time
|
|
21
|
+
from dataclasses import asdict
|
|
22
|
+
from typing import Callable
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import psutil
|
|
26
|
+
_HAS_PSUTIL = True
|
|
27
|
+
_proc = psutil.Process()
|
|
28
|
+
except ImportError: # pragma: no cover - bench extra is optional
|
|
29
|
+
_HAS_PSUTIL = False
|
|
30
|
+
_proc = None # type: ignore[assignment]
|
|
31
|
+
|
|
32
|
+
from .models import EndState, FieldKind, MergeStrategy, RunResult, Workload
|
|
33
|
+
from .strategies import make_strategy
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _peak_mem_kb() -> float:
    """Return this process's resident set size in KiB, or NaN without psutil.

    Despite the name, this reads the RSS at the moment of the call; callers
    combine several samples to approximate a peak.
    """
    if _proc is not None:
        # memory_info().rss is reported in bytes.
        return _proc.memory_info().rss / 1024.0
    return float("nan")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def run_one(
    strategy_name: str,
    workload: Workload,
    strategy_factory: Callable[[str], MergeStrategy] = make_strategy,
) -> RunResult:
    """Benchmark one strategy on one workload and return its result row.

    One replica is created per key of ``workload.writes_by_replica``. Each
    replica first applies only its own writes, then imports the exported state
    of every other replica (N*(N-1) imports for N replicas). The merged state
    is finally scored against ``workload.expectation``.

    Args:
        strategy_name: Name handed to ``strategy_factory`` for every replica.
        workload: Supplies the per-replica writes, a name and an expectation.
        strategy_factory: Builds a fresh replica from a strategy name.

    Returns:
        A ``RunResult`` carrying convergence, loss, attribution, escalation,
        model-call, latency, memory, outcome and verdict for this run.

    Raises:
        IndexError: If the workload defines no replicas, since scoring reads
            the first replica's state and metrics.
    """
    # The timer covers replica construction, both phases and state read-out.
    started = time.perf_counter()
    mem_before = _peak_mem_kb()

    replica_ids = list(workload.writes_by_replica.keys())
    replicas: dict[str, MergeStrategy] = {}
    for rid in replica_ids:
        replicas[rid] = strategy_factory(strategy_name)

    # Phase 1 — isolated local edits: each replica sees only its own writes.
    for rid, replica in replicas.items():
        for write in workload.writes_by_replica[rid]:
            replica.apply(write)

    # Phase 2 — full-mesh sync: every replica pulls every other one's state.
    for rid in replica_ids:
        receiver = replicas[rid]
        for other in replica_ids:
            if other != rid:
                receiver.import_state(replicas[other].export_state())

    states = {rid: replica.finalized_state() for rid, replica in replicas.items()}
    metrics = {rid: replica.metrics() for rid, replica in replicas.items()}

    latency_ms = (time.perf_counter() - started) * 1000.0
    peak_mem_kb = max(_peak_mem_kb(), mem_before)

    # Converged means every replica renders to the same sorted-items string.
    fingerprints = {str(sorted(state.items())) for state in states.values()}
    converged = len(fingerprints) == 1

    # --- Correctness scoring against the workload's expectation ---
    # The first replica stands in for the merged result.
    first_id = replica_ids[0]
    sample_metrics = metrics[first_id]
    # Loss is judged from the merged state rather than from apply counters:
    # a local apply always succeeds, so only the post-merge state can reveal
    # that a later import overwrote a contribution.
    writes_lost = _count_lost_writes(states[first_id], workload)
    escalations = getattr(sample_metrics, "escalations", 0)
    attribution_complete = _check_attribution(workload, replicas, sample_metrics)

    # Outcome is derived from measurements, in priority order:
    #   1. corrupted  — a write was lost, or the workload is not a clean merge
    #                   and the strategy raised no escalation at all.
    #   2. conflict   — the workload is not a clean merge and it was signalled:
    #                   use the replica's ``conflict_mode`` if it has one,
    #                   falling back to escalated.
    #   3. auto_merged — clean merge with nothing lost.
    had_conflict = not workload.expectation.clean_merge
    if writes_lost > 0 or (had_conflict and escalations == 0):
        outcome = EndState.corrupted
    elif not had_conflict:
        outcome = EndState.auto_merged
    else:
        outcome = getattr(replicas[first_id], "conflict_mode", EndState.escalated)

    verdict, fail_notes = _score(
        workload, converged, writes_lost, escalations, attribution_complete
    )

    return RunResult(
        strategy=strategy_name,
        workload=workload.name,
        converged=converged,
        writes_lost=writes_lost,
        attribution_complete=attribution_complete,
        escalations=escalations,
        model_calls=sample_metrics.model_calls,
        latency_ms=latency_ms,
        peak_mem_kb=peak_mem_kb,
        outcome=outcome,
        verdict=verdict,
        notes=list(fail_notes),
    )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _count_lost_writes(converged_state: dict, workload: Workload) -> int:
    """Count contributions missing from the merged state.

    Writes are grouped by field, and the kind of the first write seen for a
    field decides how that field is checked:

    * ``grow_set`` — the expected value is the union of every written value;
      each expected element absent from the merged value counts as one loss.
      A merged value that is not a list, tuple or set is treated as empty.
    * ``append_text`` — each written fragment must occur somewhere in the
      merged value (order is not checked); a missing field loses them all.
    * anything else (scalars) — never counted here; conflicts on those fields
      are scored through escalations instead.

    Args:
        converged_state: Merged field -> value mapping to inspect.
        workload: Source of the issued writes (``writes_by_replica``).

    Returns:
        Total number of missing contributions across all fields.
    """
    # Group every issued write under the field it targets.
    contributions: dict[str, list] = {}
    for replica_writes in workload.writes_by_replica.values():
        for w in replica_writes:
            if w.field not in contributions:
                contributions[w.field] = []
            contributions[w.field].append(w)

    lost = 0
    for field, writes in contributions.items():
        kind = writes[0].kind
        actual = converged_state.get(field)
        if kind is FieldKind.grow_set:
            expected = set().union(*(w.value for w in writes))
            if isinstance(actual, (list, tuple, set)):
                present = set(actual)
            else:
                present = set()
            lost += len(expected - present)
        elif kind is FieldKind.append_text:
            if actual is None:
                lost += len(writes)
            else:
                # Concurrent appends have no defined order, so containment is
                # the only requirement.
                lost += sum(w.value not in actual for w in writes)
    return lost
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _check_attribution(
    workload: Workload, replicas: dict[str, MergeStrategy], metrics
) -> bool:
    """Report whether the replicas expose at least one attribution per write.

    This is a count-based approximation. The distinct
    ``(agent_id, op_id, field)`` triples issued by the workload are compared
    in number against the distinct ``(agent_id, op_id, key)`` triples gathered
    from every replica that has an ``attribution()`` method. A strategy that
    attributes per field yields fewer entries than writes when two agents
    write the same field, and is therefore reported as incomplete.

    Args:
        workload: Source of the issued writes (``writes_by_replica``).
        replicas: Replica id -> strategy instance; those without an
            ``attribution`` attribute are skipped.
        metrics: Accepted for signature compatibility; not used.

    Returns:
        True if the number of attribution entries is at least the number of
        issued writes. With no attribution entries at all, True only when the
        workload issued no writes.
    """
    issued = {
        (w.agent_id, w.op_id, w.field)
        for replica_writes in workload.writes_by_replica.values()
        for w in replica_writes
    }

    surviving = set()
    for replica in replicas.values():
        lookup = getattr(replica, "attribution", None)
        if lookup is None:
            continue
        surviving.update(
            (meta["agent_id"], meta["op_id"], key)
            for key, meta in lookup().items()
        )

    if surviving:
        return len(surviving) >= len(issued)
    # Nothing attributed anywhere: complete only if nothing was written.
    return not issued
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _score(
    workload: Workload,
    converged: bool,
    writes_lost: int,
    escalations: int,
    attribution_complete: bool,
) -> tuple[str, list[str]]:
    """Judge measured metrics against ``workload.expectation``.

    Args:
        workload: Supplies the expectation (``all_writes_survive``,
            ``clean_merge``, ``semantic_conflict_on``).
        converged: Whether all replicas ended in the same state.
        writes_lost: Number of contributions missing from the merged state.
        escalations: Number of escalations the strategy reported.
        attribution_complete: Whether attribution was judged complete.

    Returns:
        ``("PASS", [])`` when no rule is violated, otherwise ``"FAIL"`` with
        one note per violated rule, in the order the rules are listed below.
    """
    exp = workload.expectation
    # Each entry pairs "this rule was violated" with the note to report.
    checks = [
        (not converged, "replicas diverged (no convergence)"),
        (
            exp.all_writes_survive and writes_lost > 0,
            f"lost {writes_lost} write(s) on a mergeable workload",
        ),
        (
            exp.clean_merge and escalations > 0,
            f"{escalations} spurious escalation(s) on a clean merge",
        ),
        (
            # One escalation is required per field expected to conflict.
            exp.semantic_conflict_on
            and escalations < len(exp.semantic_conflict_on),
            f"expected escalation on {exp.semantic_conflict_on}, "
            f"got {escalations}",
        ),
        (
            exp.all_writes_survive and not attribution_complete,
            "attribution incomplete (a surviving write lost its agent)",
        ),
    ]
    notes = [message for violated, message in checks if violated]
    return ("FAIL" if notes else "PASS"), notes
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
__all__ = ["run_one"]
|