nexo-brain 7.32.0 → 7.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +1 -1
- package/package.json +1 -1
- package/src/consolidation_prep.py +380 -0
- package/src/db/__init__.py +5 -1
- package/src/db/_episodic.py +32 -0
- package/src/db/_memory_v2.py +276 -0
- package/src/db/_protocol.py +35 -0
- package/src/db/_schema.py +207 -0
- package/src/hooks/auto_capture.py +60 -24
- package/src/learning_resolver.py +42 -0
- package/src/local_context/api.py +237 -33
- package/src/local_context/db.py +3 -2
- package/src/local_context/usage_events.py +2 -0
- package/src/memory_retrieval.py +96 -7
- package/src/message_batch_preview.py +290 -0
- package/src/plugins/protocol.py +218 -27
- package/src/ppr.py +473 -0
- package/src/pre_answer_router.py +316 -3
- package/src/pre_answer_runtime.py +156 -1
- package/src/resolution_cache.py +1119 -0
- package/src/scripts/deep-sleep/apply_findings.py +86 -9
- package/src/scripts/deep-sleep/rewrite.py +625 -0
- package/src/scripts/nexo-deep-sleep.sh +10 -0
- package/src/scripts/nexo-followup-runner.py +110 -8
- package/src/scripts/nexo-morning-agent.py +43 -2
- package/src/scripts/nexo-postmortem-consolidator.py +44 -1
- package/src/self_error_detector.py +414 -0
- package/src/semantic_layers.py +30 -3
- package/templates/core-prompts/morning-agent.md +3 -0
- package/templates/core-prompts/postmortem-consolidator.md +29 -2
|
@@ -0,0 +1,1119 @@
|
|
|
1
|
+
"""Working-memory / resolution cache for the pre-answer router and repo maps.
|
|
2
|
+
|
|
3
|
+
This is the read/write side of the dead-but-already-computed cache key that
|
|
4
|
+
``pre_answer_runtime.select_budget_policy`` already produces (``route_cache_key``
|
|
5
|
+
+ ``cache_ttl_seconds``). The answer-path logged those for telemetry but nobody
|
|
6
|
+
read them; this module wires the ``get``/``set`` so a freshly resolved answer is
|
|
7
|
+
reused instead of re-running the whole router on the next identical question
|
|
8
|
+
within the TTL window.
|
|
9
|
+
|
|
10
|
+
Design (Francisco's brief): "when I mention project X or ask what you know about
|
|
11
|
+
María, don't re-search from zero if I just resolved it; only re-check if >X hours
|
|
12
|
+
passed, the source changed, or something relevant changed."
|
|
13
|
+
|
|
14
|
+
This module is a deliberately REDUCED copy of the proven ``semantic_layers``
|
|
15
|
+
pattern (``_source_fingerprint`` / ``source_version_for``), NOT a reinvention.
|
|
16
|
+
It is non-authoritative: diary / workflows / tasks / evidence / memory /
|
|
17
|
+
learnings / change_log and the git repos remain canonical. We only cache the
|
|
18
|
+
FINAL organized retrieval result.
|
|
19
|
+
|
|
20
|
+
ANTI-STALE RULE OF GOLD — a HIT is valid only if ALL hold:
|
|
21
|
+
1. now() < expires_at (TTL ceiling — cheap fast-fail)
|
|
22
|
+
2. status == 'fresh'
|
|
23
|
+
3. GLOBAL change_watermark == stored (any change_log mutation → MISS)
|
|
24
|
+
4. CONTENT SNAPSHOT matches: every consulted row, re-read by id, has the same
|
|
25
|
+
cheap version it had at write time. If ANY row changed, disappeared, or its
|
|
26
|
+
version cannot be read → MISS. This is the PRIMARY freshness guarantee.
|
|
27
|
+
If ANY fails → MISS → normal route runs, rewrites and re-caches. We never serve
|
|
28
|
+
something that could be stale. The ``instant`` tier (ttl=0) never caches.
|
|
29
|
+
The ``set`` happens after the FINAL route (including escalation), never mid-way.
|
|
30
|
+
|
|
31
|
+
WHY A CONTENT SNAPSHOT (and not just a fingerprint over proxy signals).
|
|
32
|
+
Earlier rounds derived freshness from PROXY signals: a single
|
|
33
|
+
``source_fingerprint`` digest from ``semantic_layers.source_version_for`` plus
|
|
34
|
+
the global ``change_log`` watermark. Both have blind spots for the refs the
|
|
35
|
+
pre-answer router actually emits. The router's ``evidence_refs`` are
|
|
36
|
+
``{source_name}:{id}`` with ``source_name`` ∈ {followups, reminders,
|
|
37
|
+
protocol_tasks, commitments, memory, recent_context, workflows, evidence_ledger,
|
|
38
|
+
causal_graph, learning, local_asset, hot_context, change_log, …} — the literal
|
|
39
|
+
SOURCE NAMES, often PLURAL. ``source_version_for`` keys off CANONICAL,
|
|
40
|
+
mostly-singular prefixes (``followup:``), so ``followups:NF-X`` fell through to
|
|
41
|
+
an ``unsupported`` namespace → empty version → an inert fingerprint. And the
|
|
42
|
+
stores behind those names (followups, reminders, workflows, commitments) are
|
|
43
|
+
mutated by tools that do NOT write ``change_log`` (e.g. ``followup_complete`` is
|
|
44
|
+
a plain UPDATE), so the global watermark does not move either. Net result: a
|
|
45
|
+
``followups:NF-X status=open`` answer was served stale after the followup was
|
|
46
|
+
completed. Verified, reproducible (``test_H*``).
|
|
47
|
+
|
|
48
|
+
The fix stops depending on proxies. ``_SOURCE_VERSIONERS`` is an EXPLICIT map
|
|
49
|
+
``source_name → (reads the concrete row, returns a cheap version)`` covering
|
|
50
|
+
EVERY source_name the router emits. ``set()`` captures a snapshot
|
|
51
|
+
``{ref: version}`` of the REAL rows; ``is_valid()`` re-reads those exact rows by
|
|
52
|
+
id (cheap, indexed lookups — never a full retrieval) and compares. The version
|
|
53
|
+
is whatever changes when the material content changes: a followup's
|
|
54
|
+
``status``/``updated_at``, a learning's ``content``, a local asset's
|
|
55
|
+
fingerprint, etc. Conditions (1)–(3) remain as cheap pre-checks; condition (4)
|
|
56
|
+
is the truth.
|
|
57
|
+
|
|
58
|
+
Condition (3) compares against the GLOBAL watermark (``get_change_watermark``
|
|
59
|
+
with sid=None), never the entry's sid-scoped one. Under the NEXO identity model
|
|
60
|
+
("if another terminal did X, I did X") a mutation landed by a DIFFERENT session
|
|
61
|
+
MUST invalidate this session's cache. The entry's ``sid`` scopes only the cache
|
|
62
|
+
KEY (so session A never serves session B's answer — enforced in ``get`` via
|
|
63
|
+
``expected_sid``), never the freshness.
|
|
64
|
+
|
|
65
|
+
CONSERVATIVE WRITE GATE: an answer is cacheable only if EVERY evidence ref has a
|
|
66
|
+
freshness handle — i.e. its ``source_name`` is in ``_SOURCE_VERSIONERS`` (or it
|
|
67
|
+
resolves, via the canonical resolver, to a versioned/missing-marked row), OR it
|
|
68
|
+
is one of the router's synthetic inline markers whose freshness legitimately
|
|
69
|
+
rides the global watermark (``filesystem:inline`` / ``recent_context:inline`` /
|
|
70
|
+
``commitments:text`` / ``kg:node:…``). A ref backed by NO trackable handle
|
|
71
|
+
(``project_atlas`` JSON, ``doc``/``spec``/``commit``/``audit``/``release``, an
|
|
72
|
+
unknown source_name, or a positional row we cannot identify) makes the whole
|
|
73
|
+
answer un-cacheable: ``set()`` refuses it with ``reason='untrackable_source'``.
|
|
74
|
+
Better an extra MISS than a stale HIT. The repo map carries its OWN handle
|
|
75
|
+
(``git_head`` + 24h TTL) and opts out via ``require_trackable_refs=False``.
|
|
76
|
+
"""
|
|
77
|
+
|
|
78
|
+
from __future__ import annotations
|
|
79
|
+
|
|
80
|
+
import hashlib
|
|
81
|
+
import json
|
|
82
|
+
import sqlite3
|
|
83
|
+
import threading
|
|
84
|
+
import time
|
|
85
|
+
from typing import Any
|
|
86
|
+
|
|
87
|
+
import db
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# This module exposes a public ``set(cache_key, result, ...)`` writer (the
# documented cache API), which shadows the builtin ``set`` inside the module
# scope. Capture the builtin once so internal code can still use it safely.
# The capture happens here, before any later module-level ``set`` definition
# rebinds the name.
_builtin_set = set


# Version tag for this cache policy.
# NOTE(review): where this tag is stored/compared is not visible in this chunk.
POLICY_VERSION = "resolution_cache_v1"

# Intents that depend on the live session / who-said-what. Never cache them
# without scoping to the sid, otherwise one session could serve another
# session's "what did I just do" answer (cross-session leak).
SESSION_SCOPED_INTENTS = {"prior_work", "identity_authorship", "live_state_claim"}

# Module-level lock: the pre-answer executor is shared and MCP/CLI can hit the
# fast-path concurrently. SQLite is in WAL, but the get→validate→bump-hit and
# set sequences are guarded so two callers cannot corrupt hit_count / race a
# write. Small, but mandatory. It is an RLock, so the holding thread may
# re-acquire it.
_LOCK = threading.RLock()

# Maintenance: the cache table must not grow without bound. We prune expired
# rows opportunistically on writes, throttled so it is not an extra DELETE on
# every single ``set``. ``_PRUNE_EVERY`` writes between prunes; ``_MAX_ROWS`` is
# a hard backstop that trims the oldest expired/stale rows if the table balloons
# (e.g. a burst of distinct keys) before the throttle window elapses.
_PRUNE_EVERY = 50
_MAX_ROWS = 5000
# Write counter for the prune throttle; starts at zero on import.
# NOTE(review): the code that increments/resets it is not visible in this chunk.
_writes_since_prune = 0
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _conn() -> sqlite3.Connection:
    """Return the SQLite connection handed out by ``db.get_db()``."""
    connection = db.get_db()
    return connection
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _now() -> float:
    """Return the current epoch time as a float.

    ``db.now_epoch()`` is tried first; if calling it or converting its result
    raises any exception, the wall clock (``time.time()``) is used instead.
    """
    try:
        stamp = db.now_epoch()
        return float(stamp)
    except Exception:
        # Best-effort: fall through to the local clock.
        pass
    return time.time()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _table_ready(conn: sqlite3.Connection) -> bool:
    """Report whether a ``resolution_cache`` table exists on ``conn``.

    The check is one lookup in ``sqlite_master``. Any exception raised while
    querying is reported as "not ready" (``False``) instead of propagating.
    """
    probe = "SELECT name FROM sqlite_master WHERE type='table' AND name='resolution_cache' LIMIT 1"
    try:
        found = conn.execute(probe).fetchone()
    except Exception:
        return False
    return bool(found)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _change_watermark(sid: str | None = None) -> int:
    """Return ``db.get_change_watermark(sid)`` coerced to ``int``.

    ``sid`` is passed through unchanged (``None`` by default). If the lookup or
    the integer conversion raises, ``0`` is returned instead.
    """
    try:
        raw_mark = db.get_change_watermark(sid)
        return int(raw_mark)
    except Exception:
        return 0
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _parse_refs(value: Any) -> list[str]:
    """Coerce evidence/source refs into a clean, de-duplicated list.

    A lone string is treated as a one-element list. Anything that is not a
    ``list``/``tuple``/``set``/``frozenset`` yields ``[]``. Each item is
    stringified and stripped; falsy items and items that strip to an empty
    string are dropped.

    The result keeps FIRST-SEEN order — it is NOT sorted. Callers that need a
    canonical order (for example before hashing refs that arrived as a set)
    must sort the result themselves.
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple, _builtin_set, frozenset)):
        return []
    # A dict gives O(1) membership while preserving insertion order, replacing
    # the quadratic ``clean not in refs`` list scan.
    unique: dict[str, None] = {}
    for item in value:
        clean = str(item or "").strip()
        if clean:
            unique.setdefault(clean, None)
    return list(unique)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ── Per-row content versioners (the real anti-stale map) ─────────────────────
|
|
162
|
+
#
|
|
163
|
+
# Maps the pre-answer router's SOURCE-NAME prefixes (as emitted by
|
|
164
|
+
# ``pre_answer_router._rows_result`` and the inline ``evidence_refs=[...]``
|
|
165
|
+
# branches) to a cheap, real "version" read from the concrete row. The version
|
|
166
|
+
# is a small hash over the columns that CHANGE when the material content changes
|
|
167
|
+
# (status, updated_at, content, fingerprint…). If the row is gone we return the
|
|
168
|
+
# MISSING sentinel so a deletion still changes the snapshot. ``None`` means "no
|
|
169
|
+
# handle for this ref" → the write gate refuses to cache (untrackable_source).
|
|
170
|
+
#
|
|
171
|
+
# Each entry is (table, id_column, version_columns). The id in the ref is looked
|
|
172
|
+
# up by ``id_column`` — a single indexed SELECT, never a retrieval. This is the
|
|
173
|
+
# explicit map the brief requires: it covers EVERY source_name the router emits,
|
|
174
|
+
# so freshness no longer rides proxy signals (namespace classification +
|
|
175
|
+
# watermark) that miss these stores.
|
|
176
|
+
|
|
177
|
+
# Marker substring for "no row right now". ``_resolves_to_real_row`` rejects
# any version string that contains it.
_MISSING = "__missing__"
|
|
178
|
+
|
|
179
|
+
# A row-version resolution can land in three states, and the fail-closed
|
|
180
|
+
# invariant treats each differently:
|
|
181
|
+
#
|
|
182
|
+
# * a real version string → the ref resolves to a REAL row whose content we
|
|
183
|
+
# hashed (the only thing safe to cache on);
|
|
184
|
+
# * ``_MISSING`` → the table/id-column are correct but NO row exists
|
|
185
|
+
# right now. On READ this is the deletion signal (real→missing = MISS); on
|
|
186
|
+
# WRITE it means there is no real row backing the answer, so the write gate
|
|
187
|
+
# must REFUSE (untrackable) — never cache an answer about a row that is not
|
|
188
|
+
# there;
|
|
189
|
+
# * ``None`` → the ref shape cannot be resolved to a real row at
|
|
190
|
+
# all (unknown source_name, unknown composite inner-kind, wrong/positional
|
|
191
|
+
# id, constant validator digest) → untrackable, refuse.
|
|
192
|
+
#
|
|
193
|
+
# Only a real version is cacheable. ``_MISSING`` and ``None`` both block the
|
|
194
|
+
# write gate; the difference is that ``_MISSING`` still participates in the
|
|
195
|
+
# stored snapshot's read-time comparison so a deletion of a row that WAS real at
|
|
196
|
+
# write time is caught. By construction a ref can never be cached on a CONSTANT
|
|
197
|
+
# sentinel: if it cannot resolve to a real row when ``set()`` runs, it is
|
|
198
|
+
# refused. Better an extra MISS than a stale HIT.
|
|
199
|
+
|
|
200
|
+
# source_name → (table, id_column, version_columns). version_columns are hashed
|
|
201
|
+
# together; whichever of them exists in the live table is used (schema-tolerant).
|
|
202
|
+
_SOURCE_VERSIONERS: dict[str, tuple[str, str, tuple[str, ...]]] = {
    # Reminder / followup stores. Per the original notes these are changed by
    # *_complete / *_update tools WITHOUT a change_log write, which is why they
    # need a per-row version rather than the global watermark.
    "followups": (
        "followups",
        "id",
        ("status", "updated_at", "date", "verification", "description"),
    ),
    "reminders": (
        "reminders",
        "id",
        ("status", "updated_at", "date", "description", "category"),
    ),
    # Protocol / workflow runtime.
    "protocol_tasks": (
        "protocol_tasks",
        "task_id",
        ("status", "opened_at", "closed_at", "goal", "files_changed", "close_evidence", "outcome_notes"),
    ),
    "workflows": (
        "workflow_runs",
        "run_id",
        ("status", "updated_at", "next_action", "current_step_key", "last_checkpoint_label"),
    ),
    # Episodic / ledger sources.
    "change_log": (
        "change_log",
        "id",
        ("id", "created_at", "files", "what_changed", "why"),
    ),
    # ``diary:<id>`` refs carry ``session_diary.id`` (the numeric PK), so the
    # lookup column must be ``id`` and NOT ``session_id``: ``session_id`` is
    # free text that can collide with another row's numeric ``id``, which would
    # version the wrong row. Same id column as the ``diary`` entry of
    # ``_EVIDENCE_LEDGER_COMPOSITE``.
    "diary": (
        "session_diary",
        "id",
        ("id", "created_at", "summary", "pending", "context_next", "mental_state"),
    ),
    "commitments": (
        "commitments",
        "id",
        ("status", "updated_at", "closed_at", "deadline", "statement", "evidence_ref", "outcome_id"),
    ),
    # Live context. The router emits ``hot_context:<key>`` for per-row refs; the
    # literal ``recent_context`` prefix only appears as the synthetic
    # ``recent_context:inline`` marker, which is watermark-tracked. Hence only
    # ``hot_context`` is listed here — a ``recent_context`` row lookup would
    # wrongly refuse that inline marker.
    "hot_context": (
        "hot_context",
        "context_key",
        ("state", "last_event_at", "updated_at", "summary", "source_id"),
    ),
    "continuity": (
        "continuity_snapshots",
        "id",
        ("id", "updated_at", "event_type", "trace_id", "idempotency_key"),
    ),
    "transcript_index": (
        "transcript_index",
        "id",
        ("content_hash", "modified_at", "indexed_at", "message_count"),
    ),
    "runtime_db": (
        "lifecycle_events",
        "event_id",
        ("delivery_status", "retry_count", "processed_at", "last_error", "created_at"),
    ),
    # Knowledge / assets.
    "learning": (
        "learnings",
        "id",
        ("updated_at", "status", "title", "content", "category"),
    ),
    "local_asset": (
        "local_assets",
        "asset_id",
        ("updated_at", "quick_fingerprint", "modified_at_fs", "size_bytes", "status"),
    ),
    "preference": (
        "preferences",
        "key",
        ("value", "category", "updated_at"),
    ),
}
|
|
237
|
+
|
|
238
|
+
# COMPOSITE-ID resolution for the ``evidence_ledger`` source.
|
|
239
|
+
#
|
|
240
|
+
# The ``evidence_ledger`` router adapter renders ``evidence_ledger.search_evidence``
|
|
241
|
+
# rows whose ``evidence_id`` is itself a COMPOSITE ``<inner_kind>:<id>`` (see
|
|
242
|
+
# ``src/evidence_ledger.py``: ``task:<task_id>``, ``workflow:<run_id>``,
|
|
243
|
+
# ``diary:<id>``, ``lifecycle:<event_id>``, ``continuity:<id>``,
|
|
244
|
+
# ``evidence_record:<event_uid>``, ``change_log:<id>``,
|
|
245
|
+
# ``workflow_checkpoint:<id>``, ``local_context:<id>``,
|
|
246
|
+
# ``local_context_usage:<event_id>``, ``transcript:<file>``). ``_rows_result``
|
|
247
|
+
# then prefixes the SOURCE name, so the cache sees ``evidence_ledger:task:PT-1``.
|
|
248
|
+
#
|
|
249
|
+
# The earlier map sent EVERY such ref to ``memory_events.event_uid`` — only
|
|
250
|
+
# ``evidence_record`` actually lives there. Every other composite id failed to
|
|
251
|
+
# match → a constant ``__missing__`` → a frozen snapshot → STALE. This map routes
|
|
252
|
+
# each inner kind to the CORRECT backing table so the version derives from the
|
|
253
|
+
# real row's content. An inner kind not listed here → None (untrackable).
|
|
254
|
+
#
|
|
255
|
+
# Each value is (table, id_column, version_columns). ``transcript`` is virtual
|
|
256
|
+
# (transcripts come from files, not a row) → mapped to None explicitly so a
|
|
257
|
+
# transcript-backed answer is refused rather than frozen.
|
|
258
|
+
_EVIDENCE_LEDGER_COMPOSITE: dict[str, tuple[str, str, tuple[str, ...]] | None] = {
    "task": (
        "protocol_tasks",
        "task_id",
        ("status", "opened_at", "closed_at", "goal", "files_changed", "close_evidence", "outcome_notes"),
    ),
    "workflow": (
        "workflow_runs",
        "run_id",
        ("status", "updated_at", "next_action", "current_step_key", "last_checkpoint_label"),
    ),
    "workflow_checkpoint": (
        "workflow_checkpoints",
        "id",
        ("step_status", "run_status", "checkpoint_label", "created_at", "summary"),
    ),
    "diary": (
        "session_diary",
        "id",
        ("created_at", "summary", "pending", "context_next", "mental_state"),
    ),
    "lifecycle": (
        "lifecycle_events",
        "event_id",
        ("delivery_status", "retry_count", "processed_at", "last_error", "created_at"),
    ),
    "continuity": (
        "continuity_snapshots",
        "id",
        ("updated_at", "event_type", "trace_id", "idempotency_key"),
    ),
    "change_log": (
        "change_log",
        "id",
        ("id", "created_at", "files", "what_changed", "why"),
    ),
    "evidence_record": (
        "memory_events",
        "event_uid",
        ("event_uid", "input_hash", "output_digest", "metadata_json", "created_at"),
    ),
    "local_context": (
        "local_context_queries",
        "id",
        ("created_at", "query_hash", "result_count", "intent", "confidence"),
    ),
    # Kinds mapped to None have no row to version, so ``_evidence_ledger_version``
    # returns None for them (refuse, don't freeze). Transcript evidence is
    # file-backed.
    "transcript": None,
    "local_context_usage": None,
}
|
|
272
|
+
|
|
273
|
+
# source_names whose freshness LEGITIMATELY rides only the global change_log
|
|
274
|
+
# watermark: the router's synthetic inline markers + the canonical kinds whose
|
|
275
|
+
# stores DO flow through change_log. These are NOT untrackable (else the common
|
|
276
|
+
# filesystem / recent-context / kg answers would never cache), but they have no
|
|
277
|
+
# per-row snapshot — condition (3) is their guard. Listed explicitly so an
|
|
278
|
+
# UNKNOWN source_name is never silently assumed to be watermark-tracked.
|
|
279
|
+
_WATERMARK_TRACKED_SOURCES: frozenset[str] = frozenset(
    (
        "filesystem",
        "recent_context",
        "kg",
        "causal_graph",
        "kg_neighbors",
        "associative_graph",
        "commitments",
        "guard_context",
        "cognitive",
    )
)
|
|
283
|
+
# NOTE: ``memory_event`` is deliberately NOT watermark-tracked. It resolves to a
|
|
284
|
+
# REAL memory_events row through the canonical resolver (``source_version_for``),
|
|
285
|
+
# giving it per-row freshness — a content edit or deletion is a MISS, stronger
|
|
286
|
+
# than the watermark-only guard. Leaving it in the watermark set would shadow the
|
|
287
|
+
# canonical resolver and let a memory_event that changed without a change_log
|
|
288
|
+
# write be served stale. ``commitments``/``recent_context`` stay listed: the
|
|
289
|
+
# former is per-row via ``_SOURCE_VERSIONERS`` (this entry is an inert fallback),
|
|
290
|
+
# the latter has no per-row prefix emitted by the router (only the synthetic
|
|
291
|
+
# ``recent_context:inline`` marker), so the watermark is its correct guard.
|
|
292
|
+
|
|
293
|
+
# source_names that are genuinely untrackable (no per-row handle AND not in
|
|
294
|
+
# change_log): caching an answer that depends on them risks staleness, so the
|
|
295
|
+
# write gate refuses. Mirrors the canonical "validator_digest" kinds plus the
|
|
296
|
+
# router sources that read flat files / live greps / catalogs.
|
|
297
|
+
_UNTRACKABLE_SOURCES: frozenset[str] = frozenset(
    (
        "project_atlas",
        "doc",
        "spec",
        "commit",
        "audit",
        "finding",
        "release",
        "outcome",
        "correction",
        "guard",
        "local_context",
        "test",
        "runtime_docs",
        "source_grep",
        "system_catalog",
    )
)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _split_source_name(ref: str) -> tuple[str, str]:
    """Break ``{source_name}:{rest}`` at the FIRST colon.

    The ref is stringified and stripped first. Everything after the first
    ``:`` stays in ``rest`` untouched, so ``memory:learning:42`` becomes
    ``('memory', 'learning:42')``. A ref without a colon comes back as
    ``(ref, '')``.
    """
    text = str(ref or "").strip()
    head, _colon, tail = text.partition(":")
    return head, tail
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _table_columns(conn: sqlite3.Connection, table: str) -> set[str]:
    """Return the column names of ``table`` as reported by ``PRAGMA table_info``.

    Rows are read by key (``row["name"]`` / ``row.keys()``), so the connection
    is expected to yield mapping-style rows such as ``sqlite3.Row``. Any
    exception — including rows that do not support that access — produces an
    empty set. A table that does not exist also yields an empty set, because
    the pragma returns no rows for it.
    """
    try:
        cursor = conn.execute(f"PRAGMA table_info({table})")
        return {str(info["name"]) for info in cursor.fetchall() if "name" in info.keys()}
    except Exception:
        # ``set`` is shadowed at module level; use the captured builtin.
        return _builtin_set()
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _row_version_from_table(
    conn: sqlite3.Connection,
    table: str,
    id_column: str,
    id_value: str,
    version_columns: tuple[str, ...],
) -> str | None:
    """Look up ONE row by id and hash its version columns.

    Args:
        conn: connection used for the column probe and the row lookup.
        table: table to read.
        id_column: column matched against ``id_value``.
        id_value: identifier taken from the ref.
        version_columns: candidate columns to hash; only those present in the
            live table are used, and if none is present the id column alone
            is hashed.

    Returns:
        ``_hash`` of the table name plus the selected column values when a row
        exists. ``None`` in every other case: the table has no columns or
        lacks ``id_column``, ``id_value`` is blank, the SELECT raised, or no
        row matched. There is no constant "missing" value — the result is
        either a real-row hash or ``None``.
    """
    available = _table_columns(conn, table)
    # An empty column set cannot contain ``id_column`` either, so one
    # membership test covers both "no table" and "no id column".
    if id_column not in available:
        return None
    if str(id_value or "").strip() == "":
        return None
    hashed_columns = [name for name in version_columns if name in available] or [id_column]
    sql = f"SELECT {', '.join(hashed_columns)} FROM {table} WHERE {id_column}=? LIMIT 1"
    try:
        match = conn.execute(sql, (id_value,)).fetchone()
    except Exception:
        return None
    if match is None:
        return None
    record = dict(match)
    return _hash({"t": table, "v": {name: record.get(name) for name in hashed_columns}})
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _unified_search_version(conn: sqlite3.Connection, rest: str) -> str | None:
    """Version the ``<source>:<source_id>`` tail of a ``memory:`` ref.

    The pair is looked up in ``unified_search`` and the version is a hash over
    the pair, the row's ``updated_at`` and a hash of its ``title``.

    Returns ``None`` whenever there is nothing real to version: ``rest`` has no
    colon (a bare positional ``memory:<n>``), either half is blank after
    stripping, the SELECT raises (for example because the table is absent), or
    no row matches the pair.
    """
    if ":" not in rest:
        return None
    src_part, id_part = rest.split(":", 1)
    src = src_part.strip()
    source_id = id_part.strip()
    if not (src and source_id):
        return None
    lookup = "SELECT updated_at, title FROM unified_search WHERE source=? AND source_id=? LIMIT 1"
    try:
        # A failing SELECT means the ref is unresolvable, not stale.
        hit = conn.execute(lookup, (src, source_id)).fetchone()
    except Exception:
        return None
    if hit is None:
        return None
    fields = dict(hit)
    title_digest = _hash(str(fields.get("title") or ""))
    return _hash({"unified": [src, source_id], "updated_at": fields.get("updated_at"), "title_h": title_digest})
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _evidence_ledger_version(conn: sqlite3.Connection, rest: str) -> str | None:
    """Version the ``<inner_kind>:<id>`` tail of an ``evidence_ledger:`` ref.

    The inner kind selects a ``(table, id_column, version_columns)`` spec from
    ``_EVIDENCE_LEDGER_COMPOSITE`` and the row is versioned through
    ``_row_version_from_table``.

    Returns:
        ``"evidence_ledger:<inner_kind>:<row version>"`` when the row exists.
        ``None`` when ``rest`` has no colon, the inner kind is unknown or
        mapped to ``None`` in the table, the inner id is blank, or the row
        lookup itself yields ``None``.
    """
    if ":" not in rest:
        # Bare ``evidence_ledger:<x>`` carries no inner kind to route on.
        return None
    kind_part, id_part = rest.split(":", 1)
    inner_kind = kind_part.strip()
    inner_id = id_part.strip()
    spec = _EVIDENCE_LEDGER_COMPOSITE.get(inner_kind)
    if spec is None or not inner_id:
        return None
    table, id_column, version_columns = spec
    row_version = _row_version_from_table(conn, table, id_column, inner_id, version_columns)
    return None if row_version is None else f"evidence_ledger:{inner_kind}:{row_version}"
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def ref_version(ref: str, *, conn: sqlite3.Connection | None = None) -> str | None:
    """Return a cheap version token for a single ref, or ``None`` if untrackable.

    ``None`` is the only "no handle" answer this function gives: it covers an
    empty ref, an unknown or explicitly untrackable source name, a row that
    does not exist, and a failed lookup. No ``__missing__`` marker is ever
    embedded in the returned string.

    Args:
        ref: ``{source_name}:{rest}`` reference; stringified and stripped.
        conn: optional connection; ``_conn()`` is used when omitted. No
            connection is acquired for an empty ref.

    Resolution order (first match wins):
      1. ``evidence_ledger:<inner_kind>:<id>`` → ``_evidence_ledger_version``.
      2. ``source_name`` in ``_SOURCE_VERSIONERS`` → ``"{name}:{row hash}"``,
         or ``None`` if the id is empty or no row is found. For ``local_asset``
         refs everything from the first ``#`` on is dropped, so a chunk ref is
         versioned by its asset.
      3. ``memory:<source>:<source_id>`` → ``"memory:{hash}"`` through
         ``_unified_search_version``.
      4. ``source_name`` in ``_WATERMARK_TRACKED_SOURCES`` →
         ``"__wm__:{name}"`` (no per-row version).
      5. ``source_name`` in ``_UNTRACKABLE_SOURCES`` → ``None``.
      6. Anything else is handed to ``semantic_layers.source_version_for``:
         ``"canon:{source_version}"`` when the result is ``ok`` with a
         non-empty version; ``None`` when the status is ``missing``, the
         version starts with ``validator_digest:``, or the call fails.

    NOTE(review): ``commitments`` appears in both ``_SOURCE_VERSIONERS`` and
    ``_WATERMARK_TRACKED_SOURCES``. Step 2 wins, so a synthetic
    ``commitments:text`` ref is looked up as a row id and resolves to ``None``
    unless such a row exists — confirm that is intended.
    """
    raw = str(ref or "").strip()
    if not raw:
        # Nothing to resolve; do not touch the database for an empty ref.
        return None
    conn = conn or _conn()
    name, rest = _split_source_name(raw)

    # (1) evidence_ledger composite — must come BEFORE the source-name map so
    # the inner kind decides the table.
    if name == "evidence_ledger":
        return _evidence_ledger_version(conn, rest)

    # (2) explicit per-row versioners.
    spec = _SOURCE_VERSIONERS.get(name)
    if spec is not None:
        table, id_column, version_columns = spec
        id_value = rest
        if name == "local_asset" and "#" in id_value:
            id_value = id_value.split("#", 1)[0]
        if not id_value:
            return None
        version = _row_version_from_table(conn, table, id_column, id_value, version_columns)
        return None if version is None else f"{name}:{version}"

    # (3) unified_search-backed memory refs.
    if name == "memory":
        memory_version = _unified_search_version(conn, rest)
        return None if memory_version is None else f"memory:{memory_version}"

    # (4) guarded only by the global watermark; no per-row snapshot.
    if name in _WATERMARK_TRACKED_SOURCES:
        return f"__wm__:{name}"

    # (5) explicitly untrackable.
    if name in _UNTRACKABLE_SOURCES:
        return None

    # (6) canonical resolver. Reading the result stays inside the ``try`` so a
    # resolver that returns something without ``.get`` (e.g. ``None``) is
    # treated as "no handle" instead of raising AttributeError to the caller.
    try:
        from semantic_layers import source_version_for

        info = source_version_for(raw, conn=conn)
        status = str(info.get("validation_status") or "")
        version = str(info.get("source_version") or "")
        ok = bool(info.get("ok"))
    except Exception:
        return None
    if status == "missing":
        return None
    if version.startswith("validator_digest:"):
        return None  # constant digest = no real handle
    if ok and version:
        return f"canon:{version}"
    return None
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def _resolves_to_real_row(version: str | None) -> bool:
|
|
513
|
+
"""True only when ``version`` derives from a REAL existing row.
|
|
514
|
+
|
|
515
|
+
This is the write-gate truth: a real version string is cacheable; a ``None``
|
|
516
|
+
(no handle) or any ``__missing__`` marker embedded in the version (no row
|
|
517
|
+
right now) is NOT — caching either would risk serving an answer about a row
|
|
518
|
+
that is gone or never existed. Watermark tokens (``__wm__``) and the
|
|
519
|
+
canonical resolver's positive versions are real handles.
|
|
520
|
+
|
|
521
|
+
The check is on the WRITE side; the READ side (``is_valid``) still keeps the
|
|
522
|
+
raw ``ref_version`` so a row that was real at write time and later disappears
|
|
523
|
+
(real→__missing__) is detected as a change → MISS.
|
|
524
|
+
"""
|
|
525
|
+
if version is None:
|
|
526
|
+
return False
|
|
527
|
+
if version == "":
|
|
528
|
+
return False
|
|
529
|
+
# Any embedded missing marker means "no real row at write time".
|
|
530
|
+
if _MISSING in version:
|
|
531
|
+
return False
|
|
532
|
+
return True
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def self_check_fail_closed(*, conn: sqlite3.Connection | None = None) -> dict[str, Any]:
    """Audit the versioner maps and the write gate; report every violation.

    Three checks run in this order, each appending one human-readable line to
    the ``problems`` list per failure:

    (A) Schema existence. For every entry of ``_SOURCE_VERSIONERS`` and every
        non-``None`` entry of ``_EVIDENCE_LEDGER_COMPOSITE``, the named table
        must have columns according to ``_table_columns`` and the named id
        column must be among them. ``None`` composite entries are skipped
        (the original marks them as intentionally file-backed).

    (C) Column alignment. For every ``source -> ref_id_field`` pair in
        ``pre_answer_router._ROUTER_REF_ID_FIELD`` there must be a
        ``_SOURCE_VERSIONERS`` entry, and its id column must equal
        ``ref_id_field``. Two columns can both exist (so (A) passes) while
        the versioner still looks a ref up by the wrong one; this check
        catches that. An import failure is itself recorded as a problem and
        the check then runs over an empty mapping.

    (B) Write gate. ``_resolves_to_real_row`` must reject ``None``, ``""``,
        the bare ``_MISSING`` marker and several strings embedding it.

    Returns:
        ``{"ok": bool, "problems": list[str]}`` where ``ok`` is true only
        when ``problems`` is empty. Failed checks are reported, not raised.
    """
    conn = conn or _conn()
    problems: list[str] = []

    # (A) Collect every (label, table, id_column) first, then verify each one
    # against the live schema. Order: source versioners, then composites.
    targets: list[tuple[str, str, str]] = [
        (f"_SOURCE_VERSIONERS[{name}]", table, id_column)
        for name, (table, id_column, _cols) in _SOURCE_VERSIONERS.items()
    ]
    for inner, spec in _EVIDENCE_LEDGER_COMPOSITE.items():
        if spec is None:
            # Intentionally file-backed → untrackable, no table to check.
            continue
        table, id_column, _cols = spec
        targets.append((f"_EVIDENCE_LEDGER_COMPOSITE[{inner}]", table, id_column))

    for label, table, id_column in targets:
        columns = _table_columns(conn, table)
        if not columns:
            problems.append(f"{label}: table '{table}' does not exist in live schema")
            continue
        if id_column not in columns:
            problems.append(f"{label}: id_column '{id_column}' missing from '{table}'")

    # (C) Router ref-id column == versioner id column for every pinned source.
    try:
        from pre_answer_router import _ROUTER_REF_ID_FIELD as router_ref_fields
    except Exception as exc:  # pragma: no cover — import only fails on broken tree
        problems.append(f"row-correctness: cannot import _ROUTER_REF_ID_FIELD ({exc})")
        router_ref_fields = {}

    for source, ref_id_field in router_ref_fields.items():
        spec = _SOURCE_VERSIONERS.get(source)
        if spec is None:
            problems.append(
                f"row-correctness: router pins '{source}'->'{ref_id_field}' but no "
                f"_SOURCE_VERSIONERS entry exists (ref would be untrackable)"
            )
            continue
        _table, lookup_column, _cols = spec
        if ref_id_field == lookup_column:
            continue
        problems.append(
            f"row-correctness: source '{source}' adapter emits ref by column "
            f"'{ref_id_field}' but versioner looks up by '{lookup_column}' "
            f"— ref resolves to the WRONG row (stale/over-resolution)"
        )

    # (B) The write gate must refuse every sentinel / unresolved shape.
    sentinel_cases = [
        ("None", None),
        ("empty", ""),
        ("bare_missing", _MISSING),
        ("prefixed_missing", f"evidence_ledger:task:{_MISSING}"),
        ("canon_missing", f"canon:{_MISSING}"),
        ("source_missing", f"runtime_db:{_MISSING}"),
    ]
    for label, value in sentinel_cases:
        if _resolves_to_real_row(value):
            problems.append(f"write-gate accepts sentinel '{label}' ({value!r}) — fail-closed broken")

    return {"ok": not problems, "problems": problems}
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def row_version_snapshot(source_refs: list[str], *, conn: sqlite3.Connection | None = None) -> dict[str, str]:
    """Build the ``{ref: version}`` map that is stored with a cache entry.

    Each ref is normalized with ``str(ref or "").strip()``; blanks are dropped
    and duplicates collapse to their first occurrence. ``ref_version`` is then
    asked for the current version of every remaining ref. Refs for which it
    returns ``None`` are left out of the result — refusing to cache on them is
    the job of ``untrackable_refs``, not of this function.

    ``set`` stores this mapping and ``is_valid`` re-evaluates it entry by
    entry on every read.
    """
    conn = conn or _conn()
    normalized = (str(raw or "").strip() for raw in source_refs)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    versions = {
        ref: ref_version(ref, conn=conn)
        for ref in dict.fromkeys(normalized)
        if ref
    }
    return {ref: version for ref, version in versions.items() if version is not None}
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def source_fingerprint(source_refs: list[str], *, conn: sqlite3.Connection | None = None) -> str:
    """Return a deterministic digest over the per-row version snapshot.

    This is a convenience rollup of the data ``row_version_snapshot`` returns;
    the row-by-row comparison in ``is_valid`` remains the authoritative
    freshness check.

    Refs are normalized exactly the way ``row_version_snapshot`` normalizes
    them (``str(ref or "").strip()``, blanks dropped, duplicates collapsed)
    *before* they are sorted and looked up. The snapshot is keyed by the
    normalized form, so looking up the raw ref would report a tracked row as
    ``__untrackable__`` whenever the caller passed surrounding whitespace, and
    sorting raw input would raise ``TypeError`` on a ``None`` entry.

    Refs with no version in the snapshot contribute the constant
    ``__untrackable__`` marker so the digest is always defined.

    Args:
        source_refs: Evidence refs to fingerprint.
        conn: Optional connection; ``_conn()`` is used when omitted.

    Returns:
        Hex digest produced by ``_hash`` over ``POLICY_VERSION`` and the
        sorted ``"<ref>@<version>"`` items.
    """
    conn = conn or _conn()
    refs = [
        ref
        for ref in dict.fromkeys(str(raw or "").strip() for raw in source_refs)
        if ref
    ]
    snapshot = row_version_snapshot(refs, conn=conn)
    items = [f"{ref}@{snapshot.get(ref, '__untrackable__')}" for ref in sorted(refs)]
    return _hash({"policy_version": POLICY_VERSION, "sources": items})
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def untrackable_refs(source_refs: list[str], *, conn: sqlite3.Connection | None = None) -> list[str]:
    """Return the refs an answer may NOT be cached on (the write gate's reject list).

    A ref lands in the list when ``_resolves_to_real_row`` rejects what
    ``ref_version`` currently returns for it, i.e. when that version is:

    * ``None`` or empty — ``ref_version`` produced no usable handle; or
    * a string embedding the ``_MISSING`` marker — no row exists right now.

    Refs are normalized with ``str(ref or "").strip()``; blanks are skipped
    and duplicates are evaluated once, in first-seen order.

    ``set()`` refuses to store the whole answer when this list is non-empty
    and ``require_trackable_refs`` is true. An empty list means every ref has
    a usable freshness handle.
    """
    conn = conn or _conn()
    candidates = dict.fromkeys(str(raw or "").strip() for raw in source_refs)
    return [
        ref
        for ref in candidates
        if ref and not _resolves_to_real_row(ref_version(ref, conn=conn))
    ]
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def _hash(value: Any) -> str:
    """Return the SHA-256 hex digest of ``value`` serialized as canonical JSON.

    Canonical here means: keys sorted, compact ``","`` / ``":"`` separators,
    and non-ASCII characters escaped, so equal values always hash equally
    regardless of dict insertion order.
    """
    canonical = json.dumps(value, ensure_ascii=True, sort_keys=True, separators=(",", ":"))
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8", errors="ignore"))
    return digest.hexdigest()
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def is_valid(entry: dict[str, Any], *, conn: sqlite3.Connection | None = None) -> tuple[bool, str]:
    """Decide whether a cache entry may still be served.

    Returns ``(valid, reason)``. The checks run in this order and the first
    failure wins:

    1. ``expired_ttl`` — ``_now()`` has reached ``entry["expires_at"]``.
    2. ``not_fresh`` — ``entry["status"]`` is anything but ``"fresh"``.
    3. ``watermark_advanced`` — ``_change_watermark(None)`` differs from the
       stored ``change_watermark``.
    4. ``content_snapshot_changed`` — for some ref in the stored
       ``content_snapshot``, ``ref_version`` now returns ``None`` or a value
       different from the stored one.

    When every check passes the result is ``(True, "fresh_hit")``. A missing
    or non-dict ``content_snapshot`` is treated as an empty snapshot, so check
    4 passes trivially.
    """
    conn = conn or _conn()
    now = _now()

    # (1) TTL ceiling.
    expires_at = float(entry.get("expires_at") or 0)
    if now >= expires_at:
        return False, "expired_ttl"

    # (2) Status.
    status = str(entry.get("status") or "")
    if status != "fresh":
        return False, "not_fresh"

    # (3) Change watermark. The comparison deliberately uses the watermark for
    # sid=None rather than one scoped to the entry's sid: the sid takes part in
    # session matching inside get(), not in freshness.
    stored_watermark = int(entry.get("change_watermark") or 0)
    if _change_watermark(None) != stored_watermark:
        return False, "watermark_advanced"

    # (4) Content snapshot: re-resolve every pinned ref and compare versions.
    # A changed version or one that can no longer be resolved is a MISS.
    pinned = entry.get("content_snapshot")
    if not isinstance(pinned, dict):
        pinned = {}
    for ref, stored_version in pinned.items():
        current = ref_version(str(ref), conn=conn)
        if current is not None and current == str(stored_version):
            continue
        return False, "content_snapshot_changed"

    return True, "fresh_hit"
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def get(
|
|
749
|
+
cache_key: str,
|
|
750
|
+
*,
|
|
751
|
+
expected_sid: str = "",
|
|
752
|
+
bump_hit: bool = True,
|
|
753
|
+
conn: sqlite3.Connection | None = None,
|
|
754
|
+
) -> dict[str, Any] | None:
|
|
755
|
+
"""Return a VALID cache entry for ``cache_key`` or None on any MISS.
|
|
756
|
+
|
|
757
|
+
``expected_sid``: for session-scoped entries, the caller's sid must match
|
|
758
|
+
the entry's sid; a mismatch is a MISS (no cross-session leak).
|
|
759
|
+
"""
|
|
760
|
+
if not cache_key:
|
|
761
|
+
return None
|
|
762
|
+
conn = conn or _conn()
|
|
763
|
+
with _LOCK:
|
|
764
|
+
if not _table_ready(conn):
|
|
765
|
+
return None
|
|
766
|
+
try:
|
|
767
|
+
row = conn.execute(
|
|
768
|
+
"SELECT * FROM resolution_cache WHERE cache_key=? LIMIT 1", (cache_key,)
|
|
769
|
+
).fetchone()
|
|
770
|
+
except Exception:
|
|
771
|
+
return None
|
|
772
|
+
if not row:
|
|
773
|
+
return None
|
|
774
|
+
entry = _row_to_entry(row)
|
|
775
|
+
# Session scoping: if the entry is sid-bound it must match the caller.
|
|
776
|
+
entry_sid = str(entry.get("sid") or "")
|
|
777
|
+
if entry_sid and expected_sid and entry_sid != str(expected_sid):
|
|
778
|
+
return None
|
|
779
|
+
if entry_sid and not expected_sid:
|
|
780
|
+
return None
|
|
781
|
+
valid, reason = is_valid(entry, conn=conn)
|
|
782
|
+
if not valid:
|
|
783
|
+
# Mark stale so a later prune can collect it; do not delete on the
|
|
784
|
+
# read path (keep reads cheap and lock-light).
|
|
785
|
+
if reason in {"expired_ttl", "watermark_advanced", "content_snapshot_changed"}:
|
|
786
|
+
try:
|
|
787
|
+
conn.execute(
|
|
788
|
+
"UPDATE resolution_cache SET status='stale' WHERE cache_key=? AND status='fresh'",
|
|
789
|
+
(cache_key,),
|
|
790
|
+
)
|
|
791
|
+
conn.commit()
|
|
792
|
+
except Exception:
|
|
793
|
+
pass
|
|
794
|
+
entry["miss_reason"] = reason
|
|
795
|
+
entry["valid"] = False
|
|
796
|
+
return None
|
|
797
|
+
if bump_hit:
|
|
798
|
+
try:
|
|
799
|
+
conn.execute(
|
|
800
|
+
"UPDATE resolution_cache SET hit_count = hit_count + 1 WHERE cache_key=?",
|
|
801
|
+
(cache_key,),
|
|
802
|
+
)
|
|
803
|
+
conn.commit()
|
|
804
|
+
except Exception:
|
|
805
|
+
pass
|
|
806
|
+
entry["hit_count"] = int(entry.get("hit_count") or 0) + 1
|
|
807
|
+
entry["valid"] = True
|
|
808
|
+
entry["miss_reason"] = ""
|
|
809
|
+
return entry
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def set(
|
|
813
|
+
cache_key: str,
|
|
814
|
+
result: dict[str, Any],
|
|
815
|
+
*,
|
|
816
|
+
ttl_seconds: int,
|
|
817
|
+
kind: str = "route",
|
|
818
|
+
intent: str = "",
|
|
819
|
+
area: str = "",
|
|
820
|
+
sid: str = "",
|
|
821
|
+
source_refs: Any = None,
|
|
822
|
+
policy_version: str = "",
|
|
823
|
+
require_trackable_refs: bool = True,
|
|
824
|
+
conn: sqlite3.Connection | None = None,
|
|
825
|
+
) -> dict[str, Any]:
|
|
826
|
+
"""Persist the FINAL organized result under ``cache_key``.
|
|
827
|
+
|
|
828
|
+
``ttl_seconds <= 0`` (the ``instant`` tier) never caches — returns a no-op.
|
|
829
|
+
Called only after the route is final (post-escalation), so we cache the
|
|
830
|
+
answer the user will actually get, never an intermediate empty pass.
|
|
831
|
+
|
|
832
|
+
``require_trackable_refs`` (default True): refuse to cache an answer whose
|
|
833
|
+
evidence depends on a source with NO freshness handle — a ref whose
|
|
834
|
+
``ref_version`` is None (unknown source_name, flat-file/grep/catalog source,
|
|
835
|
+
a constant ``validator_digest`` canonical kind, or a positional
|
|
836
|
+
``memory:<n>`` we cannot resolve to a row). Refusing returns
|
|
837
|
+
``reason='untrackable_source'``. Better an extra MISS than a stale HIT.
|
|
838
|
+
Callers with an independent freshness handle (the repo map carries
|
|
839
|
+
``git_head``) pass ``require_trackable_refs=False``.
|
|
840
|
+
"""
|
|
841
|
+
if not cache_key:
|
|
842
|
+
return {"ok": False, "reason": "no_cache_key"}
|
|
843
|
+
if int(ttl_seconds or 0) <= 0:
|
|
844
|
+
return {"ok": False, "reason": "ttl_zero_never_cache"}
|
|
845
|
+
# Session-scoped intents must carry a sid or they would leak across
|
|
846
|
+
# sessions; refuse to cache them globally.
|
|
847
|
+
if intent in SESSION_SCOPED_INTENTS and not sid:
|
|
848
|
+
return {"ok": False, "reason": "session_scoped_without_sid"}
|
|
849
|
+
|
|
850
|
+
conn = conn or _conn()
|
|
851
|
+
with _LOCK:
|
|
852
|
+
if not _table_ready(conn):
|
|
853
|
+
return {"ok": False, "reason": "schema_missing"}
|
|
854
|
+
refs = _parse_refs(source_refs if source_refs is not None else result.get("evidence_refs"))
|
|
855
|
+
if require_trackable_refs and refs:
|
|
856
|
+
untrackable = untrackable_refs(refs, conn=conn)
|
|
857
|
+
if untrackable:
|
|
858
|
+
# No freshness handle for these refs → caching could go stale.
|
|
859
|
+
return {
|
|
860
|
+
"ok": False,
|
|
861
|
+
"reason": "untrackable_source",
|
|
862
|
+
"untrackable_refs": untrackable,
|
|
863
|
+
"untrackable_source": untrackable,
|
|
864
|
+
}
|
|
865
|
+
now = _now()
|
|
866
|
+
expires_at = now + float(int(ttl_seconds))
|
|
867
|
+
# PRIMARY anti-stale signal: a snapshot of the REAL rows behind the refs.
|
|
868
|
+
snapshot = row_version_snapshot(refs, conn=conn)
|
|
869
|
+
# Convenience rollup digest (telemetry / backward-compat); the snapshot
|
|
870
|
+
# is the authority. Derived from the same data so it still moves on
|
|
871
|
+
# content change.
|
|
872
|
+
fingerprint = _hash(
|
|
873
|
+
{"policy_version": POLICY_VERSION,
|
|
874
|
+
"sources": [f"{r}@{snapshot.get(r, '__untrackable__')}" for r in sorted(dict.fromkeys(refs))]}
|
|
875
|
+
)
|
|
876
|
+
# GLOBAL watermark (sid=None): freshness must react to mutations in ANY
|
|
877
|
+
# session, not just this entry's. The sid scopes the cache KEY only.
|
|
878
|
+
watermark = _change_watermark(None)
|
|
879
|
+
try:
|
|
880
|
+
result_json = json.dumps(result, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
|
|
881
|
+
except Exception:
|
|
882
|
+
return {"ok": False, "reason": "result_not_serializable"}
|
|
883
|
+
try:
|
|
884
|
+
snapshot_json = json.dumps(snapshot, ensure_ascii=True, sort_keys=True, separators=(",", ":"))
|
|
885
|
+
except Exception:
|
|
886
|
+
snapshot_json = "{}"
|
|
887
|
+
try:
|
|
888
|
+
conn.execute(
|
|
889
|
+
"""
|
|
890
|
+
INSERT INTO resolution_cache
|
|
891
|
+
(cache_key, kind, intent, area, sid, result_json,
|
|
892
|
+
source_fingerprint, source_refs_json, content_snapshot_json,
|
|
893
|
+
change_watermark, status, policy_version, resolved_at,
|
|
894
|
+
expires_at, hit_count)
|
|
895
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'fresh', ?, ?, ?, 0)
|
|
896
|
+
ON CONFLICT(cache_key) DO UPDATE SET
|
|
897
|
+
kind=excluded.kind,
|
|
898
|
+
intent=excluded.intent,
|
|
899
|
+
area=excluded.area,
|
|
900
|
+
sid=excluded.sid,
|
|
901
|
+
result_json=excluded.result_json,
|
|
902
|
+
source_fingerprint=excluded.source_fingerprint,
|
|
903
|
+
source_refs_json=excluded.source_refs_json,
|
|
904
|
+
content_snapshot_json=excluded.content_snapshot_json,
|
|
905
|
+
change_watermark=excluded.change_watermark,
|
|
906
|
+
status='fresh',
|
|
907
|
+
policy_version=excluded.policy_version,
|
|
908
|
+
resolved_at=excluded.resolved_at,
|
|
909
|
+
expires_at=excluded.expires_at,
|
|
910
|
+
hit_count=0
|
|
911
|
+
""",
|
|
912
|
+
(
|
|
913
|
+
cache_key, kind, intent, area, sid, result_json,
|
|
914
|
+
fingerprint, json.dumps(refs, ensure_ascii=True, separators=(",", ":")),
|
|
915
|
+
snapshot_json, watermark, policy_version or POLICY_VERSION, now, expires_at,
|
|
916
|
+
),
|
|
917
|
+
)
|
|
918
|
+
conn.commit()
|
|
919
|
+
except Exception as exc:
|
|
920
|
+
return {"ok": False, "reason": "store_failed", "detail": f"{type(exc).__name__}: {exc}"}
|
|
921
|
+
# Keep the table bounded (throttled; never blocks/breaks the write).
|
|
922
|
+
_maybe_prune(conn)
|
|
923
|
+
return {
|
|
924
|
+
"ok": True,
|
|
925
|
+
"cache_key": cache_key,
|
|
926
|
+
"kind": kind,
|
|
927
|
+
"expires_at": expires_at,
|
|
928
|
+
"source_fingerprint": fingerprint,
|
|
929
|
+
"content_snapshot": snapshot,
|
|
930
|
+
"change_watermark": watermark,
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
|
|
934
|
+
def invalidate(cache_key: str = "", *, kind: str = "", conn: sqlite3.Connection | None = None) -> int:
    """Flip fresh entries to ``stale`` and return how many rows were touched.

    Selection, in priority order:

    * ``cache_key`` given — only that key.
    * otherwise ``kind`` given — every fresh entry of that kind.
    * neither given — every fresh entry in the table.

    Returns 0 when the cache table is not ready or the update fails.
    """
    conn = conn or _conn()
    with _LOCK:
        if not _table_ready(conn):
            return 0
        if cache_key:
            statement = "UPDATE resolution_cache SET status='stale' WHERE cache_key=? AND status='fresh'"
            params: tuple = (cache_key,)
        elif kind:
            statement = "UPDATE resolution_cache SET status='stale' WHERE kind=? AND status='fresh'"
            params = (kind,)
        else:
            statement = "UPDATE resolution_cache SET status='stale' WHERE status='fresh'"
            params = ()
        try:
            cursor = conn.execute(statement, params)
            conn.commit()
            return int(cursor.rowcount or 0)
        except Exception:
            return 0
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def prune(*, max_rows: int = _MAX_ROWS, conn: sqlite3.Connection | None = None) -> int:
    """Delete expired entries, then enforce a row cap. Returns rows deleted.

    Step 1 removes every row with ``expires_at <= now``. Step 2 runs only when
    ``max_rows`` is positive and the table still holds more than ``max_rows``
    rows: the excess is deleted, non-fresh rows first and, within each group,
    oldest ``resolved_at`` first.

    Returns 0 when the cache table is not ready. If a statement raises, the
    count accumulated so far is returned and no commit is issued here.
    """
    conn = conn or _conn()
    with _LOCK:
        if not _table_ready(conn):
            return 0
        removed = 0
        try:
            expired = conn.execute("DELETE FROM resolution_cache WHERE expires_at <= ?", (_now(),))
            removed += int(expired.rowcount or 0)
            if max_rows and max_rows > 0:
                counted = conn.execute("SELECT COUNT(*) FROM resolution_cache").fetchone()[0]
                excess = int(counted or 0) - int(max_rows)
                if excess > 0:
                    # (status='fresh') sorts non-fresh rows (0) ahead of fresh
                    # ones (1), so stale rows are trimmed before fresh ones.
                    trimmed = conn.execute(
                        """
                        DELETE FROM resolution_cache WHERE cache_key IN (
                            SELECT cache_key FROM resolution_cache
                            ORDER BY (status='fresh') ASC, resolved_at ASC
                            LIMIT ?
                        )
                        """,
                        (excess,),
                    )
                    removed += int(trimmed.rowcount or 0)
            conn.commit()
        except Exception:
            return removed
        return removed
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def _maybe_prune(conn: sqlite3.Connection) -> None:
    """Count one write and run ``prune`` on every ``_PRUNE_EVERY``-th call.

    The module-level counter ``_writes_since_prune`` is incremented on each
    call and reset to zero when the threshold is reached. Any exception from
    ``prune`` is swallowed so that maintenance cannot break the cache write
    that triggered it.
    """
    global _writes_since_prune
    _writes_since_prune += 1
    if _writes_since_prune >= _PRUNE_EVERY:
        _writes_since_prune = 0
        try:
            prune(conn=conn)
        except Exception:
            pass
|
|
1015
|
+
|
|
1016
|
+
|
|
1017
|
+
def _row_to_entry(row: sqlite3.Row) -> dict[str, Any]:
    """Turn a ``resolution_cache`` row into a dict with its JSON columns decoded.

    The row is copied with ``dict(row)`` and three keys are added:

    * ``result`` from ``result_json`` (``{}`` when absent or undecodable),
    * ``source_refs`` from ``source_refs_json`` (``[]`` when absent or
      undecodable),
    * ``content_snapshot`` from ``content_snapshot_json`` (``{}`` when absent,
      undecodable, or not a JSON object).

    The original ``*_json`` columns stay in the returned dict.
    """
    data = dict(row)

    def _decode(column: str, empty_json: str) -> Any:
        # An empty/NULL column decodes to the empty literal; a decode error
        # falls back to a fresh empty container of the same shape.
        try:
            return json.loads(data.get(column) or empty_json)
        except Exception:
            return json.loads(empty_json)

    data["result"] = _decode("result_json", "{}")
    data["source_refs"] = _decode("source_refs_json", "[]")
    snapshot = _decode("content_snapshot_json", "{}")
    data["content_snapshot"] = snapshot if isinstance(snapshot, dict) else {}
    return data
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
# ── Repo map (code working memory — Phase 3) ─────────────────────────────────
|
|
1036
|
+
|
|
1037
|
+
def _git_head(repo_dir: str) -> str:
    """Return the abbreviated HEAD commit hash of the repo at ``repo_dir``.

    Runs ``git rev-parse --short HEAD`` with ``repo_dir`` as the working
    directory and a 5-second timeout. Returns ``""`` when git exits non-zero
    or when running it raises for any reason (missing directory, git not
    installed, timeout, ...).
    """
    import subprocess

    command = ["git", "rev-parse", "--short", "HEAD"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
            cwd=repo_dir,
        )
        return completed.stdout.strip() if completed.returncode == 0 else ""
    except Exception:
        return ""
|
|
1050
|
+
|
|
1051
|
+
|
|
1052
|
+
def repo_map_key(project_key: str) -> str:
    """Return the cache key for a project's repo map: ``repo:<normalized key>``.

    The project key is stringified, stripped of surrounding whitespace and
    lower-cased; a falsy key yields ``"repo:"``.
    """
    normalized = str(project_key or "").strip().lower()
    return "repo:" + normalized
|
|
1054
|
+
|
|
1055
|
+
|
|
1056
|
+
def get_repo_map(
    project_key: str,
    repo_dir: str = "",
    *,
    conn: sqlite3.Connection | None = None,
) -> dict[str, Any] | None:
    """Return the cached repo map entry for ``project_key``, or ``None`` to rebuild.

    The entry is fetched through ``get`` (so every standard validity check
    applies and the hit counter is bumped). When ``repo_dir`` is given, one
    extra check is layered on top: if both the stored ``git_head`` and the
    repo's current ``_git_head`` are non-empty and differ, the entry is
    invalidated and ``None`` is returned. If either head is empty the
    comparison is skipped and the entry is served.
    """
    conn = conn or _conn()
    key = repo_map_key(project_key)
    entry = get(key, bump_hit=True, conn=conn)
    if not entry:
        return None
    if not repo_dir:
        return entry

    cached_payload = entry.get("result") or {}
    stored_head = str(cached_payload.get("git_head") or "")
    current_head = _git_head(repo_dir)
    head_moved = bool(current_head and stored_head and current_head != stored_head)
    if head_moved:
        # The repo moved since the map was built → force a re-map.
        invalidate(key, conn=conn)
        return None
    return entry
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
def set_repo_map(
    project_key: str,
    repo_map: dict[str, Any],
    repo_dir: str = "",
    *,
    ttl_seconds: int = 86400,
    conn: sqlite3.Connection | None = None,
) -> dict[str, Any]:
    """Cache ``repo_map`` for ``project_key`` and return the result of ``set``.

    A shallow copy of ``repo_map`` is stored. When ``repo_dir`` is given and
    the map has no ``git_head`` key yet, the current ``_git_head(repo_dir)``
    is recorded so ``get_repo_map`` can detect that the repo moved.

    The refs stored with the entry are the map's own ``source_refs`` if
    ``_parse_refs`` yields any, otherwise a single
    ``project_atlas:<normalized project key>`` placeholder. The entry is
    written with ``kind="repo_map"``, ``intent="repo_map"``, ``area="code"``
    and the default TTL is 86400 seconds (24 hours).
    """
    conn = conn or _conn()
    payload = dict(repo_map)
    if repo_dir and "git_head" not in payload:
        payload["git_head"] = _git_head(repo_dir)

    # Fall back to an atlas placeholder when the map declares no refs of its own.
    declared_refs = _parse_refs(payload.get("source_refs"))
    if declared_refs:
        refs = declared_refs
    else:
        refs = [f"project_atlas:{str(project_key or '').strip().lower()}"]

    return set(
        repo_map_key(project_key),
        payload,
        ttl_seconds=ttl_seconds,
        kind="repo_map",
        intent="repo_map",
        area="code",
        source_refs=refs,
        # The write gate is switched off on purpose: the repo map relies on
        # git_head, the TTL and the change watermark for invalidation, so the
        # trackable-refs requirement would reject it needlessly.
        require_trackable_refs=False,
        conn=conn,
    )
|