@jhizzard/termdeck 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
1
+ -- 022_source_agent_backfill.sql
2
+ -- Sprint 62 T3 (TermDeck) — backfill source_agent for pre-Sprint-50 NULL rows
3
+ -- where the writer can be inferred from row shape, NOT from content-marker
4
+ -- inspection. Mnestra 0.4.9 (release-pending; orchestrator bumps at sprint close).
5
+ --
6
+ -- Why this exists:
7
+ -- Sprint 50 introduced source_agent (migration 015). Pre-Sprint-50 rows
8
+ -- have source_agent IS NULL and are silently excluded from filtered
9
+ -- memory_recall queries (per the recall tool's docstring: "NULL-source-
10
+ -- agent rows ... are excluded when this filter is set" — see
11
+ -- src/recall.ts:165-169).
12
+ --
13
+ -- 2026-05-08 production probe: 6,381 of 6,483 active memory_items rows
14
+ -- (~98%) have source_agent IS NULL — far above the SOURCE-BRIEF estimate
15
+ -- of "3,000+". Filtered recall has been blind to most of the corpus for
16
+ -- roughly the entire post-Sprint-50 window.
17
+ --
18
+ -- Migration 015 already backfilled session_summary NULL rows -> 'claude'
19
+ -- (015 lines 48-51), so the NULL universe today is exclusively non-
20
+ -- session_summary types. This migration closes the slice where the
21
+ -- writer can be inferred from row shape (architectural / schema /
22
+ -- structural evidence), and deliberately leaves the remaining slice
23
+ -- NULL — to be reached via the additive include_null_source recall
24
+ -- flag rather than by speculative attribution.
25
+ --
26
+ -- Design principle: row-shape attribution, not content-marker attribution.
27
+ -- The original SOURCE-BRIEF proposed content-marker predicates (ILIKE
28
+ -- '%[T-CODEX]%' etc). Sampling proved this unsafe: 100% of NULL rows
29
+ -- matching codex/gemini/grok markers are Claude *describing* those
30
+ -- agents, never authored by them. Marker == "row mentions agent",
31
+ -- not "row authored by agent".
32
+ --
33
+ -- Instead, this migration attributes by the (source_type, has_path,
34
+ -- has_session) tuple — schema-level fingerprints that map 1:1 to the
35
+ -- writer architecture, and that 50+ randomly-sampled rows confirm.
36
+ --
37
+ -- Predicate plan (each with explicit evidence chain):
38
+ --
39
+ -- A. NULL + source_type IN (decision, bug_fix, architecture, preference,
40
+ -- code_context) -> 'claude'.
41
+ -- Architectural evidence: pre-Sprint-50, only Claude shipped a
42
+ -- memory_remember client. The mcp__memory__memory_remember and
43
+ -- mcp__mnestra__memory_remember surfaces both ran exclusively in
44
+ -- Claude sessions. Codex/Gemini/Grok memory_remember capabilities
45
+ -- did not exist until the Sprint 51 per-agent MCP wiring (see
46
+ -- memory: "MCP server wiring patterns for Codex, Gemini, and Grok
47
+ -- CLIs (verified 2026-05-04 ... follow-up to Sprint 51.6's "Codex
48
+ -- MCP not wired" gap)"). All NULL rows of these source_types are
49
+ -- pre-Sprint-50 and therefore architecturally Claude.
50
+ -- Schema fingerprint: 100% of these rows have source_file_path IS NULL
51
+ -- AND source_session_id IS NULL — bare memory_remember shape.
52
+ -- Sample confirmation: 28-row sample showed 100% Claude-summary writing
53
+ -- pattern (project context, dated entries, file:line evidence — the
54
+ -- recognizable Claude memory_remember signature).
55
+ -- Expected count: 560.
56
+ --
57
+ -- B. NULL + source_type='fact' + source_session_id IS NOT NULL -> 'claude'.
58
+ -- Schema evidence: source_session_id is a Claude session UUID format
59
+ -- (matches the existing claude/session_summary tagged rows; same
60
+ -- shape: has_path=false, has_session=true). The Claude SessionEnd
61
+ -- hook is the only writer that populates source_session_id with a
62
+ -- Claude UUID. Other writers either set source_file_path (rag-extractor)
63
+ -- or leave both NULL (bare memory_remember).
64
+ -- Expected count: 4,587.
65
+ --
66
+ -- D. NULL + source_type='document_chunk' -> 'orchestrator'.
67
+ -- Structural evidence: 951/951 rows have source_file_path set + JSONB
68
+ -- metadata containing chunkIndex + heading keys — unmistakable
69
+ -- rag-system batch-chunker output. The chunker is not an LLM session;
70
+ -- 'orchestrator' is the appropriate non-LLM tag per the source_agent
71
+ -- enum (claude|codex|gemini|grok|orchestrator).
72
+ -- Path buckets:
73
+ -- 513 rows ~/.gemini/antigravity/scratch/* (Gemini scratch docs the
74
+ -- rag-extractor ingested — Gemini wrote the source MD,
75
+ -- but the rag-extractor wrote the row.)
76
+ -- 429 rows ~/Documents/* (project docs ingested directly).
77
+ -- 9 rows ~/.claude/projects/*/memory/MEMORY.md (auto-memory MD
78
+ -- ingested by the rag-extractor).
79
+ -- All three buckets are extractor-written, not LLM-written. The
80
+ -- original document author is preserved in source_file_path; the
81
+ -- row writer is the extractor.
82
+ -- Expected count: 951.
83
+ --
84
+ -- Predicate deliberately NOT applied (response to T4-CODEX 20:43 ET concern):
85
+ -- C. NULL + source_type='fact' + source_session_id IS NULL +
86
+ -- source_file_path IS NULL.
87
+ -- These 283 rows are bare memory_remember calls without session
88
+ -- attribution. Sampling (10 rows) showed 100% Claude content pattern,
89
+ -- but they lack the schema fingerprint that makes A/B/D structurally
90
+ -- definitive — there is no architectural lock that PREVENTS a
91
+ -- non-Claude writer from producing this shape (e.g., a manual psql
92
+ -- insert, a non-MCP REST call, or an early rag-extractor variant
93
+ -- that omitted source_file_path).
94
+ -- Migration 015 lines 24-30 explicitly preserved provenance
95
+ -- uncertainty for non-session_summary historical rows; broad
96
+ -- attribution here would erase that bright line. Per T4-CODEX
97
+ -- AUDIT-CONCERN (Sprint 62, 20:43 ET), these rows stay NULL and
98
+ -- are reached via the additive include_null_source recall path
99
+ -- added in src/recall.ts under this same sprint.
100
+ -- Residual NULL after this migration: 283 rows = 4.4% of corpus.
101
+ -- Acceptance target: <5%. Met.
102
+ --
103
+ -- Total backfill: 6,098 rows (A + B + D). Acceptance: residual NULL < 5%
104
+ -- of corpus (4.4% expected; well under threshold).
105
+ --
106
+ -- What this migration deliberately does NOT do:
107
+ -- * Touch session_summary rows (015 already attributed those).
108
+ -- * Touch already-tagged rows (every UPDATE is gated by source_agent IS NULL).
109
+ -- * Use content-marker predicates (sampling proved unreliable; markers
110
+ -- describe agents, not authors).
111
+ -- * Backfill the inferential-only slice (Predicate C, see above).
112
+ --
113
+ -- Idempotent: every UPDATE has WHERE source_agent IS NULL, so re-running
114
+ -- is a no-op on already-tagged rows. Safe to re-apply.
115
+ --
116
+ -- Reversibility: this migration tags rows but does not modify content,
117
+ -- type, or any other column. To revert (in a future migration), run:
118
+ -- UPDATE public.memory_items
119
+ -- SET source_agent = NULL
120
+ -- WHERE source_agent IN ('claude', 'orchestrator')
121
+ -- AND created_at < '2026-05-09'
122
+ -- AND source_type != 'session_summary'; -- preserve 015's backfill
123
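+ -- Caveat: this is not an exact inverse. The predicate also matches non-session_summary rows that were already tagged 'claude' or 'orchestrator' before this migration ran (e.g. at write time by Sprint-50+ writers) and would clear those tags too.
+ --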
124
+ -- RLS posture (per global CLAUDE.md RLS hygiene gates 1-5): this is a
125
+ -- DO block, not a CREATE FUNCTION. Runs as the migration runner's role
126
+ -- (service_role, which bypasses RLS). search_path is set explicitly to
127
+ -- defend against schema-shadow attacks during execution. No new policies,
128
+ -- no new function executable surface.
129
+
130
-- Pin name resolution for the statements that follow. The table references in
-- this script are schema-qualified (public.memory_items), so this setting mainly
-- governs lookup of unqualified functions and operators.
-- NOTE(review): with pg_catalog listed explicitly after public, public is
-- searched before pg_catalog; confirm that ordering is the intended
-- schema-shadowing defense.
-- NOTE(review): a plain SET is session-scoped unless the runner wraps the file
-- in a transaction that rolls back; confirm how the migration runner manages
-- sessions.
set search_path to public, pg_catalog;
131
+
132
-- Backfill source_agent on rows where it is still NULL, one UPDATE per
-- predicate, then report per-predicate row counts and the residual NULL share.
-- The three predicates match disjoint source_type values, so the order of the
-- UPDATEs does not change which rows each one tags. Every UPDATE is gated on
-- source_agent IS NULL, so already-tagged rows are never modified.
do $$
declare
  pred_a     integer := 0;  -- rows tagged 'claude' by Predicate A
  pred_b     integer := 0;  -- rows tagged 'claude' by Predicate B
  pred_d     integer := 0;  -- rows tagged 'orchestrator' by Predicate D
  remaining  integer;       -- rows whose source_agent is still NULL afterwards
  total_rows integer;       -- every row in public.memory_items
begin
  -- Predicate A: structural attribution by source_type for non-fact,
  -- non-document_chunk types. Architectural lock: pre-Sprint-50 only Claude
  -- shipped a memory_remember client, so NULL rows of these types are
  -- attributed to Claude.
  update public.memory_items
     set source_agent = 'claude'
   where source_agent is null
     and source_type in ('decision', 'bug_fix', 'architecture', 'preference', 'code_context');
  get diagnostics pred_a = row_count;

  -- Predicate B: fact rows that carry a session id. source_session_id is the
  -- Claude SessionEnd hook's UUID; same shape as the existing tagged
  -- claude/session_summary rows.
  update public.memory_items
     set source_agent = 'claude'
   where source_agent is null
     and source_type = 'fact'
     and source_session_id is not null;
  get diagnostics pred_b = row_count;

  -- Predicate D: rag-system document chunks -> 'orchestrator' (non-LLM batch
  -- writer). The predicate keys on source_type only; the source_file_path and
  -- chunkIndex/heading metadata fingerprint described in the header is not
  -- re-checked here.
  update public.memory_items
     set source_agent = 'orchestrator'
   where source_agent is null
     and source_type = 'document_chunk';
  get diagnostics pred_d = row_count;

  -- Residual NULL count and the denominator for the acceptance ratio.
  -- NOTE(review): total_rows counts every row in the table, while the header's
  -- probe figure is described as "active" rows; confirm the denominators agree.
  select count(*) into remaining
    from public.memory_items
   where source_agent is null;

  select count(*) into total_rows
    from public.memory_items;

  raise notice '[022] backfill complete: A(claude/typed)=% B(claude/fact+session)=% D(orchestrator/doc_chunk)=% remaining_null=% / % total (acceptance: <5%%)',
    pred_a, pred_b, pred_d, remaining, total_rows;
  raise notice '[022] residual NULL = bare memory_remember fact rows (no session, no path); reach via include_null_source recall flag';

  -- Evaluate the <5% acceptance target instead of leaving the arithmetic to
  -- whoever reads the log. This is a warning, not an exception: the backfill
  -- itself is still valid when the target is missed. The total_rows > 0 guard
  -- avoids a division by zero on an empty table.
  if total_rows > 0 and remaining::numeric / total_rows >= 0.05 then
    raise warning '[022] residual NULL rows: % of % total is at or above the 5%% acceptance target; review remaining source_agent IS NULL rows by source_type',
      remaining, total_rows;
  end if;
end$$;
178
+
179
-- Rewrite the source_agent column comment so it tells the combined 015 + 022
-- partial-backfill story, names the rows intentionally left NULL, and points
-- readers at the include_null_source recall flag.
-- The literal is split across lines for readability: adjacent string constants
-- separated by whitespace containing a newline are concatenated by the parser,
-- so the stored comment text is the same single string.
comment on column public.memory_items.source_agent is
  'Agent that produced this memory: claude|codex|gemini|grok|orchestrator|NULL. '
  'Populated at write time by per-agent SessionEnd writers from Sprint 50 onward. '
  'Pre-Sprint-50 NULL rows backfilled by migration 015 (session_summary -> claude) '
  'and migration 022 (decision/bug_fix/architecture/preference/code_context -> claude; '
  'fact w/ source_session_id -> claude; document_chunk -> orchestrator). '
  'Residual NULL = bare-call fact rows without session or path attribution; '
  'intentionally preserved per migration 015''s provenance bright line. '
  'Reach those via memory_recall include_null_source=true.';