@shadowforge0/aquifer-memory 1.3.0 → 1.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/consumers/default/index.js +17 -4
- package/consumers/mcp.js +21 -0
- package/consumers/miranda/index.js +15 -4
- package/consumers/miranda/recall-format.js +5 -3
- package/consumers/shared/config.js +8 -0
- package/consumers/shared/factory.js +2 -1
- package/consumers/shared/llm.js +1 -1
- package/consumers/shared/recall-format.js +21 -1
- package/core/aquifer.js +669 -92
- package/core/entity-state.js +483 -0
- package/core/insights.js +499 -0
- package/core/mcp-manifest.js +1 -1
- package/core/storage.js +82 -5
- package/package.json +1 -1
- package/pipeline/extract-state-changes.js +205 -0
- package/schema/001-base.sql +186 -16
- package/schema/002-entities.sql +35 -1
- package/schema/004-completion.sql +23 -7
- package/schema/005-entity-state-history.sql +87 -0
- package/schema/006-insights.sql +138 -0
- package/scripts/diagnose-fts-zh.js +37 -4
- package/scripts/drop-entity-state-history.sql +17 -0
- package/scripts/drop-insights.sql +12 -0
- package/scripts/extract-insights-from-recent-sessions.js +315 -0
- package/scripts/find-dburl-hints.js +29 -0
- package/scripts/queries.json +45 -0
- package/scripts/retro-recall-bench.js +409 -0
- package/scripts/sample-bench-queries.sql +75 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
'use strict';

// Extract temporal state-change facts from session content.
//
// Input: message array + entity context (name→id map) + LLM.
// Output: array of change objects ready to feed entity-state.applyChanges().
//
// Strict rules baked into the prompt:
// - Only completed / past-tense ("已發生" / perfect-aspect) transitions —
//   reject tentative ("I might / I was thinking about / let's consider").
// - Must have explicit time anchor ("on 2026-04-18", "as of today",
//   "this morning") — tag to session started_at if only "now".
// - attribute must be stable snake_case path (version.stable,
//   editor.preference, runtime.node.version).
// - value must be JSON-serialisable (strings, numbers, bools, nested OK).
// - confidence ∈ [0,1]; default 0.7, any < threshold is dropped by caller.

// Shape gate for attribute paths: one or more lowercase snake_case segments
// joined by dots, each segment starting with a letter (e.g. "version.stable",
// "runtime.node.version"). Anchored at both ends, so the whole string must
// conform — normalizeChange() rejects any change whose attribute mismatches.
const ATTRIBUTE_RE = /^[a-z][a-z0-9_]*(\.[a-z][a-z0-9_]*)*$/;
+
/**
 * Build the default prompt asking the model to extract temporal
 * state-change facts from a conversation.
 *
 * @param {Array<{role: string, content: any}>} messages - conversation turns;
 *   non-string content is JSON-stringified into the transcript.
 * @param {{entities?: Array<{id: any, name: string}>, sessionStartedAt?: string}} [ctx]
 *   entities are listed in the prompt as the match scope; sessionStartedAt is
 *   the anchor for implicit-"now" changes (falls back to the current time).
 * @returns {string} the complete prompt text.
 */
function defaultStateChangePrompt(messages, ctx = {}) {
  const renderTurn = (m) => {
    const body = typeof m.content === 'string' ? m.content : JSON.stringify(m.content);
    return `[${m.role}] ${body}`;
  };
  const conversation = messages.map(renderTurn).join('\n');

  let entityList;
  if (ctx.entities && ctx.entities.length) {
    entityList = ctx.entities.map(e => ` - "${e.name}" (id=${e.id})`).join('\n');
  } else {
    entityList = ' (no entities resolved yet)';
  }

  const sessionTime = ctx.sessionStartedAt || new Date().toISOString();

  return `You extract TEMPORAL STATE-CHANGE FACTS from a conversation.
A state change means "this specific attribute of this specific entity CHANGED its value at a specific moment."

## Strict rules

1. Only extract CHANGES — not first-observations, not opinions, not preferences merely mentioned.
2. Only PAST-TENSE / COMPLETED transitions. Reject tentative language:
   - REJECT: "I might try", "I was thinking about", "let's consider", "maybe", "probably", "planning to"
   - ACCEPT: "I upgraded", "switched to", "changed to", "升到", "改成", "換成", "現在用", "已經改"
3. Must have explicit TIME ANCHOR — exact date, "today", "this morning", "as of", "自 X 起"
   If only implicit "now", use ctx.sessionStartedAt as valid_from.
4. attribute MUST be a STABLE snake_case path (lowercase, dots as separators):
   - GOOD: version.stable, editor.preference, runtime.node.version, indexing.pgvector.strategy
   - BAD: "Version Stable", "My Editor", "editor-pref"
5. Each change MUST match an entity in the list below by entity_name (exact match preferred, alias OK).
   If no matching entity exists, DROP the change silently.
6. value must be JSON-serialisable. Wrap scalars plain (e.g. "1.3.0"), objects as {key: v}.

## Output

Emit ONE JSON object, no prose, no code fence, no commentary:

{
  "state_changes": [
    {
      "entity_name": "<must match list>",
      "attribute": "<snake_case.dotted.path>",
      "value": <any JSON>,
      "valid_from": "<ISO8601 timestamp>",
      "time_anchor_text": "<the phrase that anchors the time>",
      "evidence_text": "<the sentence that states the change, <= 240 chars>",
      "confidence": <0..1>
    }
  ]
}

If no changes, output: {"state_changes": []}

## Entities in scope

${entityList}

## Session started at: ${sessionTime}

## Conversation

${conversation}
`;
}
|
|
78
|
+
|
|
79
|
+
// Recover a JSON object from LLM output. Some models wrap the payload in
// triple-backtick code fences, some prepend prose ("Here is the JSON:").
// Be tolerant about that packaging, but strict about the resulting shape:
// anything that doesn't parse as JSON yields null.
function extractJsonBlock(text) {
  if (typeof text !== 'string' || !text) return null;

  let body = text.trim();

  // Prefer fenced content when a ``` / ```json block is present.
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(body);
  if (fenced) body = fenced[1].trim();

  // Keep only the outermost {...} span; surrounding prose is discarded.
  const open = body.indexOf('{');
  const close = body.lastIndexOf('}');
  if (open === -1 || close < open) return null;

  const candidate = body.substring(open, close + 1);
  try {
    return JSON.parse(candidate);
  } catch {
    return null;
  }
}
|
|
99
|
+
|
|
100
|
+
// Validate and canonicalise one raw LLM-emitted change. Returns an object
// keeping the human-facing entity name intact ({ entityName, ... }) —
// resolution to entity_id happens in the caller (enrich) after entity
// upsert, so state extraction itself doesn't need a populated id lookup.
// Returns null for anything that fails validation.
function normalizeChange(raw, ctx) {
  if (typeof raw !== 'object' || raw === null) return null;

  const entityName = typeof raw.entity_name === 'string' ? raw.entity_name.trim() : '';
  if (entityName === '') return null;

  // Optional scope whitelist: names not on it are rejected case-insensitively.
  if (ctx.scopeNames && !ctx.scopeNames.has(entityName.toLowerCase())) return null;

  const attrPath = typeof raw.attribute === 'string' ? raw.attribute.trim() : '';
  if (!ATTRIBUTE_RE.test(attrPath)) return null;

  // Explicit null is a legal value; only a missing value is rejected.
  if (raw.value === undefined) return null;

  const anchor = new Date(raw.valid_from || raw.validFrom || ctx.sessionStartedAt);
  if (Number.isNaN(anchor.getTime())) return null;

  // Non-numeric / non-finite confidence falls back to 0.7; clamp to [0, 1].
  const rawConfidence = raw.confidence;
  const confidence =
    typeof rawConfidence === 'number' && Number.isFinite(rawConfidence)
      ? Math.min(1, Math.max(0, rawConfidence))
      : 0.7;

  return {
    entityName,
    attribute: attrPath,
    value: raw.value,
    validFrom: anchor.toISOString(),
    evidenceText: typeof raw.evidence_text === 'string' ? raw.evidence_text.slice(0, 240) : '',
    confidence,
    source: 'llm',
    evidenceSessionId: ctx.evidenceSessionId || null,
    sessionRowId: ctx.sessionRowId ?? null,
  };
}
|
|
139
|
+
|
|
140
|
+
/**
 * Extract temporal state-change facts from a conversation via an LLM.
 *
 * @param {Array<{role: string, content: any}>} messages - conversation turns.
 * @param {Object} [opts]
 * @param {Function} opts.llmFn - (prompt, {maxTokens}) => string|Promise<string>; required.
 * @param {Function} [opts.promptFn] - custom prompt builder; defaults to defaultStateChangePrompt.
 * @param {Array<{id: any, name: string, aliases?: string[]}>} [opts.entities] - match scope; required non-empty.
 * @param {string} [opts.sessionStartedAt] - fallback valid_from anchor.
 * @param {string} [opts.evidenceSessionId] - text-level session id for provenance.
 * @param {number} [opts.sessionRowId] - DB row id of the session, if known.
 * @param {number} [opts.confidenceThreshold=0.7] - changes below this are dropped.
 * @param {number} [opts.timeoutMs=10000] - LLM call deadline.
 * @param {number} [opts.maxOutputTokens=600] - passed to llmFn as maxTokens.
 * @param {{warn?: Function}} [opts.logger] - optional warn-level logger.
 * @returns {Promise<{changes: Array<Object>, warnings: string[]}>} normalized
 *   changes (never throws — failures surface as warnings with empty changes).
 */
async function extractStateChanges(messages, {
  llmFn,
  promptFn,
  entities = [], // [{id, name, aliases?: []}]
  sessionStartedAt,
  evidenceSessionId,
  sessionRowId,
  confidenceThreshold = 0.7,
  timeoutMs = 10000,
  maxOutputTokens = 600,
  logger,
} = {}) {
  if (!llmFn) return { changes: [], warnings: ['no_llm'] };
  if (!entities.length) return { changes: [], warnings: ['no_entities_in_scope'] };

  // Build case-insensitive name whitelist (entity name + aliases);
  // normalizeChange drops any change whose entity_name is not on it.
  const scopeNames = new Set();
  for (const e of entities) {
    if (!e || !e.name) continue;
    scopeNames.add(String(e.name).toLowerCase());
    for (const a of (e.aliases || [])) {
      if (typeof a === 'string') scopeNames.add(a.toLowerCase());
    }
  }

  const buildPrompt = promptFn || defaultStateChangePrompt;
  const prompt = buildPrompt(messages, { entities, sessionStartedAt });

  const warnings = [];
  let rawResponse;
  // Simple timeout wrapper — llmFn signature in this repo is (prompt) => string.
  // The timer handle is cleared in `finally`: previously the losing setTimeout
  // was never cancelled, so every call (successful or not) leaked a pending
  // timer that kept the Node event loop alive for up to timeoutMs.
  let timeoutHandle;
  try {
    rawResponse = await Promise.race([
      llmFn(prompt, { maxTokens: maxOutputTokens }),
      new Promise((_, reject) => {
        timeoutHandle = setTimeout(() => reject(new Error('llm_timeout')), timeoutMs);
      }),
    ]);
  } catch (e) {
    if (logger && logger.warn) logger.warn(`[extract-state-changes] llm call failed: ${e.message}`);
    return { changes: [], warnings: [`llm_error: ${e.message}`] };
  } finally {
    clearTimeout(timeoutHandle);
  }

  const parsed = extractJsonBlock(rawResponse);
  if (!parsed || !Array.isArray(parsed.state_changes)) {
    if (logger && logger.warn) logger.warn(`[extract-state-changes] malformed output, dropping batch`);
    return { changes: [], warnings: ['malformed_json'] };
  }

  const ctx = { scopeNames, sessionStartedAt, evidenceSessionId, sessionRowId };
  const changes = [];
  let dropped = 0;
  for (const raw of parsed.state_changes) {
    const n = normalizeChange(raw, ctx);
    if (!n) { dropped++; continue; }
    if (n.confidence < confidenceThreshold) { dropped++; continue; }
    changes.push(n);
  }
  if (dropped > 0) warnings.push(`dropped_${dropped}_invalid_or_low_confidence`);
  return { changes, warnings };
}
|
|
199
|
+
|
|
200
|
+
// Public surface. extractStateChanges is the main entry point;
// defaultStateChangePrompt / extractJsonBlock / normalizeChange are exported
// so callers can customise the prompt while reusing the parsing/validation
// helpers, and so each stage can be unit-tested in isolation.
module.exports = {
  defaultStateChangePrompt,
  extractJsonBlock,
  normalizeChange,
  extractStateChanges,
};
|
package/schema/001-base.sql
CHANGED
|
@@ -3,6 +3,95 @@
|
|
|
3
3
|
|
|
4
4
|
CREATE EXTENSION IF NOT EXISTS vector;
|
|
5
5
|
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
|
6
|
+
|
|
7
|
+
-- Chinese text search: prefer pg_jieba (dict.txt.big Traditional-aware, proper
|
|
8
|
+
-- word segmentation via jiebaqry search-engine mode that expands compounds into
|
|
9
|
+
-- multi-granularity tokens). Fall back to zhparser if jieba not installed; else
|
|
10
|
+
-- migration silently uses the simple tokenizer (trigram primary path unaffected).
|
|
11
|
+
-- Extension install errors (missing .so, non-superuser, OOM, etc.) are caught
|
|
12
|
+
-- per-extension so one failure doesn't prevent the other from being tried.
|
|
13
|
+
DO $$
|
|
14
|
+
BEGIN
|
|
15
|
+
BEGIN
|
|
16
|
+
CREATE EXTENSION IF NOT EXISTS pg_jieba;
|
|
17
|
+
EXCEPTION WHEN OTHERS THEN
|
|
18
|
+
RAISE NOTICE '[aquifer] pg_jieba install skipped (%); trying zhparser', SQLERRM;
|
|
19
|
+
END;
|
|
20
|
+
BEGIN
|
|
21
|
+
CREATE EXTENSION IF NOT EXISTS zhparser;
|
|
22
|
+
EXCEPTION WHEN OTHERS THEN
|
|
23
|
+
RAISE NOTICE '[aquifer] zhparser install skipped (%); Chinese FTS will use simple tokenizer', SQLERRM;
|
|
24
|
+
END;
|
|
25
|
+
END$$;
|
|
26
|
+
|
|
27
|
+
-- Build/upgrade zhcfg in the public namespace (where Aquifer consumers resolve
|
|
28
|
+
-- `to_tsvector('zhcfg', ...)` from). State machine:
|
|
29
|
+
-- S1: jieba present, no zhcfg in public -> CREATE zhcfg (COPY = jiebaqry)
|
|
30
|
+
-- S2: jieba absent, zhparser present, no zhcfg -> CREATE zhcfg zhparser + simple mapping
|
|
31
|
+
-- S3: jieba present, zhcfg backed by zhparser -> DROP + CREATE (COPY = jiebaqry)
|
|
32
|
+
-- S4: zhcfg already jieba-backed -> noop
|
|
33
|
+
-- S9: no backing extension but zhcfg still there -> rebuild against best available, or drop
|
|
34
|
+
--
|
|
35
|
+
-- zhcfg is a database-wide object; acquire a transaction-scoped global advisory
|
|
36
|
+
-- lock so concurrent migrate() calls on different Aquifer schemas in the same
|
|
37
|
+
-- database don't race on the DROP/CREATE. The lock auto-releases at COMMIT.
|
|
38
|
+
-- Key: hash of 'aquifer:zhcfg' truncated to PG advisory-lock int4 range.
|
|
39
|
+
--
|
|
40
|
+
-- Queries restrict to the public namespace to avoid ambiguity if operators have
|
|
41
|
+
-- created same-named text search configs elsewhere.
|
|
42
|
+
DO $$
|
|
43
|
+
DECLARE
|
|
44
|
+
have_jieba boolean := EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_jieba');
|
|
45
|
+
have_zhparser boolean := EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'zhparser');
|
|
46
|
+
public_oid oid := (SELECT oid FROM pg_namespace WHERE nspname = 'public');
|
|
47
|
+
zhcfg_parser text := NULL;
|
|
48
|
+
BEGIN
|
|
49
|
+
PERFORM pg_advisory_xact_lock(1434531247); -- stable global key
|
|
50
|
+
|
|
51
|
+
IF public_oid IS NOT NULL THEN
|
|
52
|
+
SELECT p.prsname INTO zhcfg_parser
|
|
53
|
+
FROM pg_ts_config c JOIN pg_ts_parser p ON c.cfgparser = p.oid
|
|
54
|
+
WHERE c.cfgname = 'zhcfg' AND c.cfgnamespace = public_oid
|
|
55
|
+
LIMIT 1;
|
|
56
|
+
END IF;
|
|
57
|
+
|
|
58
|
+
BEGIN
|
|
59
|
+
IF have_jieba AND (zhcfg_parser IS NULL OR zhcfg_parser = 'zhparser') THEN
|
|
60
|
+
-- S1 / S3: promote to jieba
|
|
61
|
+
IF zhcfg_parser = 'zhparser' THEN
|
|
62
|
+
EXECUTE 'DROP TEXT SEARCH CONFIGURATION public.zhcfg';
|
|
63
|
+
END IF;
|
|
64
|
+
EXECUTE 'CREATE TEXT SEARCH CONFIGURATION public.zhcfg ( COPY = public.jiebaqry )';
|
|
65
|
+
|
|
66
|
+
ELSIF have_zhparser AND zhcfg_parser IS NULL THEN
|
|
67
|
+
-- S2: zhparser-only new install. `eng` covers English tokens that zhparser
|
|
68
|
+
-- emits for Latin words in mixed-language text; without it they'd be dropped.
|
|
69
|
+
EXECUTE 'CREATE TEXT SEARCH CONFIGURATION public.zhcfg (PARSER = zhparser)';
|
|
70
|
+
EXECUTE 'ALTER TEXT SEARCH CONFIGURATION public.zhcfg
|
|
71
|
+
ADD MAPPING FOR n,v,a,i,e,l,j,nr,ns,nt,nz,vd,vn,m,r,t,c,p,u,d,o,y,w,x,q,b,k,s,f,h,g,eng WITH simple';
|
|
72
|
+
|
|
73
|
+
ELSIF NOT have_jieba AND NOT have_zhparser AND zhcfg_parser IS NOT NULL THEN
|
|
74
|
+
-- S9: backing extension dropped but zhcfg stayed; any `to_tsvector('zhcfg',...)`
|
|
75
|
+
-- would throw "parser does not exist" and break the FTS trigger.
|
|
76
|
+
-- Safer to remove zhcfg and let consumers fall back to 'simple'.
|
|
77
|
+
EXECUTE 'DROP TEXT SEARCH CONFIGURATION public.zhcfg';
|
|
78
|
+
RAISE WARNING '[aquifer] zhcfg removed: neither pg_jieba nor zhparser is installed; Chinese FTS falls back to simple';
|
|
79
|
+
|
|
80
|
+
ELSIF NOT have_jieba AND have_zhparser AND zhcfg_parser NOT IN ('zhparser') THEN
|
|
81
|
+
-- S9 partial: jieba gone but zhparser available; rebuild on zhparser.
|
|
82
|
+
EXECUTE 'DROP TEXT SEARCH CONFIGURATION public.zhcfg';
|
|
83
|
+
EXECUTE 'CREATE TEXT SEARCH CONFIGURATION public.zhcfg (PARSER = zhparser)';
|
|
84
|
+
EXECUTE 'ALTER TEXT SEARCH CONFIGURATION public.zhcfg
|
|
85
|
+
ADD MAPPING FOR n,v,a,i,e,l,j,nr,ns,nt,nz,vd,vn,m,r,t,c,p,u,d,o,y,w,x,q,b,k,s,f,h,g,eng WITH simple';
|
|
86
|
+
RAISE WARNING '[aquifer] zhcfg rebuilt on zhparser: pg_jieba no longer installed';
|
|
87
|
+
END IF;
|
|
88
|
+
EXCEPTION WHEN OTHERS THEN
|
|
89
|
+
-- Ownership mismatch, concurrent-modify race, dependency blocking DROP, etc.
|
|
90
|
+
-- Don't abort the entire migrate(); leave zhcfg as-is and warn.
|
|
91
|
+
RAISE WARNING '[aquifer] zhcfg (re)build skipped (%); existing config left untouched', SQLERRM;
|
|
92
|
+
END;
|
|
93
|
+
END$$;
|
|
94
|
+
|
|
6
95
|
CREATE SCHEMA IF NOT EXISTS ${schema};
|
|
7
96
|
|
|
8
97
|
-- =========================================================================
|
|
@@ -61,7 +150,9 @@ CREATE TABLE IF NOT EXISTS ${schema}.session_summaries (
|
|
|
61
150
|
ended_at TIMESTAMPTZ,
|
|
62
151
|
summary_text TEXT,
|
|
63
152
|
structured_summary JSONB NOT NULL DEFAULT '{}',
|
|
64
|
-
|
|
153
|
+
-- Sized so HNSW can build at migrate time; 1024 matches ollama bge-m3 default.
|
|
154
|
+
-- Coerce DO block below upgrades pre-1.5.2 unsized columns.
|
|
155
|
+
embedding vector(1024),
|
|
65
156
|
search_tsv TSVECTOR,
|
|
66
157
|
search_text TEXT,
|
|
67
158
|
access_count INT NOT NULL DEFAULT 0,
|
|
@@ -99,18 +190,48 @@ CREATE INDEX IF NOT EXISTS idx_summaries_embedding
|
|
|
99
190
|
ON ${schema}.session_summaries (session_row_id)
|
|
100
191
|
WHERE embedding IS NOT NULL;
|
|
101
192
|
|
|
193
|
+
-- Coerce pre-1.5.2 unsized `vector` column to sized so HNSW can be built.
|
|
194
|
+
-- pgvector requires a dim on the COLUMN, not just the data. Dim priority:
|
|
195
|
+
-- existing row dim > `aquifer.embedding_dim` GUC > 1024 default.
|
|
196
|
+
DO $$
|
|
197
|
+
DECLARE
|
|
198
|
+
is_unsized BOOLEAN;
|
|
199
|
+
existing_dim INT;
|
|
200
|
+
target_dim INT;
|
|
201
|
+
BEGIN
|
|
202
|
+
SELECT format_type(atttypid, atttypmod) = 'vector'
|
|
203
|
+
INTO is_unsized
|
|
204
|
+
FROM pg_attribute
|
|
205
|
+
WHERE attrelid = '${schema}.session_summaries'::regclass
|
|
206
|
+
AND attname = 'embedding';
|
|
207
|
+
|
|
208
|
+
IF is_unsized THEN
|
|
209
|
+
EXECUTE 'SELECT vector_dims(embedding) FROM ${schema}.session_summaries WHERE embedding IS NOT NULL LIMIT 1'
|
|
210
|
+
INTO existing_dim;
|
|
211
|
+
target_dim := COALESCE(
|
|
212
|
+
existing_dim,
|
|
213
|
+
NULLIF(current_setting('aquifer.embedding_dim', true), '')::int,
|
|
214
|
+
1024
|
|
215
|
+
);
|
|
216
|
+
EXECUTE 'ALTER TABLE ${schema}.session_summaries ALTER COLUMN embedding TYPE vector('
|
|
217
|
+
|| target_dim::text
|
|
218
|
+
|| ') USING embedding::vector('
|
|
219
|
+
|| target_dim::text
|
|
220
|
+
|| ')';
|
|
221
|
+
RAISE NOTICE '[aquifer] session_summaries.embedding coerced from unsized vector to vector(%)', target_dim;
|
|
222
|
+
END IF;
|
|
223
|
+
END$$;
|
|
224
|
+
|
|
102
225
|
-- HNSW approximate nearest-neighbor index for cosine-distance vector search.
|
|
103
|
-
--
|
|
104
|
-
--
|
|
105
|
-
--
|
|
106
|
-
--
|
|
226
|
+
-- Column is sized via CREATE TABLE or the coerce block above, so the index
|
|
227
|
+
-- builds on fresh installs too. Safety-net EXCEPTION handlers stay for the
|
|
228
|
+
-- genuine recoverable failures; invalid_parameter_value is intentionally
|
|
229
|
+
-- NOT caught — it used to mask the unsized-column schema bug.
|
|
107
230
|
DO $$
|
|
108
231
|
BEGIN
|
|
109
232
|
BEGIN
|
|
110
233
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_summaries_embedding_hnsw ON ${schema}.session_summaries USING hnsw (embedding vector_cosine_ops)';
|
|
111
234
|
EXCEPTION
|
|
112
|
-
WHEN invalid_parameter_value THEN
|
|
113
|
-
RAISE NOTICE '[aquifer] HNSW index on session_summaries.embedding deferred; re-run migrate() after the first embedded row';
|
|
114
235
|
WHEN feature_not_supported THEN
|
|
115
236
|
RAISE NOTICE '[aquifer] HNSW not available on this pgvector; upgrade to >= 0.5.0 for index-accelerated vector search';
|
|
116
237
|
WHEN out_of_memory THEN
|
|
@@ -155,11 +276,28 @@ BEGIN
|
|
|
155
276
|
INTO facts_text
|
|
156
277
|
FROM jsonb_array_elements(COALESCE(ss->'important_facts', '[]'::jsonb)) AS elem;
|
|
157
278
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
279
|
+
-- Use zhcfg if available (Chinese segmentation — pg_jieba jiebaqry on new
|
|
280
|
+
-- installs, zhparser as legacy fallback; zhcfg name is a stable indirection
|
|
281
|
+
-- managed by the DO block above). Else fall back to simple tokenizer.
|
|
282
|
+
-- The per-row IF EXISTS lookup hits a tiny fully-cached system catalog
|
|
283
|
+
-- (pg_ts_config, ~12 rows) — effectively free. Chose this over migrate-time
|
|
284
|
+
-- codegen because installing pg_jieba POST-install immediately benefits new
|
|
285
|
+
-- inserts without requiring a manual re-migrate.
|
|
286
|
+
IF EXISTS (SELECT 1 FROM pg_ts_config
|
|
287
|
+
WHERE cfgname = 'zhcfg'
|
|
288
|
+
AND cfgnamespace = 'public'::regnamespace) THEN
|
|
289
|
+
NEW.search_tsv :=
|
|
290
|
+
setweight(to_tsvector('zhcfg', title_text), 'A') ||
|
|
291
|
+
setweight(to_tsvector('zhcfg', overview_text || ' ' || topics_text || ' ' || decisions_text), 'B') ||
|
|
292
|
+
setweight(to_tsvector('zhcfg', COALESCE(NEW.summary_text, '')), 'C') ||
|
|
293
|
+
setweight(to_tsvector('zhcfg', open_loops_text || ' ' || facts_text), 'D');
|
|
294
|
+
ELSE
|
|
295
|
+
NEW.search_tsv :=
|
|
296
|
+
setweight(to_tsvector('simple', title_text), 'A') ||
|
|
297
|
+
setweight(to_tsvector('simple', overview_text || ' ' || topics_text || ' ' || decisions_text), 'B') ||
|
|
298
|
+
setweight(to_tsvector('simple', COALESCE(NEW.summary_text, '')), 'C') ||
|
|
299
|
+
setweight(to_tsvector('simple', open_loops_text || ' ' || facts_text), 'D');
|
|
300
|
+
END IF;
|
|
163
301
|
|
|
164
302
|
NEW.search_text :=
|
|
165
303
|
title_text || ' ' || overview_text || ' ' || topics_text || ' ' ||
|
|
@@ -198,7 +336,9 @@ CREATE TABLE IF NOT EXISTS ${schema}.turn_embeddings (
|
|
|
198
336
|
role TEXT NOT NULL DEFAULT 'user' CHECK (role = 'user'),
|
|
199
337
|
content_text TEXT NOT NULL,
|
|
200
338
|
content_hash TEXT NOT NULL,
|
|
201
|
-
|
|
339
|
+
-- Sized so HNSW can build at migrate time. Coerce DO block below upgrades
|
|
340
|
+
-- pre-1.5.2 unsized columns.
|
|
341
|
+
embedding vector(1024) NOT NULL,
|
|
202
342
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
203
343
|
UNIQUE (session_row_id, message_index)
|
|
204
344
|
);
|
|
@@ -209,15 +349,45 @@ CREATE INDEX IF NOT EXISTS idx_turn_emb_session_row
|
|
|
209
349
|
CREATE INDEX IF NOT EXISTS idx_turn_emb_tenant_agent
|
|
210
350
|
ON ${schema}.turn_embeddings (tenant_id, agent_id, source);
|
|
211
351
|
|
|
352
|
+
-- Coerce pre-1.5.2 unsized `vector` column for turn_embeddings.
|
|
353
|
+
-- NOT NULL so every row has a dim; existing_dim should always resolve.
|
|
354
|
+
DO $$
|
|
355
|
+
DECLARE
|
|
356
|
+
is_unsized BOOLEAN;
|
|
357
|
+
existing_dim INT;
|
|
358
|
+
target_dim INT;
|
|
359
|
+
BEGIN
|
|
360
|
+
SELECT format_type(atttypid, atttypmod) = 'vector'
|
|
361
|
+
INTO is_unsized
|
|
362
|
+
FROM pg_attribute
|
|
363
|
+
WHERE attrelid = '${schema}.turn_embeddings'::regclass
|
|
364
|
+
AND attname = 'embedding';
|
|
365
|
+
|
|
366
|
+
IF is_unsized THEN
|
|
367
|
+
EXECUTE 'SELECT vector_dims(embedding) FROM ${schema}.turn_embeddings WHERE embedding IS NOT NULL LIMIT 1'
|
|
368
|
+
INTO existing_dim;
|
|
369
|
+
target_dim := COALESCE(
|
|
370
|
+
existing_dim,
|
|
371
|
+
NULLIF(current_setting('aquifer.embedding_dim', true), '')::int,
|
|
372
|
+
1024
|
|
373
|
+
);
|
|
374
|
+
EXECUTE 'ALTER TABLE ${schema}.turn_embeddings ALTER COLUMN embedding TYPE vector('
|
|
375
|
+
|| target_dim::text
|
|
376
|
+
|| ') USING embedding::vector('
|
|
377
|
+
|| target_dim::text
|
|
378
|
+
|| ')';
|
|
379
|
+
RAISE NOTICE '[aquifer] turn_embeddings.embedding coerced from unsized vector to vector(%)', target_dim;
|
|
380
|
+
END IF;
|
|
381
|
+
END$$;
|
|
382
|
+
|
|
212
383
|
-- HNSW approximate nearest-neighbor index for turn-level vector search.
|
|
213
|
-
-- See notes on session_summaries.embedding HNSW above.
|
|
384
|
+
-- See notes on session_summaries.embedding HNSW above. invalid_parameter_value
|
|
385
|
+
-- intentionally NOT caught — it used to mask the unsized-column schema bug.
|
|
214
386
|
DO $$
|
|
215
387
|
BEGIN
|
|
216
388
|
BEGIN
|
|
217
389
|
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_turn_emb_embedding_hnsw ON ${schema}.turn_embeddings USING hnsw (embedding vector_cosine_ops)';
|
|
218
390
|
EXCEPTION
|
|
219
|
-
WHEN invalid_parameter_value THEN
|
|
220
|
-
RAISE NOTICE '[aquifer] HNSW index on turn_embeddings.embedding deferred; re-run migrate() after the first embedded row';
|
|
221
391
|
WHEN feature_not_supported THEN
|
|
222
392
|
RAISE NOTICE '[aquifer] HNSW not available on this pgvector; upgrade to >= 0.5.0 for index-accelerated vector search';
|
|
223
393
|
WHEN out_of_memory THEN
|
package/schema/002-entities.sql
CHANGED
|
@@ -23,7 +23,10 @@ CREATE TABLE IF NOT EXISTS ${schema}.entities (
|
|
|
23
23
|
entity_scope TEXT NOT NULL DEFAULT 'default',
|
|
24
24
|
created_by TEXT,
|
|
25
25
|
metadata JSONB NOT NULL DEFAULT '{}',
|
|
26
|
-
embedding
|
|
26
|
+
-- Sized so future HNSW index on entities.embedding builds cleanly. No HNSW
|
|
27
|
+
-- currently — entity lookup is name-trgm, not vector. Coerce block below
|
|
28
|
+
-- upgrades pre-1.5.2 installs.
|
|
29
|
+
embedding vector(1024),
|
|
27
30
|
first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
28
31
|
last_seen_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
29
32
|
);
|
|
@@ -48,6 +51,37 @@ BEGIN
|
|
|
48
51
|
END$$;
|
|
49
52
|
ALTER TABLE ${schema}.entities ALTER COLUMN entity_scope SET NOT NULL;
|
|
50
53
|
|
|
54
|
+
-- Coerce pre-1.5.2 unsized `vector` column to sized for HNSW-ready shape.
|
|
55
|
+
-- Mirrors the session_summaries / turn_embeddings / insights coerce blocks.
|
|
56
|
+
DO $$
|
|
57
|
+
DECLARE
|
|
58
|
+
is_unsized BOOLEAN;
|
|
59
|
+
existing_dim INT;
|
|
60
|
+
target_dim INT;
|
|
61
|
+
BEGIN
|
|
62
|
+
SELECT format_type(atttypid, atttypmod) = 'vector'
|
|
63
|
+
INTO is_unsized
|
|
64
|
+
FROM pg_attribute
|
|
65
|
+
WHERE attrelid = '${schema}.entities'::regclass
|
|
66
|
+
AND attname = 'embedding';
|
|
67
|
+
|
|
68
|
+
IF is_unsized THEN
|
|
69
|
+
EXECUTE 'SELECT vector_dims(embedding) FROM ${schema}.entities WHERE embedding IS NOT NULL LIMIT 1'
|
|
70
|
+
INTO existing_dim;
|
|
71
|
+
target_dim := COALESCE(
|
|
72
|
+
existing_dim,
|
|
73
|
+
NULLIF(current_setting('aquifer.embedding_dim', true), '')::int,
|
|
74
|
+
1024
|
|
75
|
+
);
|
|
76
|
+
EXECUTE 'ALTER TABLE ${schema}.entities ALTER COLUMN embedding TYPE vector('
|
|
77
|
+
|| target_dim::text
|
|
78
|
+
|| ') USING embedding::vector('
|
|
79
|
+
|| target_dim::text
|
|
80
|
+
|| ')';
|
|
81
|
+
RAISE NOTICE '[aquifer] entities.embedding coerced from unsized vector to vector(%)', target_dim;
|
|
82
|
+
END IF;
|
|
83
|
+
END$$;
|
|
84
|
+
|
|
51
85
|
-- Unique constraint: entity identity is (tenant, name, scope)
|
|
52
86
|
-- Drop legacy agent-based constraint if it exists
|
|
53
87
|
DROP INDEX IF EXISTS ${schema}.idx_entities_tenant_name_agent;
|
|
@@ -90,7 +90,11 @@ LANGUAGE plpgsql
|
|
|
90
90
|
AS $$
|
|
91
91
|
BEGIN
|
|
92
92
|
NEW.search_text := COALESCE(NEW.text, '') || ' ' || COALESCE(NEW.metadata::text, '');
|
|
93
|
-
|
|
93
|
+
IF EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'zhcfg') THEN
|
|
94
|
+
NEW.search_tsv := setweight(to_tsvector('zhcfg', COALESCE(NEW.text, '')), 'A');
|
|
95
|
+
ELSE
|
|
96
|
+
NEW.search_tsv := setweight(to_tsvector('simple', COALESCE(NEW.text, '')), 'A');
|
|
97
|
+
END IF;
|
|
94
98
|
RETURN NEW;
|
|
95
99
|
END;
|
|
96
100
|
$$;
|
|
@@ -184,9 +188,15 @@ BEGIN
|
|
|
184
188
|
COALESCE(NEW.text, '') || ' ' ||
|
|
185
189
|
COALESCE(NEW.metadata::text, '');
|
|
186
190
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
191
|
+
IF EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'zhcfg') THEN
|
|
192
|
+
NEW.search_tsv :=
|
|
193
|
+
setweight(to_tsvector('zhcfg', COALESCE(NEW.category, '')), 'B') ||
|
|
194
|
+
setweight(to_tsvector('zhcfg', COALESCE(NEW.text, '')), 'A');
|
|
195
|
+
ELSE
|
|
196
|
+
NEW.search_tsv :=
|
|
197
|
+
setweight(to_tsvector('simple', COALESCE(NEW.category, '')), 'B') ||
|
|
198
|
+
setweight(to_tsvector('simple', COALESCE(NEW.text, '')), 'A');
|
|
199
|
+
END IF;
|
|
190
200
|
|
|
191
201
|
RETURN NEW;
|
|
192
202
|
END;
|
|
@@ -310,9 +320,15 @@ BEGIN
|
|
|
310
320
|
COALESCE(NEW.reason_text, '') || ' ' ||
|
|
311
321
|
COALESCE(NEW.metadata::text, '');
|
|
312
322
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
323
|
+
IF EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'zhcfg') THEN
|
|
324
|
+
NEW.search_tsv :=
|
|
325
|
+
setweight(to_tsvector('zhcfg', COALESCE(NEW.decision_text, '')), 'A') ||
|
|
326
|
+
setweight(to_tsvector('zhcfg', COALESCE(NEW.reason_text, '')), 'B');
|
|
327
|
+
ELSE
|
|
328
|
+
NEW.search_tsv :=
|
|
329
|
+
setweight(to_tsvector('simple', COALESCE(NEW.decision_text, '')), 'A') ||
|
|
330
|
+
setweight(to_tsvector('simple', COALESCE(NEW.reason_text, '')), 'B');
|
|
331
|
+
END IF;
|
|
316
332
|
|
|
317
333
|
RETURN NEW;
|
|
318
334
|
END;
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
-- entity_state_history: temporal state-change tracking on entities.
--
-- Captures discrete attribute transitions (e.g. version.stable=1.2.1 -> 1.3.0,
-- editor.preference=vim -> nvim). Designed as additive overlay on the entities
-- table; DROP-clean — no triggers/functions/views, removing this table leaves
-- the rest of Aquifer untouched.
--
-- See spec.md Q3 and ~/.claude/develop-runs/20260419-142432-aquifer-memory-routes/.

CREATE TABLE IF NOT EXISTS ${schema}.entity_state_history (
  id BIGSERIAL PRIMARY KEY,
  tenant_id TEXT NOT NULL DEFAULT 'default',
  agent_id TEXT NOT NULL DEFAULT 'main',
  -- Hard dependency: state rows are deleted together with their entity.
  entity_id BIGINT NOT NULL
    REFERENCES ${schema}.entities(id) ON DELETE CASCADE,
  -- Soft provenance link: deleting the source session keeps the fact,
  -- only the pointer is nulled.
  session_row_id BIGINT
    REFERENCES ${schema}.sessions(id) ON DELETE SET NULL,
  evidence_session_id TEXT,
  attribute TEXT NOT NULL CHECK (btrim(attribute) <> ''),
  value JSONB NOT NULL,
  valid_from TIMESTAMPTZ NOT NULL,
  valid_to TIMESTAMPTZ,
  evidence_text TEXT NOT NULL DEFAULT '',
  -- NUMERIC(4,3): exact decimal with three fractional digits; the CHECK
  -- constrains it to [0, 1].
  confidence NUMERIC(4,3) NOT NULL DEFAULT 0.7
    CHECK (confidence >= 0 AND confidence <= 1),
  source TEXT NOT NULL DEFAULT 'llm'
    CHECK (source IN ('llm', 'manual', 'infra')),
  idempotency_key TEXT,
  -- Self-reference to the row this one closed; nulled if that row is removed.
  supersedes_state_id BIGINT
    REFERENCES ${schema}.entity_state_history(id) ON DELETE SET NULL,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  CHECK (valid_to IS NULL OR valid_to > valid_from)
);

-- Partial UNIQUE: only one "current" (valid_to IS NULL) row per
-- (tenant, agent, entity, attribute). This is the temporal invariant —
-- two open intervals on the same key would mean the table is corrupt.
CREATE UNIQUE INDEX IF NOT EXISTS idx_entity_state_history_current
  ON ${schema}.entity_state_history (tenant_id, agent_id, entity_id, attribute)
  WHERE valid_to IS NULL;

-- Idempotency: same caller-supplied key writes once. Partial allows NULL keys
-- (manual writes don't always need them).
CREATE UNIQUE INDEX IF NOT EXISTS idx_entity_state_history_idempotency
  ON ${schema}.entity_state_history (idempotency_key)
  WHERE idempotency_key IS NOT NULL;

-- Hot path: history-by-attribute timeline scan, newest-first.
CREATE INDEX IF NOT EXISTS idx_entity_state_history_entity_attr_time
  ON ${schema}.entity_state_history
  (tenant_id, agent_id, entity_id, attribute, valid_from DESC, id DESC);

-- Hot path: full history for an entity (no attribute filter).
CREATE INDEX IF NOT EXISTS idx_entity_state_history_entity_time
  ON ${schema}.entity_state_history
  (tenant_id, agent_id, entity_id, valid_from DESC, id DESC);

-- Diagnostic: trace all state changes captured from a single session.
CREATE INDEX IF NOT EXISTS idx_entity_state_history_evidence_session
  ON ${schema}.entity_state_history
  (tenant_id, agent_id, evidence_session_id, created_at DESC)
  WHERE evidence_session_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_entity_state_history_session_row
  ON ${schema}.entity_state_history (session_row_id)
  WHERE session_row_id IS NOT NULL;

COMMENT ON TABLE ${schema}.entity_state_history IS
  'Bi-temporal state changes on entities. Each row = one (entity, attribute) value valid over [valid_from, valid_to). NULL valid_to = current. supersedes_state_id chains supersession history.';

COMMENT ON COLUMN ${schema}.entity_state_history.attribute IS
  'Stable snake_case path identifying what changed (e.g. version.stable, editor.preference, runtime.node.version). Caller-defined; treat as opaque key.';

COMMENT ON COLUMN ${schema}.entity_state_history.valid_from IS
  'When the new value became true in the real world (not when it was observed). Use evidence anchor; fall back to session started_at if unspecified.';

COMMENT ON COLUMN ${schema}.entity_state_history.valid_to IS
  'NULL = currently valid. Otherwise, the timestamp at which a successor row took over. Closed intervals must satisfy valid_to > valid_from.';

COMMENT ON COLUMN ${schema}.entity_state_history.idempotency_key IS
  'Caller-supplied dedupe key. Default: sha256(tenant, agent, entity, attribute, canonical_json(value), valid_from, source). Replay safe.';

COMMENT ON COLUMN ${schema}.entity_state_history.supersedes_state_id IS
  'Chain pointer to the row this one closed (set valid_to on). NULL if this is the first known value for (entity, attribute).';

COMMENT ON COLUMN ${schema}.entity_state_history.evidence_session_id IS
  'Session that produced this evidence (text-level session_id, not session_row_id). For audit / re-extraction.';