@shadowforge0/aquifer-memory 1.0.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +29 -20
  2. package/consumers/claude-code.js +117 -0
  3. package/consumers/cli.js +17 -0
  4. package/consumers/default/daily-entries.js +196 -0
  5. package/consumers/default/index.js +282 -0
  6. package/consumers/default/prompts/summary.js +153 -0
  7. package/consumers/mcp.js +3 -23
  8. package/consumers/miranda/context-inject.js +119 -0
  9. package/consumers/miranda/daily-entries.js +224 -0
  10. package/consumers/miranda/index.js +353 -0
  11. package/consumers/miranda/instance.js +55 -0
  12. package/consumers/miranda/llm.js +99 -0
  13. package/consumers/miranda/prompts/summary.js +303 -0
  14. package/consumers/miranda/recall-format.js +74 -0
  15. package/consumers/miranda/workspace-files.js +91 -0
  16. package/consumers/openclaw-ext/index.js +38 -0
  17. package/consumers/openclaw-ext/openclaw.plugin.json +9 -0
  18. package/consumers/openclaw-ext/package.json +10 -0
  19. package/consumers/openclaw-plugin.js +66 -74
  20. package/consumers/opencode.js +21 -24
  21. package/consumers/shared/autodetect.js +64 -0
  22. package/consumers/shared/entity-parser.js +119 -0
  23. package/consumers/shared/ingest.js +148 -0
  24. package/consumers/shared/llm-autodetect.js +137 -0
  25. package/consumers/shared/normalize.js +129 -0
  26. package/consumers/shared/recall-format.js +110 -0
  27. package/core/aquifer.js +180 -71
  28. package/core/entity.js +1 -3
  29. package/core/storage.js +86 -28
  30. package/docs/postprocess-contract.md +132 -0
  31. package/index.js +9 -1
  32. package/package.json +23 -2
  33. package/pipeline/_http.js +1 -1
  34. package/pipeline/consolidation/apply.js +176 -0
  35. package/pipeline/consolidation/index.js +21 -0
  36. package/pipeline/extract-entities.js +2 -2
  37. package/pipeline/rerank.js +1 -1
  38. package/pipeline/summarize.js +4 -1
  39. package/schema/001-base.sql +61 -24
  40. package/schema/002-entities.sql +17 -3
  41. package/schema/004-facts.sql +67 -0
  42. package/scripts/diagnose-fts-zh.js +168 -134
  43. package/scripts/diagnose-vector.js +188 -0
  44. package/scripts/install-openclaw.sh +59 -0
  45. package/scripts/smoke.mjs +2 -2
@@ -0,0 +1,67 @@
1
+ -- Aquifer facts / consolidation extension
2
+ -- Requires: 001-base.sql applied first
3
+ -- Usage: replace ${schema} with actual schema name
4
+ --
5
+ -- Facts store long-lived subject/statement pairs with a lifecycle:
6
+ -- candidate → active → (stale | archived | superseded)
7
+ -- Consumers write candidates during enrich (via writeFactCandidates).
8
+ -- consolidate() then promotes / updates / confirms / archives them.
9
+
10
+ -- =========================================================================
11
+ -- Facts: long-lived current-state statements per (subject, agent)
12
+ -- =========================================================================
13
+ CREATE TABLE IF NOT EXISTS ${schema}.facts (
14
+ id BIGSERIAL PRIMARY KEY,
15
+ tenant_id TEXT NOT NULL DEFAULT 'default',
16
+ subject_key TEXT NOT NULL,
17
+ subject_label TEXT NOT NULL,
18
+ statement TEXT NOT NULL,
19
+ status TEXT NOT NULL DEFAULT 'candidate'
20
+ CHECK (status IN ('candidate','active','stale','archived','superseded')),
21
+ importance SMALLINT NOT NULL DEFAULT 5,
22
+ source_session_id TEXT,
23
+ agent_id TEXT NOT NULL DEFAULT 'main',
24
+ evidence JSONB NOT NULL DEFAULT '[]'::jsonb,
25
+ superseded_by BIGINT REFERENCES ${schema}.facts(id) ON DELETE SET NULL,
26
+ first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
27
+ last_confirmed_at TIMESTAMPTZ NOT NULL DEFAULT now()
28
+ );
29
+
30
+ -- Migration: add tenant_id if upgrading from a legacy facts table (no tenant column).
31
+ ALTER TABLE ${schema}.facts ADD COLUMN IF NOT EXISTS tenant_id TEXT NOT NULL DEFAULT 'default';
32
+
33
+ -- At most one active fact per (tenant, subject, agent)
34
+ CREATE UNIQUE INDEX IF NOT EXISTS idx_facts_active_subject
35
+ ON ${schema}.facts (tenant_id, subject_key, agent_id)
36
+ WHERE status = 'active';
37
+
38
+ CREATE INDEX IF NOT EXISTS idx_facts_active_agent
39
+ ON ${schema}.facts (tenant_id, agent_id, importance DESC, last_confirmed_at DESC)
40
+ WHERE status = 'active';
41
+
42
+ CREATE INDEX IF NOT EXISTS idx_facts_status_created
43
+ ON ${schema}.facts (tenant_id, status, first_seen_at DESC);
44
+
45
+ CREATE INDEX IF NOT EXISTS idx_facts_subject
46
+ ON ${schema}.facts (tenant_id, subject_key);
47
+
48
+ CREATE INDEX IF NOT EXISTS idx_facts_source_session
49
+ ON ${schema}.facts (source_session_id)
50
+ WHERE source_session_id IS NOT NULL;
51
+
52
+ COMMENT ON TABLE ${schema}.facts IS 'Fact candidates and active facts per (tenant, subject, agent) with consolidation lifecycle';
53
+
54
+ -- =========================================================================
55
+ -- Fact ↔ Entity join (optional, only when entities enabled)
56
+ -- =========================================================================
57
+ CREATE TABLE IF NOT EXISTS ${schema}.fact_entities (
58
+ id BIGSERIAL PRIMARY KEY,
59
+ fact_id BIGINT NOT NULL REFERENCES ${schema}.facts(id) ON DELETE CASCADE,
60
+ entity_id BIGINT NOT NULL,
61
+ UNIQUE (fact_id, entity_id)
62
+ );
63
+
64
+ CREATE INDEX IF NOT EXISTS idx_fact_entities_entity_id
65
+ ON ${schema}.fact_entities (entity_id);
66
+
67
+ COMMENT ON TABLE ${schema}.fact_entities IS 'Join table linking facts to entities (FK to entities is soft — entities table is optional)';
@@ -1,161 +1,195 @@
1
1
  'use strict';
2
2
 
3
3
  /**
4
- * FTS 中文診斷:檢查 'simple' tokenizer 在實際中文資料上的表現
4
+ * Aquifer FTS 中文診斷
5
5
  *
6
- * 測試項目:
7
- * 1. FTS tokenization 實際 token 長什麼樣
8
- * 2. FTS recall — 常見中文查詢的命中率
9
- * 3. FTS vs vector — FTS 有沒有在幫忙還是在拖後腿
6
+ * 測 aquifer 實際搜尋主路徑(trigram ILIKE on search_text + similarity ranking)
7
+ * vs fallback 路徑(tsvector @@ plainto_tsquery('simple', q))對中文 query 的表現。
8
+ *
9
+ * env:
10
+ * DATABASE_URL — required
11
+ * AQUIFER_SCHEMA — default 'public'
12
+ * DIAGNOSE_QUERIES — comma-separated, overrides built-in set
10
13
  */
11
14
 
12
15
  const { Pool } = require('pg');
13
16
 
14
- const DB_URL = process.env.DATABASE_URL || 'postgresql://burk:790476@localhost:5432/openclaw_db';
15
- const SCHEMA = process.env.AQUIFER_SCHEMA || 'miranda';
17
+ const DB_URL = process.env.DATABASE_URL;
18
+ if (!DB_URL) {
19
+ console.error('DATABASE_URL is required');
20
+ process.exit(2);
21
+ }
22
+ const SCHEMA = process.env.AQUIFER_SCHEMA || 'public';
23
+
24
+ const DEFAULT_QUERIES = [
25
+ // latin
26
+ 'afterburn', 'bootstrap', 'session', 'recall', 'entity', 'OpenCode', 'Jenny', 'Aquifer',
27
+ // CJK short tokens — 最容易暴露 tokenizer 問題
28
+ '記憶', '時區', '去重', '架構', '修復',
29
+ // CJK phrase
30
+ '消化模式', 'daily entries',
31
+ ];
32
+ const QUERIES = process.env.DIAGNOSE_QUERIES
33
+ ? process.env.DIAGNOSE_QUERIES.split(',').map(s => s.trim()).filter(Boolean)
34
+ : DEFAULT_QUERIES;
16
35
 
17
36
  const pool = new Pool({ connectionString: DB_URL });
37
+ const qi = (s) => `"${s.replace(/"/g, '""')}"`;
18
38
 
19
- async function run() {
20
- const qi = (s) => `"${s}"`;
21
-
22
- console.log('=== FTS 中文診斷 ===\n');
23
-
24
- // 1. 看 token 分佈
25
- console.log('--- 1. Token 分析 ---');
26
- const tokenSample = await pool.query(`
27
- SELECT ss.session_id,
28
- array_length(tsvector_to_array(ss.search_tsv), 1) as token_count,
29
- left(ss.summary_text, 80) as preview
30
- FROM ${qi(SCHEMA)}.session_summaries ss
31
- WHERE ss.search_tsv IS NOT NULL
32
- ORDER BY ss.updated_at DESC
33
- LIMIT 10
34
- `);
35
-
36
- let totalTokens = 0;
37
- let sessionCount = 0;
38
- for (const r of tokenSample.rows) {
39
- totalTokens += r.token_count || 0;
40
- sessionCount++;
41
- console.log(` ${r.session_id?.slice(0, 8)} | ${r.token_count || 0} tokens | ${r.preview}`);
42
- }
43
- console.log(` avg: ${sessionCount ? Math.round(totalTokens / sessionCount) : 0} tokens/session\n`);
39
/**
 * Formats the ratio n/d as a whole-number percentage string.
 *
 * Both arguments are coerced to numbers first. Callers pass COUNT(*) results
 * straight from query rows, and node-postgres returns bigint values as
 * strings; without coercion the `=== 0` guard never matches '0' and an empty
 * table is reported as "NaN%".
 *
 * @param {number|string} n - Numerator (e.g. number of matching rows).
 * @param {number|string} d - Denominator (e.g. total rows).
 * @returns {string} '—' for 0/0, '∞%' for a non-zero numerator over 0,
 *   otherwise the rounded percentage such as '42%'.
 */
function pct(n, d) {
  const numerator = Number(n);
  const denominator = Number(d);
  if (denominator === 0) return numerator === 0 ? '—' : '∞%';
  return `${Math.round((numerator / denominator) * 100)}%`;
}
44
43
 
45
- // 2. 看一個 session 的實際 token
46
- console.log('--- 2. Token 範例(最近 session)---');
44
+ async function main() {
45
+ console.log(`=== Aquifer FTS 中文診斷 (schema=${SCHEMA}) ===\n`);
46
+
47
+ // -------------------------------------------------------------------------
48
+ // 0. 覆蓋率:search_text NULL 率 → 看 fallback 觸發比例
49
+ // -------------------------------------------------------------------------
50
+ const cov = await pool.query(`
51
+ SELECT
52
+ COUNT(*) AS total,
53
+ COUNT(*) FILTER (WHERE search_text IS NOT NULL) AS with_text,
54
+ COUNT(*) FILTER (WHERE search_tsv IS NOT NULL) AS with_tsv,
55
+ COUNT(*) FILTER (WHERE search_text IS NULL
56
+ AND search_tsv IS NOT NULL) AS tsv_only
57
+ FROM ${qi(SCHEMA)}.session_summaries
58
+ `);
59
+ const c = cov.rows[0];
60
+ console.log('--- 0. 搜尋欄位覆蓋率 ---');
61
+ console.log(` total rows : ${c.total}`);
62
+ console.log(` has search_text : ${c.with_text} (${pct(c.with_text, c.total)})`);
63
+ console.log(` has search_tsv : ${c.with_tsv} (${pct(c.with_tsv, c.total)})`);
64
+ console.log(` tsv-only (NULL search_text, falls back to FTS): ${c.tsv_only} (${pct(c.tsv_only, c.total)})\n`);
65
+
66
+ // -------------------------------------------------------------------------
67
+ // 1. Token 範例(tsvector lexeme 粒度觀察)
68
+ // -------------------------------------------------------------------------
69
+ console.log('--- 1. tsvector lexeme 粒度範例(最近 1 筆)---');
47
70
  const tokenDetail = await pool.query(`
48
- SELECT ss.session_id,
49
- array_to_string(tsvector_to_array(ss.search_tsv), ' | ') as tokens
50
- FROM ${qi(SCHEMA)}.session_summaries ss
51
- WHERE ss.search_tsv IS NOT NULL
52
- ORDER BY ss.updated_at DESC
71
+ SELECT session_id,
72
+ array_length(tsvector_to_array(search_tsv), 1) AS token_count,
73
+ array_to_string(tsvector_to_array(search_tsv), ' | ') AS tokens
74
+ FROM ${qi(SCHEMA)}.session_summaries
75
+ WHERE search_tsv IS NOT NULL
76
+ ORDER BY updated_at DESC
53
77
  LIMIT 1
54
78
  `);
55
79
  if (tokenDetail.rows[0]) {
56
- console.log(` session: ${tokenDetail.rows[0].session_id?.slice(0, 8)}`);
57
- const tokens = tokenDetail.rows[0].tokens || '';
58
- // 分類 token
59
- const all = tokens.split(' | ');
80
+ const r = tokenDetail.rows[0];
81
+ const all = (r.tokens || '').split(' | ').filter(Boolean);
60
82
  const cjk = all.filter(t => /[\u4e00-\u9fff]/.test(t));
61
83
  const latin = all.filter(t => /^[a-z0-9]/.test(t));
62
- const other = all.filter(t => !(/[\u4e00-\u9fff]/.test(t)) && !(/^[a-z0-9]/.test(t)));
63
- console.log(` total: ${all.length} | latin: ${latin.length} | cjk: ${cjk.length} | other: ${other.length}`);
64
- console.log(` CJK tokens (前 20): ${cjk.slice(0, 20).join(' | ')}`);
65
- console.log(` Latin tokens (前 20): ${latin.slice(0, 20).join(' | ')}\n`);
84
+ console.log(` session: ${String(r.session_id).slice(0, 8)} | total tokens: ${r.token_count || 0}`);
85
+ console.log(` latin: ${latin.length} | cjk-containing: ${cjk.length}`);
86
+ console.log(` CJK lexemes (前 15): ${cjk.slice(0, 15).join(' | ')}`);
87
+ console.log(` CJK lexeme 若是 phrase 級(整句無空白),簡 tokenizer 對中文短 query 會 miss\n`);
88
+ } else {
89
+ console.log(' (no rows)\n');
66
90
  }
67
91
 
68
- // 3. 中文查詢命中率測試
69
- console.log('--- 3. 中文查詢 FTS 命中率 ---');
70
- const testQueries = [
71
- 'afterburn',
72
- 'bootstrap',
73
- 'session',
74
- 'recall',
75
- '記憶',
76
- '修復',
77
- '架構',
78
- '時區',
79
- '去重',
80
- 'daily entries',
81
- 'OpenCode',
82
- 'entity',
83
- 'Jenny',
84
- 'Aquifer',
85
- '消化模式',
86
- ];
87
-
88
- // 總 session 數
89
- const totalResult = await pool.query(`
90
- SELECT COUNT(*) as cnt FROM ${qi(SCHEMA)}.session_summaries WHERE search_tsv IS NOT NULL
91
- `);
92
- const totalSessions = parseInt(totalResult.rows[0].cnt);
93
- console.log(` total sessions with FTS index: ${totalSessions}\n`);
94
-
95
- for (const q of testQueries) {
96
- const ftsResult = await pool.query(`
97
- SELECT COUNT(*) as cnt
98
- FROM ${qi(SCHEMA)}.session_summaries ss
99
- WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
100
- `, [q]);
101
- const ftsHits = parseInt(ftsResult.rows[0].cnt);
102
-
103
- // 同時看 summary_text ILIKE 能找到幾筆(ground truth)
104
- const ilikeResult = await pool.query(`
105
- SELECT COUNT(*) as cnt
106
- FROM ${qi(SCHEMA)}.session_summaries ss
107
- WHERE ss.summary_text ILIKE $1
108
- OR ss.structured_summary::text ILIKE $1
109
- `, [`%${q}%`]);
110
- const ilikeHits = parseInt(ilikeResult.rows[0].cnt);
111
-
112
- const ftsRecall = ilikeHits > 0 ? Math.round(ftsHits / ilikeHits * 100) : (ftsHits === 0 ? 100 : 0);
113
- const status = ftsHits === ilikeHits ? '✓' : (ftsHits < ilikeHits ? '✗ MISS' : '?');
114
- console.log(` "${q}" | FTS: ${ftsHits} | ILIKE: ${ilikeHits} | recall: ${ftsRecall}% | ${status}`);
115
- }
116
-
117
- // 4. FTS 對 RRF 的貢獻度
118
- console.log('\n--- 4. FTS 在 hybrid search 中的貢獻度 ---');
119
- // 跑幾個查詢,看 FTS 跟 vector 的 session 重疊率
120
- const overlapQueries = ['afterburn', 'bootstrap', '記憶', 'recall', 'entity'];
121
- for (const q of overlapQueries) {
122
- const ftsResult = await pool.query(`
123
- SELECT ss.session_id
124
- FROM ${qi(SCHEMA)}.session_summaries ss
125
- JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
126
- WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
127
- AND s.processing_status = 'succeeded'
128
- ORDER BY ts_rank(ss.search_tsv, plainto_tsquery('simple', $1)) DESC
129
- LIMIT 10
130
- `, [q]);
131
- const ftsIds = new Set(ftsResult.rows.map(r => r.session_id));
132
-
133
- // vector search (if embedding available)
134
- const embResult = await pool.query(`
135
- SELECT ss.session_id
136
- FROM ${qi(SCHEMA)}.session_summaries ss
137
- JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
138
- WHERE ss.embedding IS NOT NULL
139
- AND s.processing_status = 'succeeded'
140
- ORDER BY ss.embedding <=> (
141
- SELECT ss2.embedding FROM ${qi(SCHEMA)}.session_summaries ss2
142
- WHERE ss2.search_tsv @@ plainto_tsquery('simple', $1)
143
- ORDER BY ts_rank(ss2.search_tsv, plainto_tsquery('simple', $1)) DESC
144
- LIMIT 1
92
+ // -------------------------------------------------------------------------
93
+ // 2. 主路徑 vs fallback:binary match 比對
94
+ //
95
+ // Ground truth = search_text ILIKE '%q%'(所有源欄位拼出的純文字 superset)
96
+ // 主路徑 = search_text ILIKE(GIN trgm 加速,語意等價 ILIKE)
97
+ // Fallback = search_tsv @@ plainto_tsquery('simple', q)
98
+ // -------------------------------------------------------------------------
99
+ console.log('--- 2. 主路徑(trigram)vs fallback(tsvector)binary match ---');
100
+ console.log(' query | truth | trgm | tsv | trgm% | tsv% | tsv-extra');
101
+ console.log(' ' + '-'.repeat(82));
102
+
103
+ const rowCount = await pool.query(
104
+ `SELECT COUNT(*) AS n FROM ${qi(SCHEMA)}.session_summaries WHERE search_text IS NOT NULL`
105
+ );
106
+ const withTextN = parseInt(rowCount.rows[0].n, 10);
107
+ console.log(` (ground truth 基數:含 search_text 的 row ${withTextN})`);
108
+
109
+ const summary = [];
110
+ for (const q of QUERIES) {
111
+ const r = await pool.query(
112
+ `
113
+ WITH base AS (
114
+ SELECT search_text,
115
+ search_tsv,
116
+ (search_text ILIKE '%' || $1 || '%') AS trgm_hit,
117
+ (search_tsv @@ plainto_tsquery('simple', $2)) AS tsv_hit
118
+ FROM ${qi(SCHEMA)}.session_summaries
119
+ WHERE search_text IS NOT NULL
145
120
  )
146
- LIMIT 10
147
- `, [q]);
148
- const embIds = new Set(embResult.rows.map(r => r.session_id));
149
-
150
- const overlap = [...ftsIds].filter(id => embIds.has(id)).length;
151
- const ftsOnly = [...ftsIds].filter(id => !embIds.has(id)).length;
152
- const embOnly = [...embIds].filter(id => !ftsIds.has(id)).length;
153
-
154
- console.log(` "${q}" | FTS top10: ${ftsIds.size} | Vec top10: ${embIds.size} | overlap: ${overlap} | FTS-only: ${ftsOnly} | Vec-only: ${embOnly}`);
121
+ SELECT
122
+ COUNT(*) FILTER (WHERE trgm_hit) AS truth,
123
+ COUNT(*) FILTER (WHERE trgm_hit) AS trgm,
124
+ COUNT(*) FILTER (WHERE tsv_hit) AS tsv,
125
+ COUNT(*) FILTER (WHERE tsv_hit AND NOT trgm_hit) AS tsv_extra
126
+ FROM base
127
+ `,
128
+ [q.replace(/[%_\\]/g, '\\$&'), q]
129
+ );
130
+ const { truth, trgm, tsv, tsv_extra } = r.rows[0];
131
+ const T = parseInt(truth, 10);
132
+ const A = parseInt(trgm, 10);
133
+ const B = parseInt(tsv, 10);
134
+ const E = parseInt(tsv_extra, 10);
135
+ summary.push({ q, T, A, B, E });
136
+ console.log(
137
+ ` ${q.padEnd(19)} | ${String(T).padStart(5)} | ${String(A).padStart(5)} | ${String(B).padStart(5)} | ${pct(A, T).padStart(5)} | ${pct(B, T).padStart(5)} | ${String(E).padStart(5)}`
138
+ );
139
+ }
140
+ console.log(' (tsv-extra = tsvector 命中但 trigram 沒命中 → 通常是 0,代表 tsv 對整體搜尋無額外貢獻)\n');
141
+
142
+ // -------------------------------------------------------------------------
143
+ // 3. Ranking 品質對比:舊 ranking (similarity only) vs 新 ranking (substr-hit first)
144
+ // -------------------------------------------------------------------------
145
+ console.log('--- 3. Ranking 品質對比:top-5 substring-hit 命中率 ---');
146
+ console.log(' query | truth | old (sim only) | new (hit+sim)');
147
+ console.log(' ' + '-'.repeat(70));
148
+ for (const q of QUERIES) {
149
+ const like = q.replace(/[%_\\]/g, '\\$&');
150
+ const truthR = await pool.query(
151
+ `SELECT COUNT(*) AS n
152
+ FROM ${qi(SCHEMA)}.session_summaries
153
+ WHERE search_text ILIKE '%' || $1 || '%'`,
154
+ [like]
155
+ );
156
+ const T = parseInt(truthR.rows[0].n, 10);
157
+
158
+ const oldR = await pool.query(
159
+ `
160
+ SELECT (search_text ILIKE '%' || $1 || '%') AS substr_hit
161
+ FROM ${qi(SCHEMA)}.session_summaries
162
+ WHERE search_text IS NOT NULL
163
+ ORDER BY similarity(search_text, $2) DESC
164
+ LIMIT 5
165
+ `,
166
+ [like, q]
167
+ );
168
+ const oldHits = oldR.rows.filter(x => x.substr_hit).length;
169
+
170
+ const newR = await pool.query(
171
+ `
172
+ SELECT (search_text ILIKE '%' || $1 || '%') AS substr_hit
173
+ FROM ${qi(SCHEMA)}.session_summaries
174
+ WHERE search_text IS NOT NULL
175
+ ORDER BY
176
+ (search_text ILIKE '%' || $1 || '%') DESC,
177
+ similarity(search_text, $2) DESC
178
+ LIMIT 5
179
+ `,
180
+ [like, q]
181
+ );
182
+ const newHits = newR.rows.filter(x => x.substr_hit).length;
183
+
184
+ const expected = Math.min(5, T);
185
+ console.log(
186
+ ` ${q.padEnd(19)} | ${String(T).padStart(5)} | ${String(oldHits).padStart(3)}/5 → ${String(expected).padStart(1)}/5 ${oldHits < expected ? '✗' : '✓'} | ${String(newHits).padStart(3)}/5 ${newHits < expected ? '✗' : '✓'}`
187
+ );
155
188
  }
189
+ console.log(' (truth = 含該字串的 row 數;ideal top-5 substr-hit = min(truth, 5))');
156
190
 
157
191
  await pool.end();
158
192
  console.log('\n=== 完成 ===');
159
193
  }
160
194
 
161
- run().catch(err => { console.error(err); process.exit(1); });
195
+ main().catch(err => { console.error(err); process.exit(1); });
@@ -0,0 +1,188 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Aquifer vector recall 診斷
5
+ *
6
+ * 驗 summary-vector + turn-vector 兩路 infrastructure:
7
+ * - embedding coverage
8
+ * - vector dim 是否一致(summary vs turn)
9
+ * - self-retrieval sanity(拿自己 embedding 當 query,top-1 distance 應 ≈ 0)
10
+ *
11
+ * env:
12
+ * DATABASE_URL — required
13
+ * AQUIFER_SCHEMA — default 'public'
14
+ */
15
+
16
+ const { Pool } = require('pg');
17
+
18
+ const DB_URL = process.env.DATABASE_URL;
19
+ if (!DB_URL) {
20
+ console.error('DATABASE_URL is required');
21
+ process.exit(2);
22
+ }
23
+ const SCHEMA = process.env.AQUIFER_SCHEMA || 'public';
24
+
25
+ const pool = new Pool({ connectionString: DB_URL });
26
/**
 * Quotes a string as a SQL identifier: wraps it in double quotes and doubles
 * any embedded double quote.
 *
 * @param {string} s - Raw identifier (here: the schema name).
 * @returns {string} The quoted identifier.
 */
const qi = (s) => `"${s.replace(/"/g, '""')}"`;

/**
 * Formats the ratio n/d as a whole-number percentage string.
 *
 * Both arguments are coerced to numbers first. Callers pass COUNT(*) results
 * straight from query rows, and node-postgres returns bigint values as
 * strings; without coercion the `=== 0` guard never matches '0' and an empty
 * table is reported as "NaN%".
 *
 * @param {number|string} n - Numerator.
 * @param {number|string} d - Denominator.
 * @returns {string} '—' for 0/0, '∞%' for a non-zero numerator over 0,
 *   otherwise the rounded percentage such as '42%'.
 */
const pct = (n, d) => {
  const numerator = Number(n);
  const denominator = Number(d);
  if (denominator === 0) return numerator === 0 ? '—' : '∞%';
  return `${Math.round((numerator / denominator) * 100)}%`;
};

/**
 * Produces a one-line preview of a text value: collapses every whitespace run
 * to a single space and truncates to 70 characters.
 *
 * @param {*} s - Value to preview; falsy values yield an empty string.
 * @returns {string} The preview text.
 */
const clean = (s) => (s ? String(s).replace(/\s+/g, ' ').slice(0, 70) : '');
29
+
30
+ async function main() {
31
+ console.log(`=== Aquifer vector recall 診斷 (schema=${SCHEMA}) ===\n`);
32
+
33
+ // -------------------------------------------------------------------------
34
+ // 1. Summary embedding coverage + dim
35
+ // -------------------------------------------------------------------------
36
+ const s = (await pool.query(`
37
+ SELECT
38
+ COUNT(*) AS total,
39
+ COUNT(*) FILTER (WHERE embedding IS NOT NULL) AS with_emb,
40
+ MIN(vector_dims(embedding)) AS min_dim,
41
+ MAX(vector_dims(embedding)) AS max_dim
42
+ FROM ${qi(SCHEMA)}.session_summaries
43
+ `)).rows[0];
44
+ console.log('--- 1. session_summaries.embedding ---');
45
+ console.log(` total ${s.total} | with_emb ${s.with_emb} (${pct(s.with_emb, s.total)})`);
46
+ const summaryDim = s.min_dim;
47
+ console.log(` dim min=${s.min_dim} max=${s.max_dim}${s.min_dim !== s.max_dim ? ' ⚠ 不一致' : ''}\n`);
48
+
49
+ // -------------------------------------------------------------------------
50
+ // 2. Turn embedding coverage + dim
51
+ // -------------------------------------------------------------------------
52
+ const t = (await pool.query(`
53
+ SELECT
54
+ COUNT(*) AS total,
55
+ COUNT(DISTINCT session_row_id) AS distinct_sessions,
56
+ MIN(vector_dims(embedding)) AS min_dim,
57
+ MAX(vector_dims(embedding)) AS max_dim
58
+ FROM ${qi(SCHEMA)}.turn_embeddings
59
+ `)).rows[0];
60
+ console.log('--- 2. turn_embeddings.embedding ---');
61
+ console.log(` total turns ${t.total} | distinct sessions ${t.distinct_sessions}`);
62
+ console.log(` dim min=${t.min_dim} max=${t.max_dim}${t.min_dim !== t.max_dim ? ' ⚠ 不一致' : ''}`);
63
+ const turnDim = t.min_dim;
64
+ if (turnDim && summaryDim && turnDim !== summaryDim) {
65
+ console.log(` ⚠ summary dim ${summaryDim} != turn dim ${turnDim} → query embedding 只會對得上其中一條`);
66
+ }
67
+ console.log();
68
+
69
+ // -------------------------------------------------------------------------
70
+ // 3. 缺 turn 但有 summary 的 session 比例
71
+ // -------------------------------------------------------------------------
72
+ const gap = (await pool.query(`
73
+ SELECT
74
+ COUNT(DISTINCT ss.session_row_id) AS with_summary_emb,
75
+ COUNT(DISTINCT te.session_row_id) AS with_turn_emb,
76
+ COUNT(DISTINCT ss.session_row_id) FILTER (
77
+ WHERE te.session_row_id IS NULL
78
+ ) AS summary_no_turn
79
+ FROM ${qi(SCHEMA)}.session_summaries ss
80
+ LEFT JOIN ${qi(SCHEMA)}.turn_embeddings te
81
+ ON te.session_row_id = ss.session_row_id
82
+ WHERE ss.embedding IS NOT NULL
83
+ `)).rows[0];
84
+ console.log('--- 3. 兩路覆蓋差 ---');
85
+ console.log(` sessions with summary emb : ${gap.with_summary_emb}`);
86
+ console.log(` sessions with turn emb : ${gap.with_turn_emb}`);
87
+ console.log(` summary-only (no turns) : ${gap.summary_no_turn} (${pct(gap.summary_no_turn, gap.with_summary_emb)})`);
88
+ console.log(' (summary-only 是常見的—某些 session 沒有合適的 user turn 可 embed)\n');
89
+
90
+ // -------------------------------------------------------------------------
91
+ // 4. Self-retrieval sanity: summary vector
92
+ // 拿最近一筆 summary.embedding 當 query,top-1 應該是自己且 distance ≈ 0
93
+ // -------------------------------------------------------------------------
94
+ console.log('--- 4. Summary vector self-retrieval sanity ---');
95
+ const seedS = (await pool.query(`
96
+ SELECT s.session_id, ss.summary_text, ss.embedding
97
+ FROM ${qi(SCHEMA)}.session_summaries ss
98
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
99
+ WHERE ss.embedding IS NOT NULL
100
+ ORDER BY ss.updated_at DESC
101
+ LIMIT 1
102
+ `)).rows[0];
103
+
104
+ if (!seedS) {
105
+ console.log(' (no summary with embedding)\n');
106
+ } else {
107
+ const r = await pool.query(`
108
+ SELECT s.session_id,
109
+ (ss.embedding <=> $1::vector) AS distance,
110
+ ss.summary_text
111
+ FROM ${qi(SCHEMA)}.session_summaries ss
112
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
113
+ WHERE ss.embedding IS NOT NULL
114
+ ORDER BY ss.embedding <=> $1::vector ASC
115
+ LIMIT 5
116
+ `, [seedS.embedding]);
117
+ console.log(` seed : ${String(seedS.session_id).slice(0, 8)} | ${clean(seedS.summary_text)}`);
118
+ for (const row of r.rows) {
119
+ const mark = String(row.session_id) === String(seedS.session_id) ? ' ← self' : '';
120
+ console.log(` [${Number(row.distance).toFixed(4)}] ${String(row.session_id).slice(0, 8)} | ${clean(row.summary_text)}${mark}`);
121
+ }
122
+ const top = r.rows[0];
123
+ const selfOK = top && String(top.session_id) === String(seedS.session_id) && Number(top.distance) < 0.001;
124
+ console.log(` → self top-1 @ distance≈0: ${selfOK ? 'YES ✓' : 'NO ✗'}\n`);
125
+ }
126
+
127
+ // -------------------------------------------------------------------------
128
+ // 5. Self-retrieval sanity: turn vector
129
+ // -------------------------------------------------------------------------
130
+ console.log('--- 5. Turn vector self-retrieval sanity ---');
131
+ const seedT = (await pool.query(`
132
+ SELECT te.session_row_id, te.turn_index, te.content_text, te.embedding,
133
+ s.session_id
134
+ FROM ${qi(SCHEMA)}.turn_embeddings te
135
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = te.session_row_id
136
+ ORDER BY te.created_at DESC
137
+ LIMIT 1
138
+ `)).rows[0];
139
+
140
+ if (!seedT) {
141
+ console.log(' (no turn embeddings)\n');
142
+ } else {
143
+ const r = await pool.query(`
144
+ SELECT s.session_id, te.turn_index, te.content_text,
145
+ (te.embedding <=> $1::vector) AS distance
146
+ FROM ${qi(SCHEMA)}.turn_embeddings te
147
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = te.session_row_id
148
+ ORDER BY te.embedding <=> $1::vector ASC
149
+ LIMIT 5
150
+ `, [seedT.embedding]);
151
+ console.log(` seed : ${String(seedT.session_id).slice(0, 8)} turn=${seedT.turn_index} | ${clean(seedT.content_text)}`);
152
+ for (const row of r.rows) {
153
+ const self = String(row.session_id) === String(seedT.session_id) && row.turn_index === seedT.turn_index;
154
+ console.log(` [${Number(row.distance).toFixed(4)}] ${String(row.session_id).slice(0, 8)} turn=${row.turn_index} | ${clean(row.content_text)}${self ? ' ← self' : ''}`);
155
+ }
156
+ const top = r.rows[0];
157
+ const selfOK = top && Number(top.distance) < 0.001;
158
+ console.log(` → self top-1 @ distance≈0: ${selfOK ? 'YES ✓' : 'NO ✗'}\n`);
159
+ }
160
+
161
+ // -------------------------------------------------------------------------
162
+ // 6. 跨路比較:用同一筆 summary embedding 去 turn table 找鄰居
163
+ // 只在 dim 一致時做;看 summary 代表 vs 其最近 turn 的距離分佈
164
+ // -------------------------------------------------------------------------
165
+ if (summaryDim && turnDim && summaryDim === turnDim && seedS) {
166
+ console.log('--- 6. Cross-path:summary emb → turn search (dim 相同才跑) ---');
167
+ const r = await pool.query(`
168
+ SELECT DISTINCT ON (te.session_row_id)
169
+ s.session_id, te.turn_index,
170
+ (te.embedding <=> $1::vector) AS distance,
171
+ te.content_text
172
+ FROM ${qi(SCHEMA)}.turn_embeddings te
173
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = te.session_row_id
174
+ ORDER BY te.session_row_id, te.embedding <=> $1::vector ASC
175
+ `, [seedS.embedding]);
176
+ r.rows.sort((a, b) => Number(a.distance) - Number(b.distance));
177
+ for (const row of r.rows.slice(0, 5)) {
178
+ const mark = String(row.session_id) === String(seedS.session_id) ? ' ← same session' : '';
179
+ console.log(` [${Number(row.distance).toFixed(4)}] ${String(row.session_id).slice(0, 8)} turn=${row.turn_index} | ${clean(row.content_text)}${mark}`);
180
+ }
181
+ console.log(' (不要求 top-1 是 seed session,兩路語意不同;只看距離是否合理 ≪ 1)\n');
182
+ }
183
+
184
+ await pool.end();
185
+ console.log('=== 完成 ===');
186
+ }
187
+
188
+ main().catch(err => { console.error(err); process.exit(1); });
@@ -0,0 +1,59 @@
1
#!/usr/bin/env bash
# Aquifer — Install drop-in OpenClaw extension
#
# Usage:
#   bash scripts/install-openclaw.sh [OPENCLAW_HOME]
#
# Default OPENCLAW_HOME: $HOME/.openclaw
#
# What it does:
#   1. Creates / overwrites $OPENCLAW_HOME/extensions/aquifer-memory/
#      as a symlink to <this_package>/consumers/openclaw-ext/
#   2. Prints follow-up instructions: set the .env keys, restart the gateway.
#
# Idempotent; safe to re-run.

set -euo pipefail

# Precedence: positional argument, then the OPENCLAW_HOME environment variable,
# then the default under $HOME.
OPENCLAW_HOME="${1:-${OPENCLAW_HOME:-$HOME/.openclaw}}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
EXT_SRC="$PKG_ROOT/consumers/openclaw-ext"
EXT_DEST="$OPENCLAW_HOME/extensions/aquifer-memory"

if [[ ! -d "$EXT_SRC" ]]; then
  echo "error: $EXT_SRC not found (expected inside the Aquifer package)" >&2
  exit 1
fi

if [[ ! -d "$OPENCLAW_HOME" ]]; then
  echo "error: OPENCLAW_HOME=$OPENCLAW_HOME not found" >&2
  exit 1
fi

mkdir -p "$OPENCLAW_HOME/extensions"

# -L is tested as well as -e so that a dangling symlink left by a previous
# install is also replaced (-e is false for a broken link).
if [[ -L "$EXT_DEST" || -e "$EXT_DEST" ]]; then
  echo "note: $EXT_DEST already exists — replacing"
  rm -rf "$EXT_DEST"
fi

ln -s "$EXT_SRC" "$EXT_DEST"
echo "ok: linked $EXT_DEST → $EXT_SRC"

# Unquoted heredoc delimiter on purpose: ${OPENCLAW_HOME} is expanded so the
# instructions show the directory actually used, including one passed as the
# positional argument. The text contains no other expansions.
cat <<EOF

Next steps:
  1. Edit ${OPENCLAW_HOME}/.env and set:
       DATABASE_URL=postgresql://user:pass@host:5432/db
       EMBED_PROVIDER=ollama # or openai
       AQUIFER_LLM_PROVIDER=minimax # or openai / openrouter / opencode
       MINIMAX_API_KEY=... # (or the key for your chosen provider)
       # Optional:
       AQUIFER_SCHEMA=my_namespace
       AQUIFER_PERSONA=/path/to/host-local/persona-module
  2. Restart OpenClaw:
       systemctl --user restart openclaw-gateway
  3. Verify:
       journalctl --user -u openclaw-gateway -f | grep aquifer-memory
EOF
package/scripts/smoke.mjs CHANGED
@@ -99,8 +99,8 @@ try {
99
99
  const { Pool } = require('pg');
100
100
  const pool = new Pool({ connectionString: config.db.url });
101
101
  const schema = config.schema || 'aquifer';
102
- await pool.query(`DELETE FROM ${schema}.turn_embeddings WHERE session_id IN (SELECT id FROM ${schema}.sessions WHERE session_id = $1)`, [SESSION_ID]);
103
- await pool.query(`DELETE FROM ${schema}.session_summaries WHERE session_id IN (SELECT id FROM ${schema}.sessions WHERE session_id = $1)`, [SESSION_ID]);
102
+ await pool.query(`DELETE FROM ${schema}.turn_embeddings WHERE session_id = $1`, [SESSION_ID]);
103
+ await pool.query(`DELETE FROM ${schema}.session_summaries WHERE session_id = $1`, [SESSION_ID]);
104
104
  await pool.query(`DELETE FROM ${schema}.sessions WHERE session_id = $1`, [SESSION_ID]);
105
105
  await pool.end();
106
106
  console.log(' OK');