@shadowforge0/aquifer-memory 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/aquifer.js +23 -27
- package/core/entity.js +29 -15
- package/core/storage.js +45 -37
- package/package.json +1 -1
- package/schema/001-base.sql +11 -1
- package/scripts/diagnose-fts-zh.js +161 -0
package/core/aquifer.js
CHANGED
|
@@ -100,19 +100,6 @@ function createAquifer(config) {
|
|
|
100
100
|
const entityPromptFn = config.entities && config.entities.prompt ? config.entities.prompt : null;
|
|
101
101
|
const entityScope = (config.entities && config.entities.scope) || 'default';
|
|
102
102
|
|
|
103
|
-
// FTS config — locked to 'simple'.
|
|
104
|
-
// The search_tsv trigger always uses to_tsvector('simple', ...), so query-time
|
|
105
|
-
// config must match. Warn and override if someone passes anything else.
|
|
106
|
-
const _rawFtsConfig = config.ftsConfig || 'simple';
|
|
107
|
-
if (_rawFtsConfig !== 'simple') {
|
|
108
|
-
console.warn(
|
|
109
|
-
`[aquifer] ftsConfig '${_rawFtsConfig}' is not currently supported. ` +
|
|
110
|
-
`The search_tsv index is built with 'simple'; only 'simple' is valid at query time. ` +
|
|
111
|
-
`Overriding to 'simple'.`
|
|
112
|
-
);
|
|
113
|
-
}
|
|
114
|
-
const ftsConfig = 'simple';
|
|
115
|
-
|
|
116
103
|
// Rank weights
|
|
117
104
|
const rankWeights = {
|
|
118
105
|
rrf: 0.65,
|
|
@@ -200,21 +187,30 @@ function createAquifer(config) {
|
|
|
200
187
|
// --- lifecycle ---
|
|
201
188
|
|
|
202
189
|
async migrate() {
|
|
203
|
-
//
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
190
|
+
// Advisory lock prevents concurrent migrations across processes.
|
|
191
|
+
// Lock key is derived from schema name to allow parallel migration
|
|
192
|
+
// of different schemas in the same database.
|
|
193
|
+
const lockKey = Buffer.from(`aquifer:${schema}`).reduce((h, b) => (h * 31 + b) & 0x7fffffff, 0);
|
|
194
|
+
await pool.query('SELECT pg_advisory_lock($1)', [lockKey]);
|
|
195
|
+
try {
|
|
196
|
+
// 1. Run base DDL
|
|
197
|
+
const baseSql = loadSql('001-base.sql', schema);
|
|
198
|
+
await pool.query(baseSql);
|
|
199
|
+
|
|
200
|
+
// 2. If entities enabled, run entity DDL
|
|
201
|
+
if (entitiesEnabled) {
|
|
202
|
+
const entitySql = loadSql('002-entities.sql', schema);
|
|
203
|
+
await pool.query(entitySql);
|
|
204
|
+
}
|
|
212
205
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
206
|
+
// 3. Trust + feedback (always, not gated by entities)
|
|
207
|
+
const trustSql = loadSql('003-trust-feedback.sql', schema);
|
|
208
|
+
await pool.query(trustSql);
|
|
216
209
|
|
|
217
|
-
|
|
210
|
+
migrated = true;
|
|
211
|
+
} finally {
|
|
212
|
+
await pool.query('SELECT pg_advisory_unlock($1)', [lockKey]).catch(() => {});
|
|
213
|
+
}
|
|
218
214
|
},
|
|
219
215
|
|
|
220
216
|
async close() {
|
|
@@ -706,7 +702,7 @@ function createAquifer(config) {
|
|
|
706
702
|
const [ftsRows, embRows, turnResult] = await Promise.all([
|
|
707
703
|
runFts
|
|
708
704
|
? storage.searchSessions(pool, query, {
|
|
709
|
-
schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit, ftsConfig,
|
|
705
|
+
schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit,
|
|
710
706
|
}).catch((err) => {
|
|
711
707
|
recordSearchError('fts', err);
|
|
712
708
|
return [];
|
package/core/entity.js
CHANGED
|
@@ -222,27 +222,41 @@ async function upsertEntityRelations(pool, {
|
|
|
222
222
|
}) {
|
|
223
223
|
if (!pairs || pairs.length === 0) return { upserted: 0 };
|
|
224
224
|
const ts = occurredAt || new Date().toISOString();
|
|
225
|
-
let upserted = 0;
|
|
226
225
|
|
|
226
|
+
// Filter and normalize pairs
|
|
227
|
+
const validPairs = [];
|
|
227
228
|
for (const { srcEntityId, dstEntityId } of pairs) {
|
|
228
229
|
if (!srcEntityId || !dstEntityId || srcEntityId === dstEntityId) continue;
|
|
230
|
+
validPairs.push({
|
|
231
|
+
lo: Math.min(srcEntityId, dstEntityId),
|
|
232
|
+
hi: Math.max(srcEntityId, dstEntityId),
|
|
233
|
+
});
|
|
234
|
+
}
|
|
229
235
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
);
|
|
242
|
-
upserted++;
|
|
236
|
+
if (validPairs.length === 0) return { upserted: 0 };
|
|
237
|
+
|
|
238
|
+
// Batch insert: multi-row VALUES
|
|
239
|
+
const COLS_PER_ROW = 3;
|
|
240
|
+
const valueClauses = [];
|
|
241
|
+
const params = [];
|
|
242
|
+
|
|
243
|
+
for (const { lo, hi } of validPairs) {
|
|
244
|
+
const off = params.length;
|
|
245
|
+
params.push(lo, hi, ts);
|
|
246
|
+
valueClauses.push(`($${off+1}, $${off+2}, 1, $${off+3}, $${off+3})`);
|
|
243
247
|
}
|
|
244
248
|
|
|
245
|
-
|
|
249
|
+
await pool.query(
|
|
250
|
+
`INSERT INTO ${qi(schema)}.entity_relations
|
|
251
|
+
(src_entity_id, dst_entity_id, co_occurrence_count, first_seen_at, last_seen_at)
|
|
252
|
+
VALUES ${valueClauses.join(',\n')}
|
|
253
|
+
ON CONFLICT (src_entity_id, dst_entity_id) DO UPDATE SET
|
|
254
|
+
co_occurrence_count = ${qi(schema)}.entity_relations.co_occurrence_count + 1,
|
|
255
|
+
last_seen_at = GREATEST(${qi(schema)}.entity_relations.last_seen_at, EXCLUDED.last_seen_at)`,
|
|
256
|
+
params
|
|
257
|
+
);
|
|
258
|
+
|
|
259
|
+
return { upserted: validPairs.length };
|
|
246
260
|
}
|
|
247
261
|
|
|
248
262
|
// ---------------------------------------------------------------------------
|
package/core/storage.js
CHANGED
|
@@ -211,7 +211,7 @@ async function getMessages(pool, sessionId, agentId, { schema, tenantId } = {})
|
|
|
211
211
|
}
|
|
212
212
|
|
|
213
213
|
// ---------------------------------------------------------------------------
|
|
214
|
-
// searchSessions (FTS)
|
|
214
|
+
// searchSessions (trigram + FTS fallback)
|
|
215
215
|
// ---------------------------------------------------------------------------
|
|
216
216
|
|
|
217
217
|
async function searchSessions(pool, query, {
|
|
@@ -220,34 +220,27 @@ async function searchSessions(pool, query, {
|
|
|
220
220
|
agentId,
|
|
221
221
|
agentIds: rawAgentIds,
|
|
222
222
|
source,
|
|
223
|
-
dateFrom,
|
|
223
|
+
dateFrom,
|
|
224
224
|
dateTo,
|
|
225
225
|
limit = 20,
|
|
226
|
-
ftsConfig = 'simple',
|
|
227
226
|
} = {}) {
|
|
228
227
|
const clampedLimit = Math.max(1, Math.min(100, limit));
|
|
229
|
-
// FTS config is locked to 'simple' — the search_tsv trigger always uses
|
|
230
|
-
// to_tsvector('simple', ...) so query semantics must match. Warn callers
|
|
231
|
-
// that pass a different value rather than silently honouring it.
|
|
232
|
-
if (ftsConfig !== 'simple') {
|
|
233
|
-
console.warn(
|
|
234
|
-
`[aquifer/storage] searchSessions: ftsConfig '${ftsConfig}' ignored. ` +
|
|
235
|
-
`Only 'simple' is supported (index is built with simple tokenizer). ` +
|
|
236
|
-
`Using 'simple'.`
|
|
237
|
-
);
|
|
238
|
-
}
|
|
239
|
-
const safeFts = 'simple';
|
|
240
228
|
|
|
241
229
|
// Normalize agentId/agentIds
|
|
242
230
|
const agentIds = rawAgentIds && rawAgentIds.length > 0
|
|
243
231
|
? rawAgentIds
|
|
244
232
|
: (agentId ? [agentId] : null);
|
|
245
233
|
|
|
234
|
+
// Escape LIKE special characters in query
|
|
235
|
+
const likeQuery = query.replace(/[%_\\]/g, '\\$&');
|
|
236
|
+
|
|
237
|
+
// Primary: trigram ILIKE on search_text (works for CJK + Latin)
|
|
238
|
+
// Fallback: tsvector FTS (for installations without search_text populated)
|
|
246
239
|
const where = [
|
|
247
|
-
`ss.search_tsv @@ plainto_tsquery('${safeFts}', $1)`,
|
|
248
|
-
`s.tenant_id = $2`,
|
|
240
|
+
`(ss.search_text ILIKE '%' || $1 || '%' OR ss.search_tsv @@ plainto_tsquery('simple', $2))`,
|
|
241
|
+
`s.tenant_id = $3`,
|
|
249
242
|
];
|
|
250
|
-
const params = [query, tenantId];
|
|
243
|
+
const params = [likeQuery, query, tenantId];
|
|
251
244
|
|
|
252
245
|
if (agentIds) {
|
|
253
246
|
params.push(agentIds);
|
|
@@ -281,8 +274,10 @@ async function searchSessions(pool, query, {
|
|
|
281
274
|
ss.access_count,
|
|
282
275
|
ss.last_accessed_at,
|
|
283
276
|
ss.trust_score,
|
|
284
|
-
|
|
285
|
-
|
|
277
|
+
CASE WHEN ss.search_text IS NOT NULL
|
|
278
|
+
THEN similarity(ss.search_text, $2)
|
|
279
|
+
ELSE ts_rank(ss.search_tsv, plainto_tsquery('simple', $2))
|
|
280
|
+
END AS fts_rank
|
|
286
281
|
FROM ${qi(schema)}.sessions s
|
|
287
282
|
LEFT JOIN ${qi(schema)}.session_summaries ss ON ss.session_row_id = s.id
|
|
288
283
|
WHERE ${where.join(' AND ')}
|
|
@@ -365,32 +360,45 @@ async function upsertTurnEmbeddings(pool, sessionRowId, {
|
|
|
365
360
|
throw new Error(`turns.length (${turns.length}) !== vectors.length (${vectors.length})`);
|
|
366
361
|
}
|
|
367
362
|
|
|
363
|
+
// Batch insert: build multi-row VALUES clause
|
|
364
|
+
const COLS_PER_ROW = 10;
|
|
365
|
+
const valueClauses = [];
|
|
366
|
+
const params = [];
|
|
367
|
+
|
|
368
368
|
for (let i = 0; i < turns.length; i++) {
|
|
369
369
|
const t = turns[i];
|
|
370
370
|
const vec = vectors[i];
|
|
371
371
|
if (!vec) continue;
|
|
372
372
|
|
|
373
373
|
const contentHash = crypto.createHash('sha256').update(t.text).digest('hex').slice(0, 16);
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
embedding = CASE
|
|
383
|
-
WHEN ${qi(schema)}.turn_embeddings.content_hash = EXCLUDED.content_hash
|
|
384
|
-
THEN ${qi(schema)}.turn_embeddings.embedding
|
|
385
|
-
ELSE EXCLUDED.embedding
|
|
386
|
-
END`,
|
|
387
|
-
[
|
|
388
|
-
sessionRowId, tenantId, sessionId, agentId, source || null,
|
|
389
|
-
t.turnIndex, t.messageIndex,
|
|
390
|
-
t.text, contentHash, vecToStr(vec),
|
|
391
|
-
]
|
|
374
|
+
const off = params.length;
|
|
375
|
+
params.push(
|
|
376
|
+
sessionRowId, tenantId, sessionId, agentId, source || null,
|
|
377
|
+
t.turnIndex, t.messageIndex,
|
|
378
|
+
t.text, contentHash, vecToStr(vec),
|
|
379
|
+
);
|
|
380
|
+
valueClauses.push(
|
|
381
|
+
`($${off+1},$${off+2},$${off+3},$${off+4},$${off+5},$${off+6},$${off+7},'user',$${off+8},$${off+9},$${off+10}::vector)`
|
|
392
382
|
);
|
|
393
383
|
}
|
|
384
|
+
|
|
385
|
+
if (valueClauses.length === 0) return;
|
|
386
|
+
|
|
387
|
+
await pool.query(
|
|
388
|
+
`INSERT INTO ${qi(schema)}.turn_embeddings
|
|
389
|
+
(session_row_id, tenant_id, session_id, agent_id, source,
|
|
390
|
+
turn_index, message_index, role, content_text, content_hash, embedding)
|
|
391
|
+
VALUES ${valueClauses.join(',\n')}
|
|
392
|
+
ON CONFLICT (session_row_id, message_index) DO UPDATE SET
|
|
393
|
+
content_text = EXCLUDED.content_text,
|
|
394
|
+
content_hash = EXCLUDED.content_hash,
|
|
395
|
+
embedding = CASE
|
|
396
|
+
WHEN ${qi(schema)}.turn_embeddings.content_hash = EXCLUDED.content_hash
|
|
397
|
+
THEN ${qi(schema)}.turn_embeddings.embedding
|
|
398
|
+
ELSE EXCLUDED.embedding
|
|
399
|
+
END`,
|
|
400
|
+
params
|
|
401
|
+
);
|
|
394
402
|
}
|
|
395
403
|
|
|
396
404
|
// ---------------------------------------------------------------------------
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@shadowforge0/aquifer-memory",
|
|
3
|
-
"version": "1.0.1",
|
|
3
|
+
"version": "1.0.3",
|
|
4
4
|
"description": "PG-native long-term memory for AI agents. Turn-level embedding, hybrid RRF ranking, optional knowledge graph. MCP server, CLI, and library API.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"files": [
|
package/schema/001-base.sql
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
-- Usage: replace ${schema} with actual schema name (e.g., 'aquifer')
|
|
3
3
|
|
|
4
4
|
CREATE EXTENSION IF NOT EXISTS vector;
|
|
5
|
+
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
|
5
6
|
CREATE SCHEMA IF NOT EXISTS ${schema};
|
|
6
7
|
|
|
7
8
|
-- =========================================================================
|
|
@@ -85,6 +86,7 @@ CREATE TABLE IF NOT EXISTS ${schema}.session_summaries (
|
|
|
85
86
|
structured_summary JSONB NOT NULL DEFAULT '{}',
|
|
86
87
|
embedding vector,
|
|
87
88
|
search_tsv TSVECTOR,
|
|
89
|
+
search_text TEXT,
|
|
88
90
|
access_count INT NOT NULL DEFAULT 0,
|
|
89
91
|
last_accessed_at TIMESTAMPTZ,
|
|
90
92
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
@@ -96,6 +98,9 @@ CREATE INDEX IF NOT EXISTS idx_summaries_tenant
|
|
|
96
98
|
CREATE INDEX IF NOT EXISTS idx_summaries_search_tsv
|
|
97
99
|
ON ${schema}.session_summaries USING GIN (search_tsv);
|
|
98
100
|
|
|
101
|
+
CREATE INDEX IF NOT EXISTS idx_summaries_search_text_trgm
|
|
102
|
+
ON ${schema}.session_summaries USING GIN (search_text gin_trgm_ops);
|
|
103
|
+
|
|
99
104
|
CREATE INDEX IF NOT EXISTS idx_summaries_embedding
|
|
100
105
|
ON ${schema}.session_summaries (session_row_id)
|
|
101
106
|
WHERE embedding IS NOT NULL;
|
|
@@ -141,6 +146,11 @@ BEGIN
|
|
|
141
146
|
setweight(to_tsvector('simple', COALESCE(NEW.summary_text, '')), 'C') ||
|
|
142
147
|
setweight(to_tsvector('simple', open_loops_text || ' ' || facts_text), 'D');
|
|
143
148
|
|
|
149
|
+
NEW.search_text :=
|
|
150
|
+
title_text || ' ' || overview_text || ' ' || topics_text || ' ' ||
|
|
151
|
+
decisions_text || ' ' || COALESCE(NEW.summary_text, '') || ' ' ||
|
|
152
|
+
open_loops_text || ' ' || facts_text;
|
|
153
|
+
|
|
144
154
|
RETURN NEW;
|
|
145
155
|
END;
|
|
146
156
|
$$;
|
|
@@ -149,7 +159,7 @@ DROP TRIGGER IF EXISTS trg_session_summaries_search_tsv
|
|
|
149
159
|
ON ${schema}.session_summaries;
|
|
150
160
|
|
|
151
161
|
CREATE TRIGGER trg_session_summaries_search_tsv
|
|
152
|
-
BEFORE INSERT OR UPDATE OF summary_text, structured_summary
|
|
162
|
+
BEFORE INSERT OR UPDATE OF summary_text, structured_summary, search_text
|
|
153
163
|
ON ${schema}.session_summaries
|
|
154
164
|
FOR EACH ROW
|
|
155
165
|
EXECUTE FUNCTION ${schema}.session_summaries_search_tsv_update();
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* FTS 中文診斷:檢查 'simple' tokenizer 在實際中文資料上的表現
|
|
5
|
+
*
|
|
6
|
+
* 測試項目:
|
|
7
|
+
* 1. FTS tokenization — 實際 token 長什麼樣
|
|
8
|
+
* 2. FTS recall — 常見中文查詢的命中率
|
|
9
|
+
* 3. FTS vs vector — FTS 有沒有在幫忙還是在拖後腿
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const { Pool } = require('pg');
|
|
13
|
+
|
|
14
|
+
const DB_URL = process.env.DATABASE_URL || 'postgresql://burk:790476@localhost:5432/openclaw_db';
|
|
15
|
+
const SCHEMA = process.env.AQUIFER_SCHEMA || 'miranda';
|
|
16
|
+
|
|
17
|
+
const pool = new Pool({ connectionString: DB_URL });
|
|
18
|
+
|
|
19
|
+
async function run() {
|
|
20
|
+
const qi = (s) => `"${s}"`;
|
|
21
|
+
|
|
22
|
+
console.log('=== FTS 中文診斷 ===\n');
|
|
23
|
+
|
|
24
|
+
// 1. 看 token 分佈
|
|
25
|
+
console.log('--- 1. Token 分析 ---');
|
|
26
|
+
const tokenSample = await pool.query(`
|
|
27
|
+
SELECT ss.session_id,
|
|
28
|
+
array_length(tsvector_to_array(ss.search_tsv), 1) as token_count,
|
|
29
|
+
left(ss.summary_text, 80) as preview
|
|
30
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
31
|
+
WHERE ss.search_tsv IS NOT NULL
|
|
32
|
+
ORDER BY ss.updated_at DESC
|
|
33
|
+
LIMIT 10
|
|
34
|
+
`);
|
|
35
|
+
|
|
36
|
+
let totalTokens = 0;
|
|
37
|
+
let sessionCount = 0;
|
|
38
|
+
for (const r of tokenSample.rows) {
|
|
39
|
+
totalTokens += r.token_count || 0;
|
|
40
|
+
sessionCount++;
|
|
41
|
+
console.log(` ${r.session_id?.slice(0, 8)} | ${r.token_count || 0} tokens | ${r.preview}`);
|
|
42
|
+
}
|
|
43
|
+
console.log(` avg: ${sessionCount ? Math.round(totalTokens / sessionCount) : 0} tokens/session\n`);
|
|
44
|
+
|
|
45
|
+
// 2. 看一個 session 的實際 token
|
|
46
|
+
console.log('--- 2. Token 範例(最近 session)---');
|
|
47
|
+
const tokenDetail = await pool.query(`
|
|
48
|
+
SELECT ss.session_id,
|
|
49
|
+
array_to_string(tsvector_to_array(ss.search_tsv), ' | ') as tokens
|
|
50
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
51
|
+
WHERE ss.search_tsv IS NOT NULL
|
|
52
|
+
ORDER BY ss.updated_at DESC
|
|
53
|
+
LIMIT 1
|
|
54
|
+
`);
|
|
55
|
+
if (tokenDetail.rows[0]) {
|
|
56
|
+
console.log(` session: ${tokenDetail.rows[0].session_id?.slice(0, 8)}`);
|
|
57
|
+
const tokens = tokenDetail.rows[0].tokens || '';
|
|
58
|
+
// 分類 token
|
|
59
|
+
const all = tokens.split(' | ');
|
|
60
|
+
const cjk = all.filter(t => /[\u4e00-\u9fff]/.test(t));
|
|
61
|
+
const latin = all.filter(t => /^[a-z0-9]/.test(t));
|
|
62
|
+
const other = all.filter(t => !(/[\u4e00-\u9fff]/.test(t)) && !(/^[a-z0-9]/.test(t)));
|
|
63
|
+
console.log(` total: ${all.length} | latin: ${latin.length} | cjk: ${cjk.length} | other: ${other.length}`);
|
|
64
|
+
console.log(` CJK tokens (前 20): ${cjk.slice(0, 20).join(' | ')}`);
|
|
65
|
+
console.log(` Latin tokens (前 20): ${latin.slice(0, 20).join(' | ')}\n`);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// 3. 中文查詢命中率測試
|
|
69
|
+
console.log('--- 3. 中文查詢 FTS 命中率 ---');
|
|
70
|
+
const testQueries = [
|
|
71
|
+
'afterburn',
|
|
72
|
+
'bootstrap',
|
|
73
|
+
'session',
|
|
74
|
+
'recall',
|
|
75
|
+
'記憶',
|
|
76
|
+
'修復',
|
|
77
|
+
'架構',
|
|
78
|
+
'時區',
|
|
79
|
+
'去重',
|
|
80
|
+
'daily entries',
|
|
81
|
+
'OpenCode',
|
|
82
|
+
'entity',
|
|
83
|
+
'Jenny',
|
|
84
|
+
'Aquifer',
|
|
85
|
+
'消化模式',
|
|
86
|
+
];
|
|
87
|
+
|
|
88
|
+
// 總 session 數
|
|
89
|
+
const totalResult = await pool.query(`
|
|
90
|
+
SELECT COUNT(*) as cnt FROM ${qi(SCHEMA)}.session_summaries WHERE search_tsv IS NOT NULL
|
|
91
|
+
`);
|
|
92
|
+
const totalSessions = parseInt(totalResult.rows[0].cnt);
|
|
93
|
+
console.log(` total sessions with FTS index: ${totalSessions}\n`);
|
|
94
|
+
|
|
95
|
+
for (const q of testQueries) {
|
|
96
|
+
const ftsResult = await pool.query(`
|
|
97
|
+
SELECT COUNT(*) as cnt
|
|
98
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
99
|
+
WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
|
|
100
|
+
`, [q]);
|
|
101
|
+
const ftsHits = parseInt(ftsResult.rows[0].cnt);
|
|
102
|
+
|
|
103
|
+
// 同時看 summary_text ILIKE 能找到幾筆(ground truth)
|
|
104
|
+
const ilikeResult = await pool.query(`
|
|
105
|
+
SELECT COUNT(*) as cnt
|
|
106
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
107
|
+
WHERE ss.summary_text ILIKE $1
|
|
108
|
+
OR ss.structured_summary::text ILIKE $1
|
|
109
|
+
`, [`%${q}%`]);
|
|
110
|
+
const ilikeHits = parseInt(ilikeResult.rows[0].cnt);
|
|
111
|
+
|
|
112
|
+
const ftsRecall = ilikeHits > 0 ? Math.round(ftsHits / ilikeHits * 100) : (ftsHits === 0 ? 100 : 0);
|
|
113
|
+
const status = ftsHits === ilikeHits ? '✓' : (ftsHits < ilikeHits ? '✗ MISS' : '?');
|
|
114
|
+
console.log(` "${q}" | FTS: ${ftsHits} | ILIKE: ${ilikeHits} | recall: ${ftsRecall}% | ${status}`);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// 4. FTS 對 RRF 的貢獻度
|
|
118
|
+
console.log('\n--- 4. FTS 在 hybrid search 中的貢獻度 ---');
|
|
119
|
+
// 跑幾個查詢,看 FTS 跟 vector 的 session 重疊率
|
|
120
|
+
const overlapQueries = ['afterburn', 'bootstrap', '記憶', 'recall', 'entity'];
|
|
121
|
+
for (const q of overlapQueries) {
|
|
122
|
+
const ftsResult = await pool.query(`
|
|
123
|
+
SELECT ss.session_id
|
|
124
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
125
|
+
JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
|
|
126
|
+
WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
|
|
127
|
+
AND s.processing_status = 'succeeded'
|
|
128
|
+
ORDER BY ts_rank(ss.search_tsv, plainto_tsquery('simple', $1)) DESC
|
|
129
|
+
LIMIT 10
|
|
130
|
+
`, [q]);
|
|
131
|
+
const ftsIds = new Set(ftsResult.rows.map(r => r.session_id));
|
|
132
|
+
|
|
133
|
+
// vector search (if embedding available)
|
|
134
|
+
const embResult = await pool.query(`
|
|
135
|
+
SELECT ss.session_id
|
|
136
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
137
|
+
JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
|
|
138
|
+
WHERE ss.embedding IS NOT NULL
|
|
139
|
+
AND s.processing_status = 'succeeded'
|
|
140
|
+
ORDER BY ss.embedding <=> (
|
|
141
|
+
SELECT ss2.embedding FROM ${qi(SCHEMA)}.session_summaries ss2
|
|
142
|
+
WHERE ss2.search_tsv @@ plainto_tsquery('simple', $1)
|
|
143
|
+
ORDER BY ts_rank(ss2.search_tsv, plainto_tsquery('simple', $1)) DESC
|
|
144
|
+
LIMIT 1
|
|
145
|
+
)
|
|
146
|
+
LIMIT 10
|
|
147
|
+
`, [q]);
|
|
148
|
+
const embIds = new Set(embResult.rows.map(r => r.session_id));
|
|
149
|
+
|
|
150
|
+
const overlap = [...ftsIds].filter(id => embIds.has(id)).length;
|
|
151
|
+
const ftsOnly = [...ftsIds].filter(id => !embIds.has(id)).length;
|
|
152
|
+
const embOnly = [...embIds].filter(id => !ftsIds.has(id)).length;
|
|
153
|
+
|
|
154
|
+
console.log(` "${q}" | FTS top10: ${ftsIds.size} | Vec top10: ${embIds.size} | overlap: ${overlap} | FTS-only: ${ftsOnly} | Vec-only: ${embOnly}`);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
await pool.end();
|
|
158
|
+
console.log('\n=== 完成 ===');
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
run().catch(err => { console.error(err); process.exit(1); });
|