@shadowforge0/aquifer-memory 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -130,13 +130,14 @@ Full env-to-config mapping is in [consumers/shared/config.js](consumers/shared/c
130
130
 
131
131
  ## Host Integration
132
132
 
133
- MCP is the primary integration surface. Agent hosts connect to the Aquifer MCP server, which exposes four tools: `session_recall`, `session_feedback`, `memory_stats`, `memory_pending`.
133
+ MCP is the primary integration surface. Agent hosts connect to the Aquifer MCP server, which exposes five tools: `session_recall`, `session_feedback`, `session_bootstrap`, `memory_stats`, `memory_pending`.
134
134
 
135
135
  | Integration | Route | Status | When to use |
136
136
  |-------------|-------|--------|-------------|
137
137
  | MCP server | `consumers/mcp.js` | Primary | Claude Code, OpenClaw, Codex, any MCP-capable host |
138
138
  | Library API | `createAquifer()` | Primary | Backend apps, custom pipelines, direct Node.js usage |
139
- | CLI | `consumers/cli.js` | Secondary | Operations, debugging, manual recall/backfill |
139
+ | CLI | `consumers/cli.js` | Secondary | Operations, debugging, manual recall/backfill (`aquifer bootstrap`, `aquifer ingest-opencode`, etc.) |
140
+ | OpenCode ingest | `consumers/opencode.js` | Secondary | Import sessions from OpenCode's SQLite DB |
140
141
  | OpenClaw plugin | `consumers/openclaw-plugin.js` | Compatibility only | Session capture via `before_reset` — not for tool delivery |
141
142
 
142
143
  ### Claude Code
@@ -160,7 +161,7 @@ Add to your project's `.claude.json` or user-level MCP config:
160
161
  }
161
162
  ```
162
163
 
163
- Tools appear as `mcp__aquifer__session_recall`, `mcp__aquifer__session_feedback`, etc.
164
+ Tools appear as `mcp__aquifer__session_recall`, `mcp__aquifer__session_feedback`, `mcp__aquifer__session_bootstrap`, etc.
164
165
 
165
166
  ### OpenClaw
166
167
 
@@ -184,7 +185,7 @@ Add to `openclaw.json` under `mcp.servers`:
184
185
  }
185
186
  ```
186
187
 
187
- Tools materialize as `aquifer__session_recall`, `aquifer__session_feedback`, `aquifer__memory_stats`, `aquifer__memory_pending` (server name prefix added by the host).
188
+ Tools materialize as `aquifer__session_recall`, `aquifer__session_feedback`, `aquifer__session_bootstrap`, `aquifer__memory_stats`, `aquifer__memory_pending` (server name prefix added by the host).
188
189
 
189
190
  The OpenClaw plugin (`consumers/openclaw-plugin.js`) is retained for session capture via `before_reset` but is **not** the recommended tool delivery path. Use MCP.
190
191
 
@@ -245,6 +246,7 @@ Any host that supports MCP stdio can connect the same way — point it at `node
245
246
  | `pipeline/extract-entities.js` | LLM-powered entity extraction (12 types) |
246
247
  | `pipeline/rerank.js` | Cross-encoder reranking (TEI, Jina, OpenRouter) |
247
248
  | `pipeline/normalize/` | Session normalization for Claude Code / gateway noise |
249
+ | `consumers/opencode.js` | OpenCode SQLite ingest — reads sessions from OpenCode's local DB |
248
250
  | `schema/001-base.sql` | DDL: sessions, summaries, turn_embeddings, FTS indexes |
249
251
  | `schema/002-entities.sql` | DDL: entities, mentions, relations, entity_sessions |
250
252
  | `schema/003-trust-feedback.sql` | DDL: trust_score column, session_feedback audit trail |
@@ -435,6 +437,24 @@ await aquifer.feedback('session-id', {
435
437
  });
436
438
  ```
437
439
 
440
+ #### `aquifer.bootstrap(opts)`
441
+
442
+ Loads recent session context for a new conversation — summaries, open loops, and decisions. Time-based (no embedding search), designed for session-start injection.
443
+
444
+ ```javascript
445
+ const result = await aquifer.bootstrap({
446
+ agentId: 'main',
447
+ limit: 5, // max sessions (default: 5)
448
+ lookbackDays: 14, // how far back (default: 14)
449
+ maxChars: 4000, // max output chars (default: 4000)
450
+ format: 'text', // 'text', 'structured', or 'both'
451
+ });
452
+ // format='text': result.text contains an XML block ready for injection
453
+ // format='structured': result.sessions, result.openLoops, result.recentDecisions
454
+ ```
455
+
456
+ Bootstrap also applies cross-session deduplication to open loops and decisions, filters out sentinel values (無/none/n/a), and truncates the output to `maxChars`.
457
+
438
458
  #### `aquifer.close()`
439
459
 
440
460
  Closes the PostgreSQL connection pool (only if Aquifer created it).
package/core/aquifer.js CHANGED
@@ -100,19 +100,6 @@ function createAquifer(config) {
100
100
  const entityPromptFn = config.entities && config.entities.prompt ? config.entities.prompt : null;
101
101
  const entityScope = (config.entities && config.entities.scope) || 'default';
102
102
 
103
- // FTS config — locked to 'simple'.
104
- // The search_tsv trigger always uses to_tsvector('simple', ...), so query-time
105
- // config must match. Warn and override if someone passes anything else.
106
- const _rawFtsConfig = config.ftsConfig || 'simple';
107
- if (_rawFtsConfig !== 'simple') {
108
- console.warn(
109
- `[aquifer] ftsConfig '${_rawFtsConfig}' is not currently supported. ` +
110
- `The search_tsv index is built with 'simple'; only 'simple' is valid at query time. ` +
111
- `Overriding to 'simple'.`
112
- );
113
- }
114
- const ftsConfig = 'simple';
115
-
116
103
  // Rank weights
117
104
  const rankWeights = {
118
105
  rrf: 0.65,
@@ -706,7 +693,7 @@ function createAquifer(config) {
706
693
  const [ftsRows, embRows, turnResult] = await Promise.all([
707
694
  runFts
708
695
  ? storage.searchSessions(pool, query, {
709
- schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit, ftsConfig,
696
+ schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit,
710
697
  }).catch((err) => {
711
698
  recordSearchError('fts', err);
712
699
  return [];
@@ -918,7 +905,6 @@ function createAquifer(config) {
918
905
  },
919
906
 
920
907
  async getSessionFull(sessionId) {
921
- // Try to find the session across agents by querying directly
922
908
  const result = await pool.query(
923
909
  `SELECT * FROM ${qi(schema)}.sessions
924
910
  WHERE session_id = $1 AND tenant_id = $2
@@ -928,24 +914,15 @@ function createAquifer(config) {
928
914
  const session = result.rows[0];
929
915
  if (!session) return null;
930
916
 
931
- const [segResult, sumResult] = await Promise.all([
932
- pool.query(
933
- `SELECT * FROM ${qi(schema)}.session_segments
934
- WHERE session_row_id = $1
935
- ORDER BY segment_no ASC`,
936
- [session.id]
937
- ),
938
- pool.query(
939
- `SELECT * FROM ${qi(schema)}.session_summaries
940
- WHERE session_row_id = $1
941
- LIMIT 1`,
942
- [session.id]
943
- ),
944
- ]);
917
+ const sumResult = await pool.query(
918
+ `SELECT * FROM ${qi(schema)}.session_summaries
919
+ WHERE session_row_id = $1
920
+ LIMIT 1`,
921
+ [session.id]
922
+ );
945
923
 
946
924
  return {
947
925
  session,
948
- segments: segResult.rows,
949
926
  summary: sumResult.rows[0] || null,
950
927
  };
951
928
  },
package/core/storage.js CHANGED
@@ -96,44 +96,6 @@ async function upsertSession(pool, {
96
96
  };
97
97
  }
98
98
 
99
- // ---------------------------------------------------------------------------
100
- // upsertSegments
101
- // ---------------------------------------------------------------------------
102
-
103
- async function upsertSegments(pool, sessionRowId, segments, { schema } = {}) {
104
- if (!segments || segments.length === 0) return;
105
- for (const seg of segments) {
106
- await pool.query(
107
- `INSERT INTO ${qi(schema)}.session_segments
108
- (session_row_id, segment_no, start_msg_idx, end_msg_idx,
109
- started_at, ended_at, raw_msg_count, effective_msg_count,
110
- boundary_type, boundary_meta)
111
- VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
112
- ON CONFLICT (session_row_id, segment_no) DO UPDATE SET
113
- start_msg_idx = EXCLUDED.start_msg_idx,
114
- end_msg_idx = EXCLUDED.end_msg_idx,
115
- started_at = EXCLUDED.started_at,
116
- ended_at = EXCLUDED.ended_at,
117
- raw_msg_count = EXCLUDED.raw_msg_count,
118
- effective_msg_count = EXCLUDED.effective_msg_count,
119
- boundary_type = EXCLUDED.boundary_type,
120
- boundary_meta = EXCLUDED.boundary_meta`,
121
- [
122
- sessionRowId,
123
- seg.segmentNo,
124
- seg.startMsgIdx !== null && seg.startMsgIdx !== undefined ? seg.startMsgIdx : null,
125
- seg.endMsgIdx !== null && seg.endMsgIdx !== undefined ? seg.endMsgIdx : null,
126
- seg.startedAt || null,
127
- seg.endedAt || null,
128
- seg.rawMsgCount || 0,
129
- seg.effectiveMsgCount || 0,
130
- seg.boundaryType || null,
131
- seg.boundaryMeta ? JSON.stringify(seg.boundaryMeta) : '{}',
132
- ]
133
- );
134
- }
135
- }
136
-
137
99
  // ---------------------------------------------------------------------------
138
100
  // upsertSummary
139
101
  // ---------------------------------------------------------------------------
@@ -159,9 +121,8 @@ async function upsertSummary(pool, sessionRowId, {
159
121
  `INSERT INTO ${qi(schema)}.session_summaries
160
122
  (session_row_id, tenant_id, agent_id, session_id, summary_version, model, source_hash,
161
123
  message_count, user_message_count, assistant_message_count,
162
- boundary_count, fresh_tail_count,
163
124
  started_at, ended_at, structured_summary, summary_text, embedding, updated_at)
164
- VALUES ($1,$2,$3,$4,1,$5,$6,$7,$8,$9,0,0,$10,$11,COALESCE($12::jsonb,'{}'::jsonb),COALESCE($13,''),$14::vector,now())
125
+ VALUES ($1,$2,$3,$4,1,$5,$6,$7,$8,$9,$10,$11,COALESCE($12::jsonb,'{}'::jsonb),COALESCE($13,''),$14::vector,now())
165
126
  ON CONFLICT (session_row_id) DO UPDATE SET
166
127
  tenant_id = EXCLUDED.tenant_id,
167
128
  agent_id = EXCLUDED.agent_id,
@@ -211,50 +172,6 @@ async function markStatus(pool, sessionRowId, status, error, { schema } = {}) {
211
172
  return result.rows[0] || null;
212
173
  }
213
174
 
214
- // ---------------------------------------------------------------------------
215
- // persistProcessingResults (@internal — prefer aquifer.enrich() for full pipeline)
216
- // ---------------------------------------------------------------------------
217
-
218
- async function persistProcessingResults(pool, sessionRowId, {
219
- schema,
220
- segments,
221
- summaryText,
222
- structuredSummary,
223
- agentId,
224
- sessionId,
225
- tenantId,
226
- model,
227
- sourceHash,
228
- msgCount,
229
- userCount,
230
- assistantCount,
231
- startedAt,
232
- endedAt,
233
- embedding,
234
- }) {
235
- const client = await pool.connect();
236
- try {
237
- await client.query('BEGIN');
238
- if (segments) await upsertSegments(client, sessionRowId, segments, { schema });
239
- await upsertSummary(client, sessionRowId, {
240
- schema, tenantId, agentId, sessionId, summaryText,
241
- structuredSummary, model, sourceHash,
242
- msgCount, userCount, assistantCount,
243
- startedAt, endedAt, embedding,
244
- });
245
- await markStatus(client, sessionRowId, 'succeeded', null, { schema });
246
- await client.query('COMMIT');
247
- } catch (err) {
248
- await client.query('ROLLBACK').catch(() => {});
249
- try {
250
- await markStatus(pool, sessionRowId, 'failed', err.message, { schema });
251
- } catch (_) { /* swallow */ }
252
- throw err;
253
- } finally {
254
- client.release();
255
- }
256
- }
257
-
258
175
  // ---------------------------------------------------------------------------
259
176
  // getSession
260
177
  // ---------------------------------------------------------------------------
@@ -282,36 +199,6 @@ async function getSession(pool, sessionId, agentId, options = {}, { schema, tena
282
199
  return result.rows[0] || null;
283
200
  }
284
201
 
285
- // ---------------------------------------------------------------------------
286
- // getSessionFull
287
- // ---------------------------------------------------------------------------
288
-
289
- async function getSessionFull(pool, sessionId, agentId, { schema, tenantId } = {}) {
290
- const session = await getSession(pool, sessionId, agentId, { tenantId }, { schema, tenantId });
291
- if (!session) return null;
292
-
293
- const [segResult, sumResult] = await Promise.all([
294
- pool.query(
295
- `SELECT * FROM ${qi(schema)}.session_segments
296
- WHERE session_row_id = $1
297
- ORDER BY segment_no ASC`,
298
- [session.id]
299
- ),
300
- pool.query(
301
- `SELECT * FROM ${qi(schema)}.session_summaries
302
- WHERE session_row_id = $1
303
- LIMIT 1`,
304
- [session.id]
305
- ),
306
- ]);
307
-
308
- return {
309
- session,
310
- segments: segResult.rows,
311
- summary: sumResult.rows[0] || null,
312
- };
313
- }
314
-
315
202
  // ---------------------------------------------------------------------------
316
203
  // getMessages
317
204
  // ---------------------------------------------------------------------------
@@ -324,7 +211,7 @@ async function getMessages(pool, sessionId, agentId, { schema, tenantId } = {})
324
211
  }
325
212
 
326
213
  // ---------------------------------------------------------------------------
327
- // searchSessions (FTS)
214
+ // searchSessions (trigram + FTS fallback)
328
215
  // ---------------------------------------------------------------------------
329
216
 
330
217
  async function searchSessions(pool, query, {
@@ -333,34 +220,27 @@ async function searchSessions(pool, query, {
333
220
  agentId,
334
221
  agentIds: rawAgentIds,
335
222
  source,
336
- dateFrom, // m1: add date filtering
223
+ dateFrom,
337
224
  dateTo,
338
225
  limit = 20,
339
- ftsConfig = 'simple',
340
226
  } = {}) {
341
227
  const clampedLimit = Math.max(1, Math.min(100, limit));
342
- // FTS config is locked to 'simple' — the search_tsv trigger always uses
343
- // to_tsvector('simple', ...) so query semantics must match. Warn callers
344
- // that pass a different value rather than silently honouring it.
345
- if (ftsConfig !== 'simple') {
346
- console.warn(
347
- `[aquifer/storage] searchSessions: ftsConfig '${ftsConfig}' ignored. ` +
348
- `Only 'simple' is supported (index is built with simple tokenizer). ` +
349
- `Using 'simple'.`
350
- );
351
- }
352
- const safeFts = 'simple';
353
228
 
354
229
  // Normalize agentId/agentIds
355
230
  const agentIds = rawAgentIds && rawAgentIds.length > 0
356
231
  ? rawAgentIds
357
232
  : (agentId ? [agentId] : null);
358
233
 
234
+ // Escape LIKE special characters in query
235
+ const likeQuery = query.replace(/[%_\\]/g, '\\$&');
236
+
237
+ // Primary: trigram ILIKE on search_text (works for CJK + Latin)
238
+ // Fallback: tsvector FTS (for installations without search_text populated)
359
239
  const where = [
360
- `ss.search_tsv @@ plainto_tsquery('${safeFts}', $1)`,
361
- `s.tenant_id = $2`,
240
+ `(ss.search_text ILIKE '%' || $1 || '%' OR ss.search_tsv @@ plainto_tsquery('simple', $2))`,
241
+ `s.tenant_id = $3`,
362
242
  ];
363
- const params = [query, tenantId];
243
+ const params = [likeQuery, query, tenantId];
364
244
 
365
245
  if (agentIds) {
366
246
  params.push(agentIds);
@@ -394,8 +274,10 @@ async function searchSessions(pool, query, {
394
274
  ss.access_count,
395
275
  ss.last_accessed_at,
396
276
  ss.trust_score,
397
- ts_headline('${safeFts}', COALESCE(ss.summary_text, ''), plainto_tsquery('${safeFts}', $1)) AS summary_snippet,
398
- ts_rank(ss.search_tsv, plainto_tsquery('${safeFts}', $1)) AS fts_rank
277
+ CASE WHEN ss.search_text IS NOT NULL
278
+ THEN similarity(ss.search_text, $2)
279
+ ELSE ts_rank(ss.search_tsv, plainto_tsquery('simple', $2))
280
+ END AS fts_rank
399
281
  FROM ${qi(schema)}.sessions s
400
282
  LEFT JOIN ${qi(schema)}.session_summaries ss ON ss.session_row_id = s.id
401
283
  WHERE ${where.join(' AND ')}
@@ -414,7 +296,7 @@ async function recordAccess(pool, sessionRowIds, { schema } = {}) {
414
296
  if (!sessionRowIds || sessionRowIds.length === 0) return;
415
297
  await pool.query(
416
298
  `UPDATE ${qi(schema)}.session_summaries
417
- SET access_count = access_count + 1, last_accessed_at = now()
299
+ SET access_count = COALESCE(access_count, 0) + 1, last_accessed_at = now()
418
300
  WHERE session_row_id = ANY($1)`,
419
301
  [sessionRowIds]
420
302
  );
@@ -643,12 +525,9 @@ async function recordFeedback(pool, {
643
525
 
644
526
  module.exports = {
645
527
  upsertSession,
646
- upsertSegments,
647
528
  upsertSummary,
648
529
  markStatus,
649
- persistProcessingResults,
650
530
  getSession,
651
- getSessionFull,
652
531
  getMessages,
653
532
  searchSessions,
654
533
  recordAccess,
package/index.js CHANGED
@@ -3,6 +3,5 @@
3
3
  const { createAquifer } = require('./core/aquifer');
4
4
  const { createEmbedder } = require('./pipeline/embed');
5
5
  const { createReranker } = require('./pipeline/rerank');
6
- const { normalizeSession, detectClient } = require('./pipeline/normalize');
7
6
 
8
- module.exports = { createAquifer, createEmbedder, createReranker, normalizeSession, detectClient };
7
+ module.exports = { createAquifer, createEmbedder, createReranker };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shadowforge0/aquifer-memory",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "PG-native long-term memory for AI agents. Turn-level embedding, hybrid RRF ranking, optional knowledge graph. MCP server, CLI, and library API.",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -17,8 +17,6 @@
17
17
  },
18
18
  "exports": {
19
19
  ".": "./index.js",
20
- "./core/*": "./core/*.js",
21
- "./pipeline/*": "./pipeline/*.js",
22
20
  "./consumers/mcp": "./consumers/mcp.js",
23
21
  "./consumers/openclaw-plugin": "./consumers/openclaw-plugin.js",
24
22
  "./consumers/opencode": "./consumers/opencode.js",
@@ -2,6 +2,7 @@
2
2
  -- Usage: replace ${schema} with actual schema name (e.g., 'aquifer')
3
3
 
4
4
  CREATE EXTENSION IF NOT EXISTS vector;
5
+ CREATE EXTENSION IF NOT EXISTS pg_trgm;
5
6
  CREATE SCHEMA IF NOT EXISTS ${schema};
6
7
 
7
8
  -- =========================================================================
@@ -85,6 +86,7 @@ CREATE TABLE IF NOT EXISTS ${schema}.session_summaries (
85
86
  structured_summary JSONB NOT NULL DEFAULT '{}',
86
87
  embedding vector,
87
88
  search_tsv TSVECTOR,
89
+ search_text TEXT,
88
90
  access_count INT NOT NULL DEFAULT 0,
89
91
  last_accessed_at TIMESTAMPTZ,
90
92
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
@@ -96,6 +98,9 @@ CREATE INDEX IF NOT EXISTS idx_summaries_tenant
96
98
  CREATE INDEX IF NOT EXISTS idx_summaries_search_tsv
97
99
  ON ${schema}.session_summaries USING GIN (search_tsv);
98
100
 
101
+ CREATE INDEX IF NOT EXISTS idx_summaries_search_text_trgm
102
+ ON ${schema}.session_summaries USING GIN (search_text gin_trgm_ops);
103
+
99
104
  CREATE INDEX IF NOT EXISTS idx_summaries_embedding
100
105
  ON ${schema}.session_summaries (session_row_id)
101
106
  WHERE embedding IS NOT NULL;
@@ -141,6 +146,11 @@ BEGIN
141
146
  setweight(to_tsvector('simple', COALESCE(NEW.summary_text, '')), 'C') ||
142
147
  setweight(to_tsvector('simple', open_loops_text || ' ' || facts_text), 'D');
143
148
 
149
+ NEW.search_text :=
150
+ title_text || ' ' || overview_text || ' ' || topics_text || ' ' ||
151
+ decisions_text || ' ' || COALESCE(NEW.summary_text, '') || ' ' ||
152
+ open_loops_text || ' ' || facts_text;
153
+
144
154
  RETURN NEW;
145
155
  END;
146
156
  $$;
@@ -149,7 +159,7 @@ DROP TRIGGER IF EXISTS trg_session_summaries_search_tsv
149
159
  ON ${schema}.session_summaries;
150
160
 
151
161
  CREATE TRIGGER trg_session_summaries_search_tsv
152
- BEFORE INSERT OR UPDATE OF summary_text, structured_summary
162
+ BEFORE INSERT OR UPDATE OF summary_text, structured_summary, search_text
153
163
  ON ${schema}.session_summaries
154
164
  FOR EACH ROW
155
165
  EXECUTE FUNCTION ${schema}.session_summaries_search_tsv_update();
@@ -0,0 +1,161 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * FTS 中文診斷:檢查 'simple' tokenizer 在實際中文資料上的表現
5
+ *
6
+ * 測試項目:
7
+ * 1. FTS tokenization — 實際 token 長什麼樣
8
+ * 2. FTS recall — 常見中文查詢的命中率
9
+ * 3. FTS vs vector — FTS 有沒有在幫忙還是在拖後腿
10
+ */
11
+
12
+ const { Pool } = require('pg');
13
+
14
+ const DB_URL = process.env.DATABASE_URL || 'postgresql://burk:790476@localhost:5432/openclaw_db';
15
+ const SCHEMA = process.env.AQUIFER_SCHEMA || 'miranda';
16
+
17
+ const pool = new Pool({ connectionString: DB_URL });
18
+
19
+ async function run() {
20
+ const qi = (s) => `"${s}"`;
21
+
22
+ console.log('=== FTS 中文診斷 ===\n');
23
+
24
+ // 1. 看 token 分佈
25
+ console.log('--- 1. Token 分析 ---');
26
+ const tokenSample = await pool.query(`
27
+ SELECT ss.session_id,
28
+ array_length(tsvector_to_array(ss.search_tsv), 1) as token_count,
29
+ left(ss.summary_text, 80) as preview
30
+ FROM ${qi(SCHEMA)}.session_summaries ss
31
+ WHERE ss.search_tsv IS NOT NULL
32
+ ORDER BY ss.updated_at DESC
33
+ LIMIT 10
34
+ `);
35
+
36
+ let totalTokens = 0;
37
+ let sessionCount = 0;
38
+ for (const r of tokenSample.rows) {
39
+ totalTokens += r.token_count || 0;
40
+ sessionCount++;
41
+ console.log(` ${r.session_id?.slice(0, 8)} | ${r.token_count || 0} tokens | ${r.preview}`);
42
+ }
43
+ console.log(` avg: ${sessionCount ? Math.round(totalTokens / sessionCount) : 0} tokens/session\n`);
44
+
45
+ // 2. 看一個 session 的實際 token
46
+ console.log('--- 2. Token 範例(最近 session)---');
47
+ const tokenDetail = await pool.query(`
48
+ SELECT ss.session_id,
49
+ array_to_string(tsvector_to_array(ss.search_tsv), ' | ') as tokens
50
+ FROM ${qi(SCHEMA)}.session_summaries ss
51
+ WHERE ss.search_tsv IS NOT NULL
52
+ ORDER BY ss.updated_at DESC
53
+ LIMIT 1
54
+ `);
55
+ if (tokenDetail.rows[0]) {
56
+ console.log(` session: ${tokenDetail.rows[0].session_id?.slice(0, 8)}`);
57
+ const tokens = tokenDetail.rows[0].tokens || '';
58
+ // 分類 token
59
+ const all = tokens.split(' | ');
60
+ const cjk = all.filter(t => /[\u4e00-\u9fff]/.test(t));
61
+ const latin = all.filter(t => /^[a-z0-9]/.test(t));
62
+ const other = all.filter(t => !(/[\u4e00-\u9fff]/.test(t)) && !(/^[a-z0-9]/.test(t)));
63
+ console.log(` total: ${all.length} | latin: ${latin.length} | cjk: ${cjk.length} | other: ${other.length}`);
64
+ console.log(` CJK tokens (前 20): ${cjk.slice(0, 20).join(' | ')}`);
65
+ console.log(` Latin tokens (前 20): ${latin.slice(0, 20).join(' | ')}\n`);
66
+ }
67
+
68
+ // 3. 中文查詢命中率測試
69
+ console.log('--- 3. 中文查詢 FTS 命中率 ---');
70
+ const testQueries = [
71
+ 'afterburn',
72
+ 'bootstrap',
73
+ 'session',
74
+ 'recall',
75
+ '記憶',
76
+ '修復',
77
+ '架構',
78
+ '時區',
79
+ '去重',
80
+ 'daily entries',
81
+ 'OpenCode',
82
+ 'entity',
83
+ 'Jenny',
84
+ 'Aquifer',
85
+ '消化模式',
86
+ ];
87
+
88
+ // 總 session 數
89
+ const totalResult = await pool.query(`
90
+ SELECT COUNT(*) as cnt FROM ${qi(SCHEMA)}.session_summaries WHERE search_tsv IS NOT NULL
91
+ `);
92
+ const totalSessions = parseInt(totalResult.rows[0].cnt);
93
+ console.log(` total sessions with FTS index: ${totalSessions}\n`);
94
+
95
+ for (const q of testQueries) {
96
+ const ftsResult = await pool.query(`
97
+ SELECT COUNT(*) as cnt
98
+ FROM ${qi(SCHEMA)}.session_summaries ss
99
+ WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
100
+ `, [q]);
101
+ const ftsHits = parseInt(ftsResult.rows[0].cnt);
102
+
103
+ // 同時看 summary_text ILIKE 能找到幾筆(ground truth)
104
+ const ilikeResult = await pool.query(`
105
+ SELECT COUNT(*) as cnt
106
+ FROM ${qi(SCHEMA)}.session_summaries ss
107
+ WHERE ss.summary_text ILIKE $1
108
+ OR ss.structured_summary::text ILIKE $1
109
+ `, [`%${q}%`]);
110
+ const ilikeHits = parseInt(ilikeResult.rows[0].cnt);
111
+
112
+ const ftsRecall = ilikeHits > 0 ? Math.round(ftsHits / ilikeHits * 100) : (ftsHits === 0 ? 100 : 0);
113
+ const status = ftsHits === ilikeHits ? '✓' : (ftsHits < ilikeHits ? '✗ MISS' : '?');
114
+ console.log(` "${q}" | FTS: ${ftsHits} | ILIKE: ${ilikeHits} | recall: ${ftsRecall}% | ${status}`);
115
+ }
116
+
117
+ // 4. FTS 對 RRF 的貢獻度
118
+ console.log('\n--- 4. FTS 在 hybrid search 中的貢獻度 ---');
119
+ // 跑幾個查詢,看 FTS 跟 vector 的 session 重疊率
120
+ const overlapQueries = ['afterburn', 'bootstrap', '記憶', 'recall', 'entity'];
121
+ for (const q of overlapQueries) {
122
+ const ftsResult = await pool.query(`
123
+ SELECT ss.session_id
124
+ FROM ${qi(SCHEMA)}.session_summaries ss
125
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
126
+ WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
127
+ AND s.processing_status = 'succeeded'
128
+ ORDER BY ts_rank(ss.search_tsv, plainto_tsquery('simple', $1)) DESC
129
+ LIMIT 10
130
+ `, [q]);
131
+ const ftsIds = new Set(ftsResult.rows.map(r => r.session_id));
132
+
133
+ // vector search (if embedding available)
134
+ const embResult = await pool.query(`
135
+ SELECT ss.session_id
136
+ FROM ${qi(SCHEMA)}.session_summaries ss
137
+ JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
138
+ WHERE ss.embedding IS NOT NULL
139
+ AND s.processing_status = 'succeeded'
140
+ ORDER BY ss.embedding <=> (
141
+ SELECT ss2.embedding FROM ${qi(SCHEMA)}.session_summaries ss2
142
+ WHERE ss2.search_tsv @@ plainto_tsquery('simple', $1)
143
+ ORDER BY ts_rank(ss2.search_tsv, plainto_tsquery('simple', $1)) DESC
144
+ LIMIT 1
145
+ )
146
+ LIMIT 10
147
+ `, [q]);
148
+ const embIds = new Set(embResult.rows.map(r => r.session_id));
149
+
150
+ const overlap = [...ftsIds].filter(id => embIds.has(id)).length;
151
+ const ftsOnly = [...ftsIds].filter(id => !embIds.has(id)).length;
152
+ const embOnly = [...embIds].filter(id => !ftsIds.has(id)).length;
153
+
154
+ console.log(` "${q}" | FTS top10: ${ftsIds.size} | Vec top10: ${embIds.size} | overlap: ${overlap} | FTS-only: ${ftsOnly} | Vec-only: ${embOnly}`);
155
+ }
156
+
157
+ await pool.end();
158
+ console.log('\n=== 完成 ===');
159
+ }
160
+
161
+ run().catch(err => { console.error(err); process.exit(1); });