@shadowforge0/aquifer-memory 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -4
- package/core/aquifer.js +7 -30
- package/core/storage.js +16 -137
- package/index.js +1 -2
- package/package.json +1 -3
- package/schema/001-base.sql +11 -1
- package/scripts/diagnose-fts-zh.js +161 -0
package/README.md
CHANGED
|
@@ -130,13 +130,14 @@ Full env-to-config mapping is in [consumers/shared/config.js](consumers/shared/c
|
|
|
130
130
|
|
|
131
131
|
## Host Integration
|
|
132
132
|
|
|
133
|
-
MCP is the primary integration surface. Agent hosts connect to the Aquifer MCP server, which exposes
|
|
133
|
+
MCP is the primary integration surface. Agent hosts connect to the Aquifer MCP server, which exposes five tools: `session_recall`, `session_feedback`, `session_bootstrap`, `memory_stats`, `memory_pending`.
|
|
134
134
|
|
|
135
135
|
| Integration | Route | Status | When to use |
|
|
136
136
|
|-------------|-------|--------|-------------|
|
|
137
137
|
| MCP server | `consumers/mcp.js` | Primary | Claude Code, OpenClaw, Codex, any MCP-capable host |
|
|
138
138
|
| Library API | `createAquifer()` | Primary | Backend apps, custom pipelines, direct Node.js usage |
|
|
139
|
-
| CLI | `consumers/cli.js` | Secondary | Operations, debugging, manual recall/backfill |
|
|
139
|
+
| CLI | `consumers/cli.js` | Secondary | Operations, debugging, manual recall/backfill (`aquifer bootstrap`, `aquifer ingest-opencode`, etc.) |
|
|
140
|
+
| OpenCode ingest | `consumers/opencode.js` | Secondary | Import sessions from OpenCode's SQLite DB |
|
|
140
141
|
| OpenClaw plugin | `consumers/openclaw-plugin.js` | Compatibility only | Session capture via `before_reset` — not for tool delivery |
|
|
141
142
|
|
|
142
143
|
### Claude Code
|
|
@@ -160,7 +161,7 @@ Add to your project's `.claude.json` or user-level MCP config:
|
|
|
160
161
|
}
|
|
161
162
|
```
|
|
162
163
|
|
|
163
|
-
Tools appear as `mcp__aquifer__session_recall`, `mcp__aquifer__session_feedback`, etc.
|
|
164
|
+
Tools appear as `mcp__aquifer__session_recall`, `mcp__aquifer__session_feedback`, `mcp__aquifer__session_bootstrap`, etc.
|
|
164
165
|
|
|
165
166
|
### OpenClaw
|
|
166
167
|
|
|
@@ -184,7 +185,7 @@ Add to `openclaw.json` under `mcp.servers`:
|
|
|
184
185
|
}
|
|
185
186
|
```
|
|
186
187
|
|
|
187
|
-
Tools materialize as `aquifer__session_recall`, `aquifer__session_feedback`, `aquifer__memory_stats`, `aquifer__memory_pending` (server name prefix added by the host).
|
|
188
|
+
Tools materialize as `aquifer__session_recall`, `aquifer__session_feedback`, `aquifer__session_bootstrap`, `aquifer__memory_stats`, `aquifer__memory_pending` (server name prefix added by the host).
|
|
188
189
|
|
|
189
190
|
The OpenClaw plugin (`consumers/openclaw-plugin.js`) is retained for session capture via `before_reset` but is **not** the recommended tool delivery path. Use MCP.
|
|
190
191
|
|
|
@@ -245,6 +246,7 @@ Any host that supports MCP stdio can connect the same way — point it at `node
|
|
|
245
246
|
| `pipeline/extract-entities.js` | LLM-powered entity extraction (12 types) |
|
|
246
247
|
| `pipeline/rerank.js` | Cross-encoder reranking (TEI, Jina, OpenRouter) |
|
|
247
248
|
| `pipeline/normalize/` | Session normalization for Claude Code / gateway noise |
|
|
249
|
+
| `consumers/opencode.js` | OpenCode SQLite ingest — reads sessions from OpenCode's local DB |
|
|
248
250
|
| `schema/001-base.sql` | DDL: sessions, summaries, turn_embeddings, FTS indexes |
|
|
249
251
|
| `schema/002-entities.sql` | DDL: entities, mentions, relations, entity_sessions |
|
|
250
252
|
| `schema/003-trust-feedback.sql` | DDL: trust_score column, session_feedback audit trail |
|
|
@@ -435,6 +437,24 @@ await aquifer.feedback('session-id', {
|
|
|
435
437
|
});
|
|
436
438
|
```
|
|
437
439
|
|
|
440
|
+
#### `aquifer.bootstrap(opts)`
|
|
441
|
+
|
|
442
|
+
Loads recent session context for a new conversation — summaries, open loops, and decisions. Time-based (no embedding search), designed for session-start injection.
|
|
443
|
+
|
|
444
|
+
```javascript
|
|
445
|
+
const result = await aquifer.bootstrap({
|
|
446
|
+
agentId: 'main',
|
|
447
|
+
limit: 5, // max sessions (default: 5)
|
|
448
|
+
lookbackDays: 14, // how far back (default: 14)
|
|
449
|
+
maxChars: 4000, // max output chars (default: 4000)
|
|
450
|
+
format: 'text', // 'text', 'structured', or 'both'
|
|
451
|
+
});
|
|
452
|
+
// format='text': result.text contains XML block ready for injection
|
|
453
|
+
// format='structured': result.sessions, result.openLoops, result.recentDecisions
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
Cross-session dedup on open loops and decisions, sentinel filtering (removes 無/none/n/a), and maxChars truncation.
|
|
457
|
+
|
|
438
458
|
#### `aquifer.close()`
|
|
439
459
|
|
|
440
460
|
Closes the PostgreSQL connection pool (only if Aquifer created it).
|
package/core/aquifer.js
CHANGED
|
@@ -100,19 +100,6 @@ function createAquifer(config) {
|
|
|
100
100
|
const entityPromptFn = config.entities && config.entities.prompt ? config.entities.prompt : null;
|
|
101
101
|
const entityScope = (config.entities && config.entities.scope) || 'default';
|
|
102
102
|
|
|
103
|
-
// FTS config — locked to 'simple'.
|
|
104
|
-
// The search_tsv trigger always uses to_tsvector('simple', ...), so query-time
|
|
105
|
-
// config must match. Warn and override if someone passes anything else.
|
|
106
|
-
const _rawFtsConfig = config.ftsConfig || 'simple';
|
|
107
|
-
if (_rawFtsConfig !== 'simple') {
|
|
108
|
-
console.warn(
|
|
109
|
-
`[aquifer] ftsConfig '${_rawFtsConfig}' is not currently supported. ` +
|
|
110
|
-
`The search_tsv index is built with 'simple'; only 'simple' is valid at query time. ` +
|
|
111
|
-
`Overriding to 'simple'.`
|
|
112
|
-
);
|
|
113
|
-
}
|
|
114
|
-
const ftsConfig = 'simple';
|
|
115
|
-
|
|
116
103
|
// Rank weights
|
|
117
104
|
const rankWeights = {
|
|
118
105
|
rrf: 0.65,
|
|
@@ -706,7 +693,7 @@ function createAquifer(config) {
|
|
|
706
693
|
const [ftsRows, embRows, turnResult] = await Promise.all([
|
|
707
694
|
runFts
|
|
708
695
|
? storage.searchSessions(pool, query, {
|
|
709
|
-
schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit,
|
|
696
|
+
schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit,
|
|
710
697
|
}).catch((err) => {
|
|
711
698
|
recordSearchError('fts', err);
|
|
712
699
|
return [];
|
|
@@ -918,7 +905,6 @@ function createAquifer(config) {
|
|
|
918
905
|
},
|
|
919
906
|
|
|
920
907
|
async getSessionFull(sessionId) {
|
|
921
|
-
// Try to find the session across agents by querying directly
|
|
922
908
|
const result = await pool.query(
|
|
923
909
|
`SELECT * FROM ${qi(schema)}.sessions
|
|
924
910
|
WHERE session_id = $1 AND tenant_id = $2
|
|
@@ -928,24 +914,15 @@ function createAquifer(config) {
|
|
|
928
914
|
const session = result.rows[0];
|
|
929
915
|
if (!session) return null;
|
|
930
916
|
|
|
931
|
-
const
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
),
|
|
938
|
-
pool.query(
|
|
939
|
-
`SELECT * FROM ${qi(schema)}.session_summaries
|
|
940
|
-
WHERE session_row_id = $1
|
|
941
|
-
LIMIT 1`,
|
|
942
|
-
[session.id]
|
|
943
|
-
),
|
|
944
|
-
]);
|
|
917
|
+
const sumResult = await pool.query(
|
|
918
|
+
`SELECT * FROM ${qi(schema)}.session_summaries
|
|
919
|
+
WHERE session_row_id = $1
|
|
920
|
+
LIMIT 1`,
|
|
921
|
+
[session.id]
|
|
922
|
+
);
|
|
945
923
|
|
|
946
924
|
return {
|
|
947
925
|
session,
|
|
948
|
-
segments: segResult.rows,
|
|
949
926
|
summary: sumResult.rows[0] || null,
|
|
950
927
|
};
|
|
951
928
|
},
|
package/core/storage.js
CHANGED
|
@@ -96,44 +96,6 @@ async function upsertSession(pool, {
|
|
|
96
96
|
};
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
-
// ---------------------------------------------------------------------------
|
|
100
|
-
// upsertSegments
|
|
101
|
-
// ---------------------------------------------------------------------------
|
|
102
|
-
|
|
103
|
-
async function upsertSegments(pool, sessionRowId, segments, { schema } = {}) {
|
|
104
|
-
if (!segments || segments.length === 0) return;
|
|
105
|
-
for (const seg of segments) {
|
|
106
|
-
await pool.query(
|
|
107
|
-
`INSERT INTO ${qi(schema)}.session_segments
|
|
108
|
-
(session_row_id, segment_no, start_msg_idx, end_msg_idx,
|
|
109
|
-
started_at, ended_at, raw_msg_count, effective_msg_count,
|
|
110
|
-
boundary_type, boundary_meta)
|
|
111
|
-
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
|
|
112
|
-
ON CONFLICT (session_row_id, segment_no) DO UPDATE SET
|
|
113
|
-
start_msg_idx = EXCLUDED.start_msg_idx,
|
|
114
|
-
end_msg_idx = EXCLUDED.end_msg_idx,
|
|
115
|
-
started_at = EXCLUDED.started_at,
|
|
116
|
-
ended_at = EXCLUDED.ended_at,
|
|
117
|
-
raw_msg_count = EXCLUDED.raw_msg_count,
|
|
118
|
-
effective_msg_count = EXCLUDED.effective_msg_count,
|
|
119
|
-
boundary_type = EXCLUDED.boundary_type,
|
|
120
|
-
boundary_meta = EXCLUDED.boundary_meta`,
|
|
121
|
-
[
|
|
122
|
-
sessionRowId,
|
|
123
|
-
seg.segmentNo,
|
|
124
|
-
seg.startMsgIdx !== null && seg.startMsgIdx !== undefined ? seg.startMsgIdx : null,
|
|
125
|
-
seg.endMsgIdx !== null && seg.endMsgIdx !== undefined ? seg.endMsgIdx : null,
|
|
126
|
-
seg.startedAt || null,
|
|
127
|
-
seg.endedAt || null,
|
|
128
|
-
seg.rawMsgCount || 0,
|
|
129
|
-
seg.effectiveMsgCount || 0,
|
|
130
|
-
seg.boundaryType || null,
|
|
131
|
-
seg.boundaryMeta ? JSON.stringify(seg.boundaryMeta) : '{}',
|
|
132
|
-
]
|
|
133
|
-
);
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
|
|
137
99
|
// ---------------------------------------------------------------------------
|
|
138
100
|
// upsertSummary
|
|
139
101
|
// ---------------------------------------------------------------------------
|
|
@@ -159,9 +121,8 @@ async function upsertSummary(pool, sessionRowId, {
|
|
|
159
121
|
`INSERT INTO ${qi(schema)}.session_summaries
|
|
160
122
|
(session_row_id, tenant_id, agent_id, session_id, summary_version, model, source_hash,
|
|
161
123
|
message_count, user_message_count, assistant_message_count,
|
|
162
|
-
boundary_count, fresh_tail_count,
|
|
163
124
|
started_at, ended_at, structured_summary, summary_text, embedding, updated_at)
|
|
164
|
-
VALUES ($1,$2,$3,$4,1,$5,$6,$7,$8,$9
|
|
125
|
+
VALUES ($1,$2,$3,$4,1,$5,$6,$7,$8,$9,$10,$11,COALESCE($12::jsonb,'{}'::jsonb),COALESCE($13,''),$14::vector,now())
|
|
165
126
|
ON CONFLICT (session_row_id) DO UPDATE SET
|
|
166
127
|
tenant_id = EXCLUDED.tenant_id,
|
|
167
128
|
agent_id = EXCLUDED.agent_id,
|
|
@@ -211,50 +172,6 @@ async function markStatus(pool, sessionRowId, status, error, { schema } = {}) {
|
|
|
211
172
|
return result.rows[0] || null;
|
|
212
173
|
}
|
|
213
174
|
|
|
214
|
-
// ---------------------------------------------------------------------------
|
|
215
|
-
// persistProcessingResults (@internal — prefer aquifer.enrich() for full pipeline)
|
|
216
|
-
// ---------------------------------------------------------------------------
|
|
217
|
-
|
|
218
|
-
async function persistProcessingResults(pool, sessionRowId, {
|
|
219
|
-
schema,
|
|
220
|
-
segments,
|
|
221
|
-
summaryText,
|
|
222
|
-
structuredSummary,
|
|
223
|
-
agentId,
|
|
224
|
-
sessionId,
|
|
225
|
-
tenantId,
|
|
226
|
-
model,
|
|
227
|
-
sourceHash,
|
|
228
|
-
msgCount,
|
|
229
|
-
userCount,
|
|
230
|
-
assistantCount,
|
|
231
|
-
startedAt,
|
|
232
|
-
endedAt,
|
|
233
|
-
embedding,
|
|
234
|
-
}) {
|
|
235
|
-
const client = await pool.connect();
|
|
236
|
-
try {
|
|
237
|
-
await client.query('BEGIN');
|
|
238
|
-
if (segments) await upsertSegments(client, sessionRowId, segments, { schema });
|
|
239
|
-
await upsertSummary(client, sessionRowId, {
|
|
240
|
-
schema, tenantId, agentId, sessionId, summaryText,
|
|
241
|
-
structuredSummary, model, sourceHash,
|
|
242
|
-
msgCount, userCount, assistantCount,
|
|
243
|
-
startedAt, endedAt, embedding,
|
|
244
|
-
});
|
|
245
|
-
await markStatus(client, sessionRowId, 'succeeded', null, { schema });
|
|
246
|
-
await client.query('COMMIT');
|
|
247
|
-
} catch (err) {
|
|
248
|
-
await client.query('ROLLBACK').catch(() => {});
|
|
249
|
-
try {
|
|
250
|
-
await markStatus(pool, sessionRowId, 'failed', err.message, { schema });
|
|
251
|
-
} catch (_) { /* swallow */ }
|
|
252
|
-
throw err;
|
|
253
|
-
} finally {
|
|
254
|
-
client.release();
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
|
|
258
175
|
// ---------------------------------------------------------------------------
|
|
259
176
|
// getSession
|
|
260
177
|
// ---------------------------------------------------------------------------
|
|
@@ -282,36 +199,6 @@ async function getSession(pool, sessionId, agentId, options = {}, { schema, tena
|
|
|
282
199
|
return result.rows[0] || null;
|
|
283
200
|
}
|
|
284
201
|
|
|
285
|
-
// ---------------------------------------------------------------------------
|
|
286
|
-
// getSessionFull
|
|
287
|
-
// ---------------------------------------------------------------------------
|
|
288
|
-
|
|
289
|
-
async function getSessionFull(pool, sessionId, agentId, { schema, tenantId } = {}) {
|
|
290
|
-
const session = await getSession(pool, sessionId, agentId, { tenantId }, { schema, tenantId });
|
|
291
|
-
if (!session) return null;
|
|
292
|
-
|
|
293
|
-
const [segResult, sumResult] = await Promise.all([
|
|
294
|
-
pool.query(
|
|
295
|
-
`SELECT * FROM ${qi(schema)}.session_segments
|
|
296
|
-
WHERE session_row_id = $1
|
|
297
|
-
ORDER BY segment_no ASC`,
|
|
298
|
-
[session.id]
|
|
299
|
-
),
|
|
300
|
-
pool.query(
|
|
301
|
-
`SELECT * FROM ${qi(schema)}.session_summaries
|
|
302
|
-
WHERE session_row_id = $1
|
|
303
|
-
LIMIT 1`,
|
|
304
|
-
[session.id]
|
|
305
|
-
),
|
|
306
|
-
]);
|
|
307
|
-
|
|
308
|
-
return {
|
|
309
|
-
session,
|
|
310
|
-
segments: segResult.rows,
|
|
311
|
-
summary: sumResult.rows[0] || null,
|
|
312
|
-
};
|
|
313
|
-
}
|
|
314
|
-
|
|
315
202
|
// ---------------------------------------------------------------------------
|
|
316
203
|
// getMessages
|
|
317
204
|
// ---------------------------------------------------------------------------
|
|
@@ -324,7 +211,7 @@ async function getMessages(pool, sessionId, agentId, { schema, tenantId } = {})
|
|
|
324
211
|
}
|
|
325
212
|
|
|
326
213
|
// ---------------------------------------------------------------------------
|
|
327
|
-
// searchSessions (FTS)
|
|
214
|
+
// searchSessions (trigram + FTS fallback)
|
|
328
215
|
// ---------------------------------------------------------------------------
|
|
329
216
|
|
|
330
217
|
async function searchSessions(pool, query, {
|
|
@@ -333,34 +220,27 @@ async function searchSessions(pool, query, {
|
|
|
333
220
|
agentId,
|
|
334
221
|
agentIds: rawAgentIds,
|
|
335
222
|
source,
|
|
336
|
-
dateFrom,
|
|
223
|
+
dateFrom,
|
|
337
224
|
dateTo,
|
|
338
225
|
limit = 20,
|
|
339
|
-
ftsConfig = 'simple',
|
|
340
226
|
} = {}) {
|
|
341
227
|
const clampedLimit = Math.max(1, Math.min(100, limit));
|
|
342
|
-
// FTS config is locked to 'simple' — the search_tsv trigger always uses
|
|
343
|
-
// to_tsvector('simple', ...) so query semantics must match. Warn callers
|
|
344
|
-
// that pass a different value rather than silently honouring it.
|
|
345
|
-
if (ftsConfig !== 'simple') {
|
|
346
|
-
console.warn(
|
|
347
|
-
`[aquifer/storage] searchSessions: ftsConfig '${ftsConfig}' ignored. ` +
|
|
348
|
-
`Only 'simple' is supported (index is built with simple tokenizer). ` +
|
|
349
|
-
`Using 'simple'.`
|
|
350
|
-
);
|
|
351
|
-
}
|
|
352
|
-
const safeFts = 'simple';
|
|
353
228
|
|
|
354
229
|
// Normalize agentId/agentIds
|
|
355
230
|
const agentIds = rawAgentIds && rawAgentIds.length > 0
|
|
356
231
|
? rawAgentIds
|
|
357
232
|
: (agentId ? [agentId] : null);
|
|
358
233
|
|
|
234
|
+
// Escape LIKE special characters in query
|
|
235
|
+
const likeQuery = query.replace(/[%_\\]/g, '\\$&');
|
|
236
|
+
|
|
237
|
+
// Primary: trigram ILIKE on search_text (works for CJK + Latin)
|
|
238
|
+
// Fallback: tsvector FTS (for installations without search_text populated)
|
|
359
239
|
const where = [
|
|
360
|
-
`ss.search_tsv @@ plainto_tsquery('
|
|
361
|
-
`s.tenant_id = $2`,
|
|
240
|
+
`(ss.search_text ILIKE '%' || $1 || '%' OR ss.search_tsv @@ plainto_tsquery('simple', $2))`,
|
|
241
|
+
`s.tenant_id = $3`,
|
|
362
242
|
];
|
|
363
|
-
const params = [query, tenantId];
|
|
243
|
+
const params = [likeQuery, query, tenantId];
|
|
364
244
|
|
|
365
245
|
if (agentIds) {
|
|
366
246
|
params.push(agentIds);
|
|
@@ -394,8 +274,10 @@ async function searchSessions(pool, query, {
|
|
|
394
274
|
ss.access_count,
|
|
395
275
|
ss.last_accessed_at,
|
|
396
276
|
ss.trust_score,
|
|
397
|
-
|
|
398
|
-
|
|
277
|
+
CASE WHEN ss.search_text IS NOT NULL
|
|
278
|
+
THEN similarity(ss.search_text, $2)
|
|
279
|
+
ELSE ts_rank(ss.search_tsv, plainto_tsquery('simple', $2))
|
|
280
|
+
END AS fts_rank
|
|
399
281
|
FROM ${qi(schema)}.sessions s
|
|
400
282
|
LEFT JOIN ${qi(schema)}.session_summaries ss ON ss.session_row_id = s.id
|
|
401
283
|
WHERE ${where.join(' AND ')}
|
|
@@ -414,7 +296,7 @@ async function recordAccess(pool, sessionRowIds, { schema } = {}) {
|
|
|
414
296
|
if (!sessionRowIds || sessionRowIds.length === 0) return;
|
|
415
297
|
await pool.query(
|
|
416
298
|
`UPDATE ${qi(schema)}.session_summaries
|
|
417
|
-
SET access_count = access_count + 1, last_accessed_at = now()
|
|
299
|
+
SET access_count = COALESCE(access_count, 0) + 1, last_accessed_at = now()
|
|
418
300
|
WHERE session_row_id = ANY($1)`,
|
|
419
301
|
[sessionRowIds]
|
|
420
302
|
);
|
|
@@ -643,12 +525,9 @@ async function recordFeedback(pool, {
|
|
|
643
525
|
|
|
644
526
|
module.exports = {
|
|
645
527
|
upsertSession,
|
|
646
|
-
upsertSegments,
|
|
647
528
|
upsertSummary,
|
|
648
529
|
markStatus,
|
|
649
|
-
persistProcessingResults,
|
|
650
530
|
getSession,
|
|
651
|
-
getSessionFull,
|
|
652
531
|
getMessages,
|
|
653
532
|
searchSessions,
|
|
654
533
|
recordAccess,
|
package/index.js
CHANGED
|
@@ -3,6 +3,5 @@
|
|
|
3
3
|
const { createAquifer } = require('./core/aquifer');
|
|
4
4
|
const { createEmbedder } = require('./pipeline/embed');
|
|
5
5
|
const { createReranker } = require('./pipeline/rerank');
|
|
6
|
-
const { normalizeSession, detectClient } = require('./pipeline/normalize');
|
|
7
6
|
|
|
8
|
-
module.exports = { createAquifer, createEmbedder, createReranker, normalizeSession, detectClient };
|
|
7
|
+
module.exports = { createAquifer, createEmbedder, createReranker };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@shadowforge0/aquifer-memory",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "PG-native long-term memory for AI agents. Turn-level embedding, hybrid RRF ranking, optional knowledge graph. MCP server, CLI, and library API.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"files": [
|
|
@@ -17,8 +17,6 @@
|
|
|
17
17
|
},
|
|
18
18
|
"exports": {
|
|
19
19
|
".": "./index.js",
|
|
20
|
-
"./core/*": "./core/*.js",
|
|
21
|
-
"./pipeline/*": "./pipeline/*.js",
|
|
22
20
|
"./consumers/mcp": "./consumers/mcp.js",
|
|
23
21
|
"./consumers/openclaw-plugin": "./consumers/openclaw-plugin.js",
|
|
24
22
|
"./consumers/opencode": "./consumers/opencode.js",
|
package/schema/001-base.sql
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
-- Usage: replace ${schema} with actual schema name (e.g., 'aquifer')
|
|
3
3
|
|
|
4
4
|
CREATE EXTENSION IF NOT EXISTS vector;
|
|
5
|
+
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
|
5
6
|
CREATE SCHEMA IF NOT EXISTS ${schema};
|
|
6
7
|
|
|
7
8
|
-- =========================================================================
|
|
@@ -85,6 +86,7 @@ CREATE TABLE IF NOT EXISTS ${schema}.session_summaries (
|
|
|
85
86
|
structured_summary JSONB NOT NULL DEFAULT '{}',
|
|
86
87
|
embedding vector,
|
|
87
88
|
search_tsv TSVECTOR,
|
|
89
|
+
search_text TEXT,
|
|
88
90
|
access_count INT NOT NULL DEFAULT 0,
|
|
89
91
|
last_accessed_at TIMESTAMPTZ,
|
|
90
92
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
@@ -96,6 +98,9 @@ CREATE INDEX IF NOT EXISTS idx_summaries_tenant
|
|
|
96
98
|
CREATE INDEX IF NOT EXISTS idx_summaries_search_tsv
|
|
97
99
|
ON ${schema}.session_summaries USING GIN (search_tsv);
|
|
98
100
|
|
|
101
|
+
CREATE INDEX IF NOT EXISTS idx_summaries_search_text_trgm
|
|
102
|
+
ON ${schema}.session_summaries USING GIN (search_text gin_trgm_ops);
|
|
103
|
+
|
|
99
104
|
CREATE INDEX IF NOT EXISTS idx_summaries_embedding
|
|
100
105
|
ON ${schema}.session_summaries (session_row_id)
|
|
101
106
|
WHERE embedding IS NOT NULL;
|
|
@@ -141,6 +146,11 @@ BEGIN
|
|
|
141
146
|
setweight(to_tsvector('simple', COALESCE(NEW.summary_text, '')), 'C') ||
|
|
142
147
|
setweight(to_tsvector('simple', open_loops_text || ' ' || facts_text), 'D');
|
|
143
148
|
|
|
149
|
+
NEW.search_text :=
|
|
150
|
+
title_text || ' ' || overview_text || ' ' || topics_text || ' ' ||
|
|
151
|
+
decisions_text || ' ' || COALESCE(NEW.summary_text, '') || ' ' ||
|
|
152
|
+
open_loops_text || ' ' || facts_text;
|
|
153
|
+
|
|
144
154
|
RETURN NEW;
|
|
145
155
|
END;
|
|
146
156
|
$$;
|
|
@@ -149,7 +159,7 @@ DROP TRIGGER IF EXISTS trg_session_summaries_search_tsv
|
|
|
149
159
|
ON ${schema}.session_summaries;
|
|
150
160
|
|
|
151
161
|
CREATE TRIGGER trg_session_summaries_search_tsv
|
|
152
|
-
BEFORE INSERT OR UPDATE OF summary_text, structured_summary
|
|
162
|
+
BEFORE INSERT OR UPDATE OF summary_text, structured_summary, search_text
|
|
153
163
|
ON ${schema}.session_summaries
|
|
154
164
|
FOR EACH ROW
|
|
155
165
|
EXECUTE FUNCTION ${schema}.session_summaries_search_tsv_update();
|
|
package/scripts/diagnose-fts-zh.js
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* FTS 中文診斷:檢查 'simple' tokenizer 在實際中文資料上的表現
|
|
5
|
+
*
|
|
6
|
+
* 測試項目:
|
|
7
|
+
* 1. FTS tokenization — 實際 token 長什麼樣
|
|
8
|
+
* 2. FTS recall — 常見中文查詢的命中率
|
|
9
|
+
* 3. FTS vs vector — FTS 有沒有在幫忙還是在拖後腿
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const { Pool } = require('pg');
|
|
13
|
+
|
|
14
|
+
const DB_URL = process.env.DATABASE_URL || 'postgresql://burk:790476@localhost:5432/openclaw_db';
|
|
15
|
+
const SCHEMA = process.env.AQUIFER_SCHEMA || 'miranda';
|
|
16
|
+
|
|
17
|
+
const pool = new Pool({ connectionString: DB_URL });
|
|
18
|
+
|
|
19
|
+
async function run() {
|
|
20
|
+
const qi = (s) => `"${s}"`;
|
|
21
|
+
|
|
22
|
+
console.log('=== FTS 中文診斷 ===\n');
|
|
23
|
+
|
|
24
|
+
// 1. 看 token 分佈
|
|
25
|
+
console.log('--- 1. Token 分析 ---');
|
|
26
|
+
const tokenSample = await pool.query(`
|
|
27
|
+
SELECT ss.session_id,
|
|
28
|
+
array_length(tsvector_to_array(ss.search_tsv), 1) as token_count,
|
|
29
|
+
left(ss.summary_text, 80) as preview
|
|
30
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
31
|
+
WHERE ss.search_tsv IS NOT NULL
|
|
32
|
+
ORDER BY ss.updated_at DESC
|
|
33
|
+
LIMIT 10
|
|
34
|
+
`);
|
|
35
|
+
|
|
36
|
+
let totalTokens = 0;
|
|
37
|
+
let sessionCount = 0;
|
|
38
|
+
for (const r of tokenSample.rows) {
|
|
39
|
+
totalTokens += r.token_count || 0;
|
|
40
|
+
sessionCount++;
|
|
41
|
+
console.log(` ${r.session_id?.slice(0, 8)} | ${r.token_count || 0} tokens | ${r.preview}`);
|
|
42
|
+
}
|
|
43
|
+
console.log(` avg: ${sessionCount ? Math.round(totalTokens / sessionCount) : 0} tokens/session\n`);
|
|
44
|
+
|
|
45
|
+
// 2. 看一個 session 的實際 token
|
|
46
|
+
console.log('--- 2. Token 範例(最近 session)---');
|
|
47
|
+
const tokenDetail = await pool.query(`
|
|
48
|
+
SELECT ss.session_id,
|
|
49
|
+
array_to_string(tsvector_to_array(ss.search_tsv), ' | ') as tokens
|
|
50
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
51
|
+
WHERE ss.search_tsv IS NOT NULL
|
|
52
|
+
ORDER BY ss.updated_at DESC
|
|
53
|
+
LIMIT 1
|
|
54
|
+
`);
|
|
55
|
+
if (tokenDetail.rows[0]) {
|
|
56
|
+
console.log(` session: ${tokenDetail.rows[0].session_id?.slice(0, 8)}`);
|
|
57
|
+
const tokens = tokenDetail.rows[0].tokens || '';
|
|
58
|
+
// 分類 token
|
|
59
|
+
const all = tokens.split(' | ');
|
|
60
|
+
const cjk = all.filter(t => /[\u4e00-\u9fff]/.test(t));
|
|
61
|
+
const latin = all.filter(t => /^[a-z0-9]/.test(t));
|
|
62
|
+
const other = all.filter(t => !(/[\u4e00-\u9fff]/.test(t)) && !(/^[a-z0-9]/.test(t)));
|
|
63
|
+
console.log(` total: ${all.length} | latin: ${latin.length} | cjk: ${cjk.length} | other: ${other.length}`);
|
|
64
|
+
console.log(` CJK tokens (前 20): ${cjk.slice(0, 20).join(' | ')}`);
|
|
65
|
+
console.log(` Latin tokens (前 20): ${latin.slice(0, 20).join(' | ')}\n`);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// 3. 中文查詢命中率測試
|
|
69
|
+
console.log('--- 3. 中文查詢 FTS 命中率 ---');
|
|
70
|
+
const testQueries = [
|
|
71
|
+
'afterburn',
|
|
72
|
+
'bootstrap',
|
|
73
|
+
'session',
|
|
74
|
+
'recall',
|
|
75
|
+
'記憶',
|
|
76
|
+
'修復',
|
|
77
|
+
'架構',
|
|
78
|
+
'時區',
|
|
79
|
+
'去重',
|
|
80
|
+
'daily entries',
|
|
81
|
+
'OpenCode',
|
|
82
|
+
'entity',
|
|
83
|
+
'Jenny',
|
|
84
|
+
'Aquifer',
|
|
85
|
+
'消化模式',
|
|
86
|
+
];
|
|
87
|
+
|
|
88
|
+
// 總 session 數
|
|
89
|
+
const totalResult = await pool.query(`
|
|
90
|
+
SELECT COUNT(*) as cnt FROM ${qi(SCHEMA)}.session_summaries WHERE search_tsv IS NOT NULL
|
|
91
|
+
`);
|
|
92
|
+
const totalSessions = parseInt(totalResult.rows[0].cnt);
|
|
93
|
+
console.log(` total sessions with FTS index: ${totalSessions}\n`);
|
|
94
|
+
|
|
95
|
+
for (const q of testQueries) {
|
|
96
|
+
const ftsResult = await pool.query(`
|
|
97
|
+
SELECT COUNT(*) as cnt
|
|
98
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
99
|
+
WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
|
|
100
|
+
`, [q]);
|
|
101
|
+
const ftsHits = parseInt(ftsResult.rows[0].cnt);
|
|
102
|
+
|
|
103
|
+
// 同時看 summary_text ILIKE 能找到幾筆(ground truth)
|
|
104
|
+
const ilikeResult = await pool.query(`
|
|
105
|
+
SELECT COUNT(*) as cnt
|
|
106
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
107
|
+
WHERE ss.summary_text ILIKE $1
|
|
108
|
+
OR ss.structured_summary::text ILIKE $1
|
|
109
|
+
`, [`%${q}%`]);
|
|
110
|
+
const ilikeHits = parseInt(ilikeResult.rows[0].cnt);
|
|
111
|
+
|
|
112
|
+
const ftsRecall = ilikeHits > 0 ? Math.round(ftsHits / ilikeHits * 100) : (ftsHits === 0 ? 100 : 0);
|
|
113
|
+
const status = ftsHits === ilikeHits ? '✓' : (ftsHits < ilikeHits ? '✗ MISS' : '?');
|
|
114
|
+
console.log(` "${q}" | FTS: ${ftsHits} | ILIKE: ${ilikeHits} | recall: ${ftsRecall}% | ${status}`);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// 4. FTS 對 RRF 的貢獻度
|
|
118
|
+
console.log('\n--- 4. FTS 在 hybrid search 中的貢獻度 ---');
|
|
119
|
+
// 跑幾個查詢,看 FTS 跟 vector 的 session 重疊率
|
|
120
|
+
const overlapQueries = ['afterburn', 'bootstrap', '記憶', 'recall', 'entity'];
|
|
121
|
+
for (const q of overlapQueries) {
|
|
122
|
+
const ftsResult = await pool.query(`
|
|
123
|
+
SELECT ss.session_id
|
|
124
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
125
|
+
JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
|
|
126
|
+
WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
|
|
127
|
+
AND s.processing_status = 'succeeded'
|
|
128
|
+
ORDER BY ts_rank(ss.search_tsv, plainto_tsquery('simple', $1)) DESC
|
|
129
|
+
LIMIT 10
|
|
130
|
+
`, [q]);
|
|
131
|
+
const ftsIds = new Set(ftsResult.rows.map(r => r.session_id));
|
|
132
|
+
|
|
133
|
+
// vector search (if embedding available)
|
|
134
|
+
const embResult = await pool.query(`
|
|
135
|
+
SELECT ss.session_id
|
|
136
|
+
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
137
|
+
JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
|
|
138
|
+
WHERE ss.embedding IS NOT NULL
|
|
139
|
+
AND s.processing_status = 'succeeded'
|
|
140
|
+
ORDER BY ss.embedding <=> (
|
|
141
|
+
SELECT ss2.embedding FROM ${qi(SCHEMA)}.session_summaries ss2
|
|
142
|
+
WHERE ss2.search_tsv @@ plainto_tsquery('simple', $1)
|
|
143
|
+
ORDER BY ts_rank(ss2.search_tsv, plainto_tsquery('simple', $1)) DESC
|
|
144
|
+
LIMIT 1
|
|
145
|
+
)
|
|
146
|
+
LIMIT 10
|
|
147
|
+
`, [q]);
|
|
148
|
+
const embIds = new Set(embResult.rows.map(r => r.session_id));
|
|
149
|
+
|
|
150
|
+
const overlap = [...ftsIds].filter(id => embIds.has(id)).length;
|
|
151
|
+
const ftsOnly = [...ftsIds].filter(id => !embIds.has(id)).length;
|
|
152
|
+
const embOnly = [...embIds].filter(id => !ftsIds.has(id)).length;
|
|
153
|
+
|
|
154
|
+
console.log(` "${q}" | FTS top10: ${ftsIds.size} | Vec top10: ${embIds.size} | overlap: ${overlap} | FTS-only: ${ftsOnly} | Vec-only: ${embOnly}`);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
await pool.end();
|
|
158
|
+
console.log('\n=== 完成 ===');
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
run().catch(err => { console.error(err); process.exit(1); });
|