@shadowforge0/aquifer-memory 1.0.2 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -20
- package/consumers/claude-code.js +117 -0
- package/consumers/cli.js +17 -0
- package/consumers/default/daily-entries.js +196 -0
- package/consumers/default/index.js +282 -0
- package/consumers/default/prompts/summary.js +153 -0
- package/consumers/mcp.js +3 -23
- package/consumers/miranda/context-inject.js +119 -0
- package/consumers/miranda/daily-entries.js +224 -0
- package/consumers/miranda/index.js +353 -0
- package/consumers/miranda/instance.js +55 -0
- package/consumers/miranda/llm.js +99 -0
- package/consumers/miranda/prompts/summary.js +303 -0
- package/consumers/miranda/recall-format.js +74 -0
- package/consumers/miranda/workspace-files.js +91 -0
- package/consumers/openclaw-ext/index.js +38 -0
- package/consumers/openclaw-ext/openclaw.plugin.json +9 -0
- package/consumers/openclaw-ext/package.json +10 -0
- package/consumers/openclaw-plugin.js +66 -74
- package/consumers/opencode.js +21 -24
- package/consumers/shared/autodetect.js +64 -0
- package/consumers/shared/entity-parser.js +119 -0
- package/consumers/shared/ingest.js +148 -0
- package/consumers/shared/llm-autodetect.js +137 -0
- package/consumers/shared/normalize.js +129 -0
- package/consumers/shared/recall-format.js +110 -0
- package/core/aquifer.js +200 -82
- package/core/entity.js +29 -17
- package/core/storage.js +116 -45
- package/docs/postprocess-contract.md +132 -0
- package/index.js +9 -1
- package/package.json +23 -2
- package/pipeline/_http.js +1 -1
- package/pipeline/consolidation/apply.js +176 -0
- package/pipeline/consolidation/index.js +21 -0
- package/pipeline/extract-entities.js +2 -2
- package/pipeline/rerank.js +1 -1
- package/pipeline/summarize.js +4 -1
- package/schema/001-base.sql +61 -24
- package/schema/002-entities.sql +17 -3
- package/schema/004-facts.sql +67 -0
- package/scripts/diagnose-fts-zh.js +168 -134
- package/scripts/diagnose-vector.js +188 -0
- package/scripts/install-openclaw.sh +59 -0
- package/scripts/smoke.mjs +2 -2
package/pipeline/summarize.js
CHANGED
|
@@ -206,6 +206,9 @@ async function summarize(messages, {
|
|
|
206
206
|
try {
|
|
207
207
|
const prompt = buildPrompt(messages, { mergeEntities });
|
|
208
208
|
const response = await llmFn(prompt);
|
|
209
|
+
if (typeof response !== 'string' || response.trim() === '') {
|
|
210
|
+
return extractiveFallback(messages);
|
|
211
|
+
}
|
|
209
212
|
|
|
210
213
|
// Parse structured fields
|
|
211
214
|
const structuredSummary = _parseStructuredSummary(response);
|
|
@@ -232,7 +235,7 @@ async function summarize(messages, {
|
|
|
232
235
|
entityRaw,
|
|
233
236
|
isExtractive: false,
|
|
234
237
|
};
|
|
235
|
-
} catch
|
|
238
|
+
} catch {
|
|
236
239
|
// LLM failure: fall back to extractive
|
|
237
240
|
return extractiveFallback(messages);
|
|
238
241
|
}
|
package/schema/001-base.sql
CHANGED
|
@@ -43,27 +43,6 @@ CREATE INDEX IF NOT EXISTS idx_sessions_processing_status
|
|
|
43
43
|
ON ${schema}.sessions (processing_status)
|
|
44
44
|
WHERE processing_status IN ('pending', 'processing');
|
|
45
45
|
|
|
46
|
-
-- =========================================================================
|
|
47
|
-
-- Session segments: conversation boundary metadata
|
|
48
|
-
-- =========================================================================
|
|
49
|
-
CREATE TABLE IF NOT EXISTS ${schema}.session_segments (
|
|
50
|
-
id BIGSERIAL PRIMARY KEY,
|
|
51
|
-
session_row_id BIGINT NOT NULL REFERENCES ${schema}.sessions(id) ON DELETE CASCADE,
|
|
52
|
-
segment_no INT NOT NULL,
|
|
53
|
-
start_msg_idx INT,
|
|
54
|
-
end_msg_idx INT,
|
|
55
|
-
started_at TIMESTAMPTZ,
|
|
56
|
-
ended_at TIMESTAMPTZ,
|
|
57
|
-
raw_msg_count INT NOT NULL DEFAULT 0,
|
|
58
|
-
effective_msg_count INT NOT NULL DEFAULT 0,
|
|
59
|
-
boundary_type TEXT,
|
|
60
|
-
boundary_meta JSONB NOT NULL DEFAULT '{}',
|
|
61
|
-
UNIQUE (session_row_id, segment_no)
|
|
62
|
-
);
|
|
63
|
-
|
|
64
|
-
CREATE INDEX IF NOT EXISTS idx_session_segments_row
|
|
65
|
-
ON ${schema}.session_segments (session_row_id);
|
|
66
|
-
|
|
67
46
|
-- =========================================================================
|
|
68
47
|
-- Session summaries: LLM-generated or extractive summaries
|
|
69
48
|
-- =========================================================================
|
|
@@ -78,8 +57,6 @@ CREATE TABLE IF NOT EXISTS ${schema}.session_summaries (
|
|
|
78
57
|
message_count INT NOT NULL DEFAULT 0,
|
|
79
58
|
user_message_count INT NOT NULL DEFAULT 0,
|
|
80
59
|
assistant_message_count INT NOT NULL DEFAULT 0,
|
|
81
|
-
boundary_count INT NOT NULL DEFAULT 0,
|
|
82
|
-
fresh_tail_count INT NOT NULL DEFAULT 0,
|
|
83
60
|
started_at TIMESTAMPTZ,
|
|
84
61
|
ended_at TIMESTAMPTZ,
|
|
85
62
|
summary_text TEXT,
|
|
@@ -92,6 +69,23 @@ CREATE TABLE IF NOT EXISTS ${schema}.session_summaries (
|
|
|
92
69
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
93
70
|
);
|
|
94
71
|
|
|
72
|
+
-- Cleanup legacy segment-era schema artifacts so migrate() converges old installs.
|
|
73
|
+
-- Wrapped because the implicit sequence on session_segments can be referenced from
|
|
74
|
+
-- other schemas (e.g. bench/staging created via CREATE TABLE LIKE), which would
|
|
75
|
+
-- otherwise hard-fail the migration. Operators get a NOTICE and must decouple
|
|
76
|
+
-- dependents themselves before the table will actually drop.
|
|
77
|
+
DO $$
|
|
78
|
+
BEGIN
|
|
79
|
+
BEGIN
|
|
80
|
+
DROP TABLE IF EXISTS ${schema}.session_segments;
|
|
81
|
+
EXCEPTION
|
|
82
|
+
WHEN dependent_objects_still_exist THEN
|
|
83
|
+
RAISE NOTICE '[aquifer] skipped session_segments drop: %; decouple cross-schema dependents and re-run migrate to complete cleanup', SQLERRM;
|
|
84
|
+
END;
|
|
85
|
+
END$$;
|
|
86
|
+
ALTER TABLE ${schema}.session_summaries DROP COLUMN IF EXISTS boundary_count;
|
|
87
|
+
ALTER TABLE ${schema}.session_summaries DROP COLUMN IF EXISTS fresh_tail_count;
|
|
88
|
+
|
|
95
89
|
CREATE INDEX IF NOT EXISTS idx_summaries_tenant
|
|
96
90
|
ON ${schema}.session_summaries (tenant_id);
|
|
97
91
|
|
|
@@ -105,6 +99,27 @@ CREATE INDEX IF NOT EXISTS idx_summaries_embedding
|
|
|
105
99
|
ON ${schema}.session_summaries (session_row_id)
|
|
106
100
|
WHERE embedding IS NOT NULL;
|
|
107
101
|
|
|
102
|
+
-- HNSW approximate nearest-neighbor index for cosine-distance vector search.
|
|
103
|
+
-- Without this, ORDER BY embedding <=> $vec degrades to seq scan at scale.
|
|
104
|
+
-- Requires pgvector >= 0.5.0. HNSW cannot build on an empty unsized `vector`
|
|
105
|
+
-- column (can't infer dim), so we defer on failure — re-running migrate()
|
|
106
|
+
-- after the first insert will finish the job.
|
|
107
|
+
DO $$
|
|
108
|
+
BEGIN
|
|
109
|
+
BEGIN
|
|
110
|
+
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_summaries_embedding_hnsw ON ${schema}.session_summaries USING hnsw (embedding vector_cosine_ops)';
|
|
111
|
+
EXCEPTION
|
|
112
|
+
WHEN invalid_parameter_value THEN
|
|
113
|
+
RAISE NOTICE '[aquifer] HNSW index on session_summaries.embedding deferred; re-run migrate() after the first embedded row';
|
|
114
|
+
WHEN feature_not_supported THEN
|
|
115
|
+
RAISE NOTICE '[aquifer] HNSW not available on this pgvector; upgrade to >= 0.5.0 for index-accelerated vector search';
|
|
116
|
+
WHEN out_of_memory THEN
|
|
117
|
+
RAISE WARNING '[aquifer] HNSW build on session_summaries.embedding ran out of memory; raise maintenance_work_mem and re-run migrate()';
|
|
118
|
+
WHEN program_limit_exceeded THEN
|
|
119
|
+
RAISE WARNING '[aquifer] HNSW build on session_summaries.embedding exceeded an internal limit; inspect pgvector logs';
|
|
120
|
+
END;
|
|
121
|
+
END$$;
|
|
122
|
+
|
|
108
123
|
-- FTS trigger: auto-update search_tsv on INSERT/UPDATE
|
|
109
124
|
CREATE OR REPLACE FUNCTION ${schema}.session_summaries_search_tsv_update()
|
|
110
125
|
RETURNS trigger
|
|
@@ -158,8 +173,12 @@ $$;
|
|
|
158
173
|
DROP TRIGGER IF EXISTS trg_session_summaries_search_tsv
|
|
159
174
|
ON ${schema}.session_summaries;
|
|
160
175
|
|
|
176
|
+
-- Trigger fires on input-column changes only. search_text is a trigger output
|
|
177
|
+
-- (derived from structured_summary + summary_text) and listing it here was
|
|
178
|
+
-- redundant — PG's BEFORE semantics already prevent the assignment inside the
|
|
179
|
+
-- trigger body from re-firing the trigger.
|
|
161
180
|
CREATE TRIGGER trg_session_summaries_search_tsv
|
|
162
|
-
BEFORE INSERT OR UPDATE OF summary_text, structured_summary
|
|
181
|
+
BEFORE INSERT OR UPDATE OF summary_text, structured_summary
|
|
163
182
|
ON ${schema}.session_summaries
|
|
164
183
|
FOR EACH ROW
|
|
165
184
|
EXECUTE FUNCTION ${schema}.session_summaries_search_tsv_update();
|
|
@@ -189,3 +208,21 @@ CREATE INDEX IF NOT EXISTS idx_turn_emb_session_row
|
|
|
189
208
|
|
|
190
209
|
CREATE INDEX IF NOT EXISTS idx_turn_emb_tenant_agent
|
|
191
210
|
ON ${schema}.turn_embeddings (tenant_id, agent_id, source);
|
|
211
|
+
|
|
212
|
+
-- HNSW approximate nearest-neighbor index for turn-level vector search.
|
|
213
|
+
-- See notes on session_summaries.embedding HNSW above.
|
|
214
|
+
DO $$
|
|
215
|
+
BEGIN
|
|
216
|
+
BEGIN
|
|
217
|
+
EXECUTE 'CREATE INDEX IF NOT EXISTS idx_turn_emb_embedding_hnsw ON ${schema}.turn_embeddings USING hnsw (embedding vector_cosine_ops)';
|
|
218
|
+
EXCEPTION
|
|
219
|
+
WHEN invalid_parameter_value THEN
|
|
220
|
+
RAISE NOTICE '[aquifer] HNSW index on turn_embeddings.embedding deferred; re-run migrate() after the first embedded row';
|
|
221
|
+
WHEN feature_not_supported THEN
|
|
222
|
+
RAISE NOTICE '[aquifer] HNSW not available on this pgvector; upgrade to >= 0.5.0 for index-accelerated vector search';
|
|
223
|
+
WHEN out_of_memory THEN
|
|
224
|
+
RAISE WARNING '[aquifer] HNSW build on turn_embeddings.embedding ran out of memory; raise maintenance_work_mem and re-run migrate()';
|
|
225
|
+
WHEN program_limit_exceeded THEN
|
|
226
|
+
RAISE WARNING '[aquifer] HNSW build on turn_embeddings.embedding exceeded an internal limit; inspect pgvector logs';
|
|
227
|
+
END;
|
|
228
|
+
END$$;
|
package/schema/002-entities.sql
CHANGED
|
@@ -28,10 +28,24 @@ CREATE TABLE IF NOT EXISTS ${schema}.entities (
|
|
|
28
28
|
last_seen_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
29
29
|
);
|
|
30
30
|
|
|
31
|
-
-- Migration: add entity_scope if missing (idempotent)
|
|
32
|
-
-- For upgrades: backfill from agent_id
|
|
31
|
+
-- Migration: add entity_scope if missing (idempotent, scope-corruption-safe).
|
|
32
|
+
-- For upgrades: backfill from agent_id ONLY on the first run of this migration,
|
|
33
|
+
-- detected via the column still being NULL-able. Once SET NOT NULL below fires,
|
|
34
|
+
-- subsequent runs skip the backfill so operator-assigned 'default' values are
|
|
35
|
+
-- never clobbered.
|
|
33
36
|
ALTER TABLE ${schema}.entities ADD COLUMN IF NOT EXISTS entity_scope TEXT DEFAULT 'default';
|
|
34
|
-
|
|
37
|
+
DO $$
|
|
38
|
+
BEGIN
|
|
39
|
+
IF EXISTS (
|
|
40
|
+
SELECT 1 FROM information_schema.columns
|
|
41
|
+
WHERE table_schema = '${schema}' AND table_name = 'entities'
|
|
42
|
+
AND column_name = 'entity_scope' AND is_nullable = 'YES'
|
|
43
|
+
) THEN
|
|
44
|
+
UPDATE ${schema}.entities
|
|
45
|
+
SET entity_scope = agent_id
|
|
46
|
+
WHERE entity_scope IS NULL OR entity_scope = 'default';
|
|
47
|
+
END IF;
|
|
48
|
+
END$$;
|
|
35
49
|
ALTER TABLE ${schema}.entities ALTER COLUMN entity_scope SET NOT NULL;
|
|
36
50
|
|
|
37
51
|
-- Unique constraint: entity identity is (tenant, name, scope)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
-- Aquifer facts / consolidation extension
|
|
2
|
+
-- Requires: 001-base.sql applied first
|
|
3
|
+
-- Usage: replace ${schema} with actual schema name
|
|
4
|
+
--
|
|
5
|
+
-- Facts store long-lived subject/statement pairs with a lifecycle:
|
|
6
|
+
-- candidate → active → (stale | archived | superseded)
|
|
7
|
+
-- Consumers write candidates during enrich (via writeFactCandidates).
|
|
8
|
+
-- consolidate() then promotes / updates / confirms / archives them.
|
|
9
|
+
|
|
10
|
+
-- =========================================================================
|
|
11
|
+
-- Facts: long-lived current-state statements per (subject, agent)
|
|
12
|
+
-- =========================================================================
|
|
13
|
+
CREATE TABLE IF NOT EXISTS ${schema}.facts (
|
|
14
|
+
id BIGSERIAL PRIMARY KEY,
|
|
15
|
+
tenant_id TEXT NOT NULL DEFAULT 'default',
|
|
16
|
+
subject_key TEXT NOT NULL,
|
|
17
|
+
subject_label TEXT NOT NULL,
|
|
18
|
+
statement TEXT NOT NULL,
|
|
19
|
+
status TEXT NOT NULL DEFAULT 'candidate'
|
|
20
|
+
CHECK (status IN ('candidate','active','stale','archived','superseded')),
|
|
21
|
+
importance SMALLINT NOT NULL DEFAULT 5,
|
|
22
|
+
source_session_id TEXT,
|
|
23
|
+
agent_id TEXT NOT NULL DEFAULT 'main',
|
|
24
|
+
evidence JSONB NOT NULL DEFAULT '[]'::jsonb,
|
|
25
|
+
superseded_by BIGINT REFERENCES ${schema}.facts(id) ON DELETE SET NULL,
|
|
26
|
+
first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
27
|
+
last_confirmed_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
28
|
+
);
|
|
29
|
+
|
|
30
|
+
-- Migration: add tenant_id if upgrading from a legacy facts table (no tenant column).
|
|
31
|
+
ALTER TABLE ${schema}.facts ADD COLUMN IF NOT EXISTS tenant_id TEXT NOT NULL DEFAULT 'default';
|
|
32
|
+
|
|
33
|
+
-- At most one active fact per (tenant, subject, agent)
|
|
34
|
+
CREATE UNIQUE INDEX IF NOT EXISTS idx_facts_active_subject
|
|
35
|
+
ON ${schema}.facts (tenant_id, subject_key, agent_id)
|
|
36
|
+
WHERE status = 'active';
|
|
37
|
+
|
|
38
|
+
CREATE INDEX IF NOT EXISTS idx_facts_active_agent
|
|
39
|
+
ON ${schema}.facts (tenant_id, agent_id, importance DESC, last_confirmed_at DESC)
|
|
40
|
+
WHERE status = 'active';
|
|
41
|
+
|
|
42
|
+
CREATE INDEX IF NOT EXISTS idx_facts_status_created
|
|
43
|
+
ON ${schema}.facts (tenant_id, status, first_seen_at DESC);
|
|
44
|
+
|
|
45
|
+
CREATE INDEX IF NOT EXISTS idx_facts_subject
|
|
46
|
+
ON ${schema}.facts (tenant_id, subject_key);
|
|
47
|
+
|
|
48
|
+
CREATE INDEX IF NOT EXISTS idx_facts_source_session
|
|
49
|
+
ON ${schema}.facts (source_session_id)
|
|
50
|
+
WHERE source_session_id IS NOT NULL;
|
|
51
|
+
|
|
52
|
+
COMMENT ON TABLE ${schema}.facts IS 'Fact candidates and active facts per (tenant, subject, agent) with consolidation lifecycle';
|
|
53
|
+
|
|
54
|
+
-- =========================================================================
|
|
55
|
+
-- Fact ↔ Entity join (optional, only when entities enabled)
|
|
56
|
+
-- =========================================================================
|
|
57
|
+
CREATE TABLE IF NOT EXISTS ${schema}.fact_entities (
|
|
58
|
+
id BIGSERIAL PRIMARY KEY,
|
|
59
|
+
fact_id BIGINT NOT NULL REFERENCES ${schema}.facts(id) ON DELETE CASCADE,
|
|
60
|
+
entity_id BIGINT NOT NULL,
|
|
61
|
+
UNIQUE (fact_id, entity_id)
|
|
62
|
+
);
|
|
63
|
+
|
|
64
|
+
CREATE INDEX IF NOT EXISTS idx_fact_entities_entity_id
|
|
65
|
+
ON ${schema}.fact_entities (entity_id);
|
|
66
|
+
|
|
67
|
+
COMMENT ON TABLE ${schema}.fact_entities IS 'Join table linking facts to entities (FK to entities is soft — entities table is optional)';
|
|
@@ -1,161 +1,195 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* FTS
|
|
4
|
+
* Aquifer FTS 中文診斷
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* 測 aquifer 實際搜尋主路徑(trigram ILIKE on search_text + similarity ranking)
|
|
7
|
+
* vs fallback 路徑(tsvector @@ plainto_tsquery('simple', q))對中文 query 的表現。
|
|
8
|
+
*
|
|
9
|
+
* env:
|
|
10
|
+
* DATABASE_URL — required
|
|
11
|
+
* AQUIFER_SCHEMA — default 'public'
|
|
12
|
+
* DIAGNOSE_QUERIES — comma-separated, overrides built-in set
|
|
10
13
|
*/
|
|
11
14
|
|
|
12
15
|
const { Pool } = require('pg');
|
|
13
16
|
|
|
14
|
-
const DB_URL = process.env.DATABASE_URL
|
|
15
|
-
|
|
17
|
+
const DB_URL = process.env.DATABASE_URL;
|
|
18
|
+
if (!DB_URL) {
|
|
19
|
+
console.error('DATABASE_URL is required');
|
|
20
|
+
process.exit(2);
|
|
21
|
+
}
|
|
22
|
+
const SCHEMA = process.env.AQUIFER_SCHEMA || 'public';
|
|
23
|
+
|
|
24
|
+
const DEFAULT_QUERIES = [
|
|
25
|
+
// latin
|
|
26
|
+
'afterburn', 'bootstrap', 'session', 'recall', 'entity', 'OpenCode', 'Jenny', 'Aquifer',
|
|
27
|
+
// CJK short tokens — 最容易暴露 tokenizer 問題
|
|
28
|
+
'記憶', '時區', '去重', '架構', '修復',
|
|
29
|
+
// CJK phrase
|
|
30
|
+
'消化模式', 'daily entries',
|
|
31
|
+
];
|
|
32
|
+
const QUERIES = process.env.DIAGNOSE_QUERIES
|
|
33
|
+
? process.env.DIAGNOSE_QUERIES.split(',').map(s => s.trim()).filter(Boolean)
|
|
34
|
+
: DEFAULT_QUERIES;
|
|
16
35
|
|
|
17
36
|
const pool = new Pool({ connectionString: DB_URL });
|
|
37
|
+
// Wrap a SQL identifier in double quotes, doubling every embedded double quote
// so the value is always read as a single quoted identifier.
const qi = (s) => {
  const escaped = s.replace(/"/g, '""');
  return `"${escaped}"`;
};
|
|
18
38
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
// 1. 看 token 分佈
|
|
25
|
-
console.log('--- 1. Token 分析 ---');
|
|
26
|
-
const tokenSample = await pool.query(`
|
|
27
|
-
SELECT ss.session_id,
|
|
28
|
-
array_length(tsvector_to_array(ss.search_tsv), 1) as token_count,
|
|
29
|
-
left(ss.summary_text, 80) as preview
|
|
30
|
-
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
31
|
-
WHERE ss.search_tsv IS NOT NULL
|
|
32
|
-
ORDER BY ss.updated_at DESC
|
|
33
|
-
LIMIT 10
|
|
34
|
-
`);
|
|
35
|
-
|
|
36
|
-
let totalTokens = 0;
|
|
37
|
-
let sessionCount = 0;
|
|
38
|
-
for (const r of tokenSample.rows) {
|
|
39
|
-
totalTokens += r.token_count || 0;
|
|
40
|
-
sessionCount++;
|
|
41
|
-
console.log(` ${r.session_id?.slice(0, 8)} | ${r.token_count || 0} tokens | ${r.preview}`);
|
|
42
|
-
}
|
|
43
|
-
console.log(` avg: ${sessionCount ? Math.round(totalTokens / sessionCount) : 0} tokens/session\n`);
|
|
39
|
+
/**
 * Format `n / d` as a whole-number percentage string for the report tables.
 *
 * Both arguments are coerced with Number() before any comparison. This script
 * passes raw COUNT(*) values straight from query rows, and those arrive as
 * strings (the script parseInt()s them elsewhere). A strict `d === 0` check on
 * the raw value never matches '0', so an empty table would print "NaN%".
 *
 * @param {number|string} n - Numerator (e.g. hit count).
 * @param {number|string} d - Denominator (e.g. total row count).
 * @returns {string} '—' when both are zero, '∞%' when only the denominator is
 *   zero, otherwise the rounded percentage such as '42%'.
 */
function pct(n, d) {
  const num = Number(n);
  const den = Number(d);
  if (den === 0) return num === 0 ? '—' : '∞%';
  return `${Math.round((num / den) * 100)}%`;
}
|
|
44
43
|
|
|
45
|
-
|
|
46
|
-
console.log(
|
|
44
|
+
async function main() {
|
|
45
|
+
console.log(`=== Aquifer FTS 中文診斷 (schema=${SCHEMA}) ===\n`);
|
|
46
|
+
|
|
47
|
+
// -------------------------------------------------------------------------
|
|
48
|
+
// 0. 覆蓋率:search_text NULL 率 → 看 fallback 觸發比例
|
|
49
|
+
// -------------------------------------------------------------------------
|
|
50
|
+
const cov = await pool.query(`
|
|
51
|
+
SELECT
|
|
52
|
+
COUNT(*) AS total,
|
|
53
|
+
COUNT(*) FILTER (WHERE search_text IS NOT NULL) AS with_text,
|
|
54
|
+
COUNT(*) FILTER (WHERE search_tsv IS NOT NULL) AS with_tsv,
|
|
55
|
+
COUNT(*) FILTER (WHERE search_text IS NULL
|
|
56
|
+
AND search_tsv IS NOT NULL) AS tsv_only
|
|
57
|
+
FROM ${qi(SCHEMA)}.session_summaries
|
|
58
|
+
`);
|
|
59
|
+
const c = cov.rows[0];
|
|
60
|
+
console.log('--- 0. 搜尋欄位覆蓋率 ---');
|
|
61
|
+
console.log(` total rows : ${c.total}`);
|
|
62
|
+
console.log(` has search_text : ${c.with_text} (${pct(c.with_text, c.total)})`);
|
|
63
|
+
console.log(` has search_tsv : ${c.with_tsv} (${pct(c.with_tsv, c.total)})`);
|
|
64
|
+
console.log(` tsv-only (NULL search_text, falls back to FTS): ${c.tsv_only} (${pct(c.tsv_only, c.total)})\n`);
|
|
65
|
+
|
|
66
|
+
// -------------------------------------------------------------------------
|
|
67
|
+
// 1. Token 範例(tsvector lexeme 粒度觀察)
|
|
68
|
+
// -------------------------------------------------------------------------
|
|
69
|
+
console.log('--- 1. tsvector lexeme 粒度範例(最近 1 筆)---');
|
|
47
70
|
const tokenDetail = await pool.query(`
|
|
48
|
-
SELECT
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
71
|
+
SELECT session_id,
|
|
72
|
+
array_length(tsvector_to_array(search_tsv), 1) AS token_count,
|
|
73
|
+
array_to_string(tsvector_to_array(search_tsv), ' | ') AS tokens
|
|
74
|
+
FROM ${qi(SCHEMA)}.session_summaries
|
|
75
|
+
WHERE search_tsv IS NOT NULL
|
|
76
|
+
ORDER BY updated_at DESC
|
|
53
77
|
LIMIT 1
|
|
54
78
|
`);
|
|
55
79
|
if (tokenDetail.rows[0]) {
|
|
56
|
-
|
|
57
|
-
const
|
|
58
|
-
// 分類 token
|
|
59
|
-
const all = tokens.split(' | ');
|
|
80
|
+
const r = tokenDetail.rows[0];
|
|
81
|
+
const all = (r.tokens || '').split(' | ').filter(Boolean);
|
|
60
82
|
const cjk = all.filter(t => /[\u4e00-\u9fff]/.test(t));
|
|
61
83
|
const latin = all.filter(t => /^[a-z0-9]/.test(t));
|
|
62
|
-
|
|
63
|
-
console.log(`
|
|
64
|
-
console.log(` CJK
|
|
65
|
-
console.log(`
|
|
84
|
+
console.log(` session: ${String(r.session_id).slice(0, 8)} | total tokens: ${r.token_count || 0}`);
|
|
85
|
+
console.log(` latin: ${latin.length} | cjk-containing: ${cjk.length}`);
|
|
86
|
+
console.log(` CJK lexemes (前 15): ${cjk.slice(0, 15).join(' | ')}`);
|
|
87
|
+
console.log(` → CJK lexeme 若是 phrase 級(整句無空白),簡 tokenizer 對中文短 query 會 miss\n`);
|
|
88
|
+
} else {
|
|
89
|
+
console.log(' (no rows)\n');
|
|
66
90
|
}
|
|
67
91
|
|
|
68
|
-
//
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
const ftsResult = await pool.query(`
|
|
97
|
-
SELECT COUNT(*) as cnt
|
|
98
|
-
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
99
|
-
WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
|
|
100
|
-
`, [q]);
|
|
101
|
-
const ftsHits = parseInt(ftsResult.rows[0].cnt);
|
|
102
|
-
|
|
103
|
-
// 同時看 summary_text ILIKE 能找到幾筆(ground truth)
|
|
104
|
-
const ilikeResult = await pool.query(`
|
|
105
|
-
SELECT COUNT(*) as cnt
|
|
106
|
-
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
107
|
-
WHERE ss.summary_text ILIKE $1
|
|
108
|
-
OR ss.structured_summary::text ILIKE $1
|
|
109
|
-
`, [`%${q}%`]);
|
|
110
|
-
const ilikeHits = parseInt(ilikeResult.rows[0].cnt);
|
|
111
|
-
|
|
112
|
-
const ftsRecall = ilikeHits > 0 ? Math.round(ftsHits / ilikeHits * 100) : (ftsHits === 0 ? 100 : 0);
|
|
113
|
-
const status = ftsHits === ilikeHits ? '✓' : (ftsHits < ilikeHits ? '✗ MISS' : '?');
|
|
114
|
-
console.log(` "${q}" | FTS: ${ftsHits} | ILIKE: ${ilikeHits} | recall: ${ftsRecall}% | ${status}`);
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
// 4. FTS 對 RRF 的貢獻度
|
|
118
|
-
console.log('\n--- 4. FTS 在 hybrid search 中的貢獻度 ---');
|
|
119
|
-
// 跑幾個查詢,看 FTS 跟 vector 的 session 重疊率
|
|
120
|
-
const overlapQueries = ['afterburn', 'bootstrap', '記憶', 'recall', 'entity'];
|
|
121
|
-
for (const q of overlapQueries) {
|
|
122
|
-
const ftsResult = await pool.query(`
|
|
123
|
-
SELECT ss.session_id
|
|
124
|
-
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
125
|
-
JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
|
|
126
|
-
WHERE ss.search_tsv @@ plainto_tsquery('simple', $1)
|
|
127
|
-
AND s.processing_status = 'succeeded'
|
|
128
|
-
ORDER BY ts_rank(ss.search_tsv, plainto_tsquery('simple', $1)) DESC
|
|
129
|
-
LIMIT 10
|
|
130
|
-
`, [q]);
|
|
131
|
-
const ftsIds = new Set(ftsResult.rows.map(r => r.session_id));
|
|
132
|
-
|
|
133
|
-
// vector search (if embedding available)
|
|
134
|
-
const embResult = await pool.query(`
|
|
135
|
-
SELECT ss.session_id
|
|
136
|
-
FROM ${qi(SCHEMA)}.session_summaries ss
|
|
137
|
-
JOIN ${qi(SCHEMA)}.sessions s ON s.id = ss.session_row_id
|
|
138
|
-
WHERE ss.embedding IS NOT NULL
|
|
139
|
-
AND s.processing_status = 'succeeded'
|
|
140
|
-
ORDER BY ss.embedding <=> (
|
|
141
|
-
SELECT ss2.embedding FROM ${qi(SCHEMA)}.session_summaries ss2
|
|
142
|
-
WHERE ss2.search_tsv @@ plainto_tsquery('simple', $1)
|
|
143
|
-
ORDER BY ts_rank(ss2.search_tsv, plainto_tsquery('simple', $1)) DESC
|
|
144
|
-
LIMIT 1
|
|
92
|
+
// -------------------------------------------------------------------------
|
|
93
|
+
// 2. 主路徑 vs fallback:binary match 比對
|
|
94
|
+
//
|
|
95
|
+
// Ground truth = search_text ILIKE '%q%'(所有源欄位拼出的純文字 superset)
|
|
96
|
+
// 主路徑 = search_text ILIKE(GIN trgm 加速,語意等價 ILIKE)
|
|
97
|
+
// Fallback = search_tsv @@ plainto_tsquery('simple', q)
|
|
98
|
+
// -------------------------------------------------------------------------
|
|
99
|
+
console.log('--- 2. 主路徑(trigram)vs fallback(tsvector)binary match ---');
|
|
100
|
+
console.log(' query | truth | trgm | tsv | trgm% | tsv% | tsv-extra');
|
|
101
|
+
console.log(' ' + '-'.repeat(82));
|
|
102
|
+
|
|
103
|
+
const rowCount = await pool.query(
|
|
104
|
+
`SELECT COUNT(*) AS n FROM ${qi(SCHEMA)}.session_summaries WHERE search_text IS NOT NULL`
|
|
105
|
+
);
|
|
106
|
+
const withTextN = parseInt(rowCount.rows[0].n, 10);
|
|
107
|
+
console.log(` (ground truth 基數:含 search_text 的 row ${withTextN})`);
|
|
108
|
+
|
|
109
|
+
const summary = [];
|
|
110
|
+
for (const q of QUERIES) {
|
|
111
|
+
const r = await pool.query(
|
|
112
|
+
`
|
|
113
|
+
WITH base AS (
|
|
114
|
+
SELECT search_text,
|
|
115
|
+
search_tsv,
|
|
116
|
+
(search_text ILIKE '%' || $1 || '%') AS trgm_hit,
|
|
117
|
+
(search_tsv @@ plainto_tsquery('simple', $2)) AS tsv_hit
|
|
118
|
+
FROM ${qi(SCHEMA)}.session_summaries
|
|
119
|
+
WHERE search_text IS NOT NULL
|
|
145
120
|
)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
121
|
+
SELECT
|
|
122
|
+
COUNT(*) FILTER (WHERE trgm_hit) AS truth,
|
|
123
|
+
COUNT(*) FILTER (WHERE trgm_hit) AS trgm,
|
|
124
|
+
COUNT(*) FILTER (WHERE tsv_hit) AS tsv,
|
|
125
|
+
COUNT(*) FILTER (WHERE tsv_hit AND NOT trgm_hit) AS tsv_extra
|
|
126
|
+
FROM base
|
|
127
|
+
`,
|
|
128
|
+
[q.replace(/[%_\\]/g, '\\$&'), q]
|
|
129
|
+
);
|
|
130
|
+
const { truth, trgm, tsv, tsv_extra } = r.rows[0];
|
|
131
|
+
const T = parseInt(truth, 10);
|
|
132
|
+
const A = parseInt(trgm, 10);
|
|
133
|
+
const B = parseInt(tsv, 10);
|
|
134
|
+
const E = parseInt(tsv_extra, 10);
|
|
135
|
+
summary.push({ q, T, A, B, E });
|
|
136
|
+
console.log(
|
|
137
|
+
` ${q.padEnd(19)} | ${String(T).padStart(5)} | ${String(A).padStart(5)} | ${String(B).padStart(5)} | ${pct(A, T).padStart(5)} | ${pct(B, T).padStart(5)} | ${String(E).padStart(5)}`
|
|
138
|
+
);
|
|
139
|
+
}
|
|
140
|
+
console.log(' (tsv-extra = tsvector 命中但 trigram 沒命中 → 通常是 0,代表 tsv 對整體搜尋無額外貢獻)\n');
|
|
141
|
+
|
|
142
|
+
// -------------------------------------------------------------------------
|
|
143
|
+
// 3. Ranking 品質對比:舊 ranking (similarity only) vs 新 ranking (substr-hit first)
|
|
144
|
+
// -------------------------------------------------------------------------
|
|
145
|
+
console.log('--- 3. Ranking 品質對比:top-5 substring-hit 命中率 ---');
|
|
146
|
+
console.log(' query | truth | old (sim only) | new (hit+sim)');
|
|
147
|
+
console.log(' ' + '-'.repeat(70));
|
|
148
|
+
for (const q of QUERIES) {
|
|
149
|
+
const like = q.replace(/[%_\\]/g, '\\$&');
|
|
150
|
+
const truthR = await pool.query(
|
|
151
|
+
`SELECT COUNT(*) AS n
|
|
152
|
+
FROM ${qi(SCHEMA)}.session_summaries
|
|
153
|
+
WHERE search_text ILIKE '%' || $1 || '%'`,
|
|
154
|
+
[like]
|
|
155
|
+
);
|
|
156
|
+
const T = parseInt(truthR.rows[0].n, 10);
|
|
157
|
+
|
|
158
|
+
const oldR = await pool.query(
|
|
159
|
+
`
|
|
160
|
+
SELECT (search_text ILIKE '%' || $1 || '%') AS substr_hit
|
|
161
|
+
FROM ${qi(SCHEMA)}.session_summaries
|
|
162
|
+
WHERE search_text IS NOT NULL
|
|
163
|
+
ORDER BY similarity(search_text, $2) DESC
|
|
164
|
+
LIMIT 5
|
|
165
|
+
`,
|
|
166
|
+
[like, q]
|
|
167
|
+
);
|
|
168
|
+
const oldHits = oldR.rows.filter(x => x.substr_hit).length;
|
|
169
|
+
|
|
170
|
+
const newR = await pool.query(
|
|
171
|
+
`
|
|
172
|
+
SELECT (search_text ILIKE '%' || $1 || '%') AS substr_hit
|
|
173
|
+
FROM ${qi(SCHEMA)}.session_summaries
|
|
174
|
+
WHERE search_text IS NOT NULL
|
|
175
|
+
ORDER BY
|
|
176
|
+
(search_text ILIKE '%' || $1 || '%') DESC,
|
|
177
|
+
similarity(search_text, $2) DESC
|
|
178
|
+
LIMIT 5
|
|
179
|
+
`,
|
|
180
|
+
[like, q]
|
|
181
|
+
);
|
|
182
|
+
const newHits = newR.rows.filter(x => x.substr_hit).length;
|
|
183
|
+
|
|
184
|
+
const expected = Math.min(5, T);
|
|
185
|
+
console.log(
|
|
186
|
+
` ${q.padEnd(19)} | ${String(T).padStart(5)} | ${String(oldHits).padStart(3)}/5 → ${String(expected).padStart(1)}/5 ${oldHits < expected ? '✗' : '✓'} | ${String(newHits).padStart(3)}/5 ${newHits < expected ? '✗' : '✓'}`
|
|
187
|
+
);
|
|
155
188
|
}
|
|
189
|
+
console.log(' (truth = 含該字串的 row 數;ideal top-5 substr-hit = min(truth, 5))');
|
|
156
190
|
|
|
157
191
|
await pool.end();
|
|
158
192
|
console.log('\n=== 完成 ===');
|
|
159
193
|
}
|
|
160
194
|
|
|
161
|
-
|
|
195
|
+
main().catch(err => { console.error(err); process.exit(1); });
|