@shadowforge0/aquifer-memory 1.2.1 → 1.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -9
- package/consumers/cli.js +11 -1
- package/consumers/default/index.js +17 -4
- package/consumers/mcp.js +21 -0
- package/consumers/miranda/index.js +15 -4
- package/consumers/miranda/profile.json +145 -0
- package/consumers/miranda/recall-format.js +5 -3
- package/consumers/miranda/render-daily-md.js +186 -0
- package/consumers/shared/config.js +8 -0
- package/consumers/shared/factory.js +2 -1
- package/consumers/shared/llm.js +1 -1
- package/consumers/shared/recall-format.js +21 -1
- package/core/aquifer.js +693 -87
- package/core/artifacts.js +174 -0
- package/core/bundles.js +400 -0
- package/core/consolidation.js +340 -0
- package/core/decisions.js +164 -0
- package/core/entity-state.js +483 -0
- package/core/errors.js +97 -0
- package/core/handoff.js +153 -0
- package/core/insights.js +499 -0
- package/core/mcp-manifest.js +131 -0
- package/core/narratives.js +212 -0
- package/core/profiles.js +171 -0
- package/core/state.js +163 -0
- package/core/storage.js +82 -5
- package/core/timeline.js +152 -0
- package/index.js +14 -0
- package/package.json +1 -1
- package/pipeline/extract-state-changes.js +205 -0
- package/schema/001-base.sql +186 -16
- package/schema/002-entities.sql +35 -1
- package/schema/004-completion.sql +391 -0
- package/schema/005-entity-state-history.sql +87 -0
- package/schema/006-insights.sql +138 -0
- package/scripts/diagnose-fts-zh.js +37 -4
- package/scripts/drop-entity-state-history.sql +17 -0
- package/scripts/drop-insights.sql +12 -0
- package/scripts/extract-insights-from-recent-sessions.js +315 -0
- package/scripts/find-dburl-hints.js +29 -0
- package/scripts/queries.json +45 -0
- package/scripts/retro-recall-bench.js +409 -0
- package/scripts/sample-bench-queries.sql +75 -0
|
@@ -4,11 +4,15 @@
|
|
|
4
4
|
* Aquifer FTS 中文診斷
|
|
5
5
|
*
|
|
6
6
|
* 測 aquifer 實際搜尋主路徑(trigram ILIKE on search_text + similarity ranking)
|
|
7
|
-
* vs fallback 路徑(tsvector @@ plainto_tsquery(
|
|
7
|
+
* vs fallback 路徑(tsvector @@ plainto_tsquery(<cfg>, q))對中文 query 的表現。
|
|
8
|
+
* tsconfig 自動偵測:public.zhcfg 已存在就用 'zhcfg'(1.5.0+ 底層是 pg_jieba
|
|
9
|
+
* jiebaqry,1.4.0 底層是 zhparser),否則退回 'simple'。腳本會印出 zhcfg 實際
|
|
10
|
+
* parser 名稱——看到 'zhparser' 代表繁體分詞會退化 char-level。
|
|
8
11
|
*
|
|
9
12
|
* env:
|
|
10
13
|
* DATABASE_URL — required
|
|
11
14
|
* AQUIFER_SCHEMA — default 'public'
|
|
15
|
+
* AQUIFER_FTS_CONFIG — override auto-detect ('zhcfg' or 'simple')
|
|
12
16
|
* DIAGNOSE_QUERIES — comma-separated, overrides built-in set
|
|
13
17
|
*/
|
|
14
18
|
|
|
@@ -41,8 +45,37 @@ function pct(n, d) {
|
|
|
41
45
|
return `${Math.round((n / d) * 100)}%`;
|
|
42
46
|
}
|
|
43
47
|
|
|
48
|
+
// Resolve which text-search config this run should use.
// Precedence: explicit AQUIFER_FTS_CONFIG override ('zhcfg' or 'simple'),
// then a live probe for public.zhcfg (also reporting its backing parser),
// and finally 'simple' when the probe finds nothing or errors out.
async function detectFtsConfig() {
  const override = process.env.AQUIFER_FTS_CONFIG;
  if (override === 'zhcfg' || override === 'simple') {
    return { cfg: override, parser: null };
  }
  let probed;
  try {
    probed = await pool.query(`
      SELECT p.prsname AS parser
      FROM pg_ts_config c JOIN pg_ts_parser p ON c.cfgparser = p.oid
      WHERE c.cfgname = 'zhcfg' AND c.cfgnamespace = 'public'::regnamespace
      LIMIT 1`);
  } catch {
    // Probe failure (missing catalog perms, etc.) degrades to 'simple'.
    return { cfg: 'simple', parser: null };
  }
  if (probed.rowCount > 0) {
    return { cfg: 'zhcfg', parser: probed.rows[0].parser };
  }
  return { cfg: 'simple', parser: null };
}

// Effective tsconfig for the whole run; main() overwrites this at startup.
let FTS_CFG = 'simple';
|
|
66
|
+
|
|
44
67
|
async function main() {
|
|
45
|
-
|
|
68
|
+
const detected = await detectFtsConfig();
|
|
69
|
+
FTS_CFG = detected.cfg;
|
|
70
|
+
const parserLabel = detected.parser
|
|
71
|
+
? ` parser=${detected.parser}`
|
|
72
|
+
: '';
|
|
73
|
+
console.log(`=== Aquifer FTS 中文診斷 (schema=${SCHEMA}, tsconfig=${FTS_CFG}${parserLabel}) ===\n`);
|
|
74
|
+
if (detected.parser === 'zhparser') {
|
|
75
|
+
console.log('[warn] zhcfg 目前是 zhparser-backed。scws 內建字典是簡體字為主,對');
|
|
76
|
+
console.log(' 繁體字會全退 char-level 分詞(「記憶」→ 記/憶 單字,等於');
|
|
77
|
+
console.log(' simple tokenizer)。考慮換 pg_jieba,見 CHANGELOG 1.5.0。\n');
|
|
78
|
+
}
|
|
46
79
|
|
|
47
80
|
// -------------------------------------------------------------------------
|
|
48
81
|
// 0. 覆蓋率:search_text NULL 率 → 看 fallback 觸發比例
|
|
@@ -94,7 +127,7 @@ async function main() {
|
|
|
94
127
|
//
|
|
95
128
|
// Ground truth = search_text ILIKE '%q%'(所有源欄位拼出的純文字 superset)
|
|
96
129
|
// 主路徑 = search_text ILIKE(GIN trgm 加速,語意等價 ILIKE)
|
|
97
|
-
// Fallback = search_tsv @@ plainto_tsquery(
|
|
130
|
+
// Fallback = search_tsv @@ plainto_tsquery(<cfg>, q)
|
|
98
131
|
// -------------------------------------------------------------------------
|
|
99
132
|
console.log('--- 2. 主路徑(trigram)vs fallback(tsvector)binary match ---');
|
|
100
133
|
console.log(' query | truth | trgm | tsv | trgm% | tsv% | tsv-extra');
|
|
@@ -114,7 +147,7 @@ async function main() {
|
|
|
114
147
|
SELECT search_text,
|
|
115
148
|
search_tsv,
|
|
116
149
|
(search_text ILIKE '%' || $1 || '%') AS trgm_hit,
|
|
117
|
-
(search_tsv @@ plainto_tsquery('
|
|
150
|
+
(search_tsv @@ plainto_tsquery('${FTS_CFG}', $2)) AS tsv_hit
|
|
118
151
|
FROM ${qi(SCHEMA)}.session_summaries
|
|
119
152
|
WHERE search_text IS NOT NULL
|
|
120
153
|
)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
-- DROP-clean script for entity_state_history (Q3 bitter-lesson escape hatch).
--
-- Run this if you decide native long-context / agentic memory has obviated the
-- temporal state-change layer. Removes the table and all dependent indexes;
-- nothing else in Aquifer references it directly (FK is one-way: this table
-- references entities/sessions, not the reverse).
--
-- Usage:
--   psql $DATABASE_URL -v schema=miranda -f scripts/drop-entity-state-history.sql

-- CASCADE drops the table's indexes and any dependent objects in one shot.
DROP TABLE IF EXISTS :"schema".entity_state_history CASCADE;

-- Verify nothing remains. to_regclass() returns NULL when the name no longer
-- resolves to a relation in the target schema.
SELECT to_regclass(:'schema' || '.entity_state_history') AS table_after_drop;
SELECT to_regclass(:'schema' || '.idx_entity_state_history_current') AS idx_current_after_drop;
SELECT to_regclass(:'schema' || '.idx_entity_state_history_idempotency') AS idx_idempotency_after_drop;
-- All three should report NULL.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
-- DROP-clean script for insights (Q4 bitter-lesson escape hatch).
--
-- Removes the table and all dependent indexes. Nothing else in Aquifer
-- references it directly, so DROP CASCADE is safe and complete.
--
-- Usage:
--   psql $DATABASE_URL -v schema=miranda -f scripts/drop-insights.sql

DROP TABLE IF EXISTS :"schema".insights CASCADE;

-- Verify nothing remains. to_regclass() returns NULL when the name no longer
-- resolves to a relation in the target schema.
SELECT to_regclass(:'schema' || '.insights') AS table_after_drop;
SELECT to_regclass(:'schema' || '.idx_insights_active') AS idx_active_after_drop;
SELECT to_regclass(:'schema' || '.idx_insights_embedding') AS idx_embedding_after_drop;
-- All three should report NULL.
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Extract insights from recent sessions and commit them via aquifer.insights.
|
|
6
|
+
*
|
|
7
|
+
* Designed for cron: pulls the last N days of session_summaries for one
|
|
8
|
+
* agent, sends a single LLM call to distil higher-order insights, writes
|
|
9
|
+
* them to the insights table.
|
|
10
|
+
*
|
|
11
|
+
* This is "Route B" from spec.md Q4 — bypasses the cron prompt JSON-parse
|
|
12
|
+
* fragility and lets us own the LLM call + write atomically.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* node scripts/extract-insights-from-recent-sessions.js \
|
|
16
|
+
* --agent main \
|
|
17
|
+
* [--days 14] \
|
|
18
|
+
* [--max-sessions 50] \
|
|
19
|
+
* [--types preference,pattern,frustration,workflow] \
|
|
20
|
+
* [--schema miranda] \
|
|
21
|
+
* [--tenant-id default] \
|
|
22
|
+
* [--dry-run]
|
|
23
|
+
*
|
|
24
|
+
* env:
|
|
25
|
+
* DATABASE_URL required
|
|
26
|
+
* EMBED_PROVIDER recommended (vector recall otherwise won't work)
|
|
27
|
+
* AQUIFER_LLM_PROVIDER required (extraction LLM)
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
const { Pool } = require('pg');
|
|
31
|
+
const { spawn } = require('node:child_process');
|
|
32
|
+
const aquiferIndex = require('..');
|
|
33
|
+
const { createEmbedder } = require('..');
|
|
34
|
+
const { resolveLlmFn } = require('../consumers/shared/llm-autodetect');
|
|
35
|
+
|
|
36
|
+
// Optional adapter: spawn the `claude` CLI (Claude Code) for extraction.
|
|
37
|
+
// Toggled by AQUIFER_INSIGHTS_CLI=claude. Uses OAuth from the user's
|
|
38
|
+
// keychain (do NOT set --bare, which disables OAuth). Returns a function
|
|
39
|
+
// with the same contract as resolveLlmFn's output: (prompt) => text.
|
|
40
|
+
// Optional adapter: spawn the `claude` CLI (Claude Code) for extraction.
// Toggled by AQUIFER_INSIGHTS_CLI=claude. Uses OAuth from the user's
// keychain (do NOT set --bare, which disables OAuth). Returns a function
// with the same contract as resolveLlmFn's output: (prompt) => text.
//
// env keys read: AQUIFER_INSIGHTS_CLI_MODEL (default 'opus'),
// AQUIFER_INSIGHTS_CLI_BIN (default 'claude'),
// AQUIFER_INSIGHTS_CLI_TIMEOUT_MS (default 600000).
function createClaudeCliFn(env) {
  const model = env.AQUIFER_INSIGHTS_CLI_MODEL || 'opus';
  const bin = env.AQUIFER_INSIGHTS_CLI_BIN || 'claude';
  const timeoutMs = parseInt(env.AQUIFER_INSIGHTS_CLI_TIMEOUT_MS || '600000', 10);
  return function llmFn(prompt) {
    return new Promise((resolve, reject) => {
      // Fix: spawn with the injected `env`, not ambient process.env — the
      // function's contract is that configuration comes from its argument.
      // The existing caller passes process.env, so its behaviour is
      // unchanged, but tests/other callers can now inject an environment.
      const child = spawn(bin, ['-p', '--model', model, '--output-format', 'text'], {
        stdio: ['pipe', 'pipe', 'pipe'],
        env,
      });
      let stdout = '';
      let stderr = '';
      // Hard kill on timeout; the subsequent 'exit' reject is a no-op
      // because the promise has already settled.
      const timer = setTimeout(() => {
        child.kill('SIGKILL');
        reject(new Error(`[extract-insights] claude cli timeout after ${timeoutMs}ms`));
      }, timeoutMs);
      child.stdout.on('data', d => { stdout += d.toString('utf8'); });
      child.stderr.on('data', d => { stderr += d.toString('utf8'); });
      child.on('error', e => { clearTimeout(timer); reject(e); });
      child.on('exit', code => {
        clearTimeout(timer);
        if (code === 0) return resolve(stdout);
        reject(new Error(`[extract-insights] claude cli exit ${code}: ${stderr.slice(0, 800)}`));
      });
      // Prompt goes over stdin; closing it lets the CLI start processing.
      child.stdin.end(prompt);
    });
  };
}
|
|
67
|
+
|
|
68
|
+
// Parse CLI flags into a plain options object. Value-taking flags consume
// the next argv entry; unknown flags are ignored. Defaults for schema and
// tenant come from AQUIFER_SCHEMA / AQUIFER_TENANT_ID when set.
function parseArgs(argv) {
  const args = {
    agent: null,
    days: 14,
    maxSessions: 50,
    types: ['preference', 'pattern', 'frustration', 'workflow'],
    schema: process.env.AQUIFER_SCHEMA || 'miranda',
    tenantId: process.env.AQUIFER_TENANT_ID || 'default',
    dryRun: false,
  };
  let i = 0;
  while (i < argv.length) {
    const flag = argv[i];
    const value = argv[i + 1];
    switch (flag) {
      case '--agent':
        args.agent = value; i += 2; break;
      case '--days':
        args.days = parseInt(value, 10); i += 2; break;
      case '--max-sessions':
        args.maxSessions = parseInt(value, 10); i += 2; break;
      case '--types':
        args.types = value.split(',').map(t => t.trim()); i += 2; break;
      case '--schema':
        args.schema = value; i += 2; break;
      case '--tenant-id':
        args.tenantId = value; i += 2; break;
      case '--dry-run':
        args.dryRun = true; i += 1; break;
      case '-h':
      case '--help':
        args.help = true; i += 1; break;
      default:
        i += 1;
    }
  }
  return args;
}
|
|
91
|
+
|
|
92
|
+
// Build the single-shot extraction prompt: a per-session digest of the whole
// window followed by distillation instructions and a strict JSON output
// contract (the caller parses the reply with extractJsonBlock()).
// `sessions` rows come from the session_summaries query in main();
// `types` is the allow-list that also gates commit in the write loop.
function buildExtractionPrompt(sessions, types) {
  // One "### Session ..." digest per row: title + overview, falling back to
  // the first 80 chars of summary_text when no structured summary exists.
  const sessionsBlock = sessions.map(s => {
    const summary = typeof s.structured_summary === 'object' ? s.structured_summary : {};
    const title = summary.title || s.summary_text?.slice(0, 80) || '(untitled)';
    const overview = summary.overview || s.summary_text || '';
    return `### Session ${s.session_id} (${s.started_at})\n${title}\n${overview}`;
  }).join('\n\n');

  const typesList = types.join(' | ');

  return `You distill HIGHER-ORDER INSIGHTS from a window of past sessions.
NOT individual facts (those go to entity_state_history). NOT raw recap.
Insights are stable observations about how the user works, what they prefer,
where they get stuck, and which workflows succeed.

Aim for 6-12 insights when the window has >50 sessions and >=3 distinct
themes. Returning only 2-3 on a rich window means you're under-extracting.
Returning 0 is only correct when the window is genuinely sparse.

## Insight types
- preference: stable user preference (e.g. "MK prefers terse responses with no trailing summaries")
- pattern: recurring behaviour or decision (e.g. "MK runs /develop before any non-trivial schema change")
- frustration: repeated pain point (e.g. "Cron jobs.json prompt parse keeps breaking on minor LLM output drift")
- workflow: reusable procedure that worked (e.g. "Aquifer release: pack tarball -> bump gateway pkg -> migrate -> restart")

## What to look for — don't just describe incidents

Technical bug patterns (timeouts, drift, regressions) are easy to spot but
shallow. The *high-value* insights are META-LEVEL signals about how the user
operates that you'd only see by reading MULTIPLE sessions back-to-back:

- **Behavioural preferences the user re-states or re-enforces.** If the user
  corrects the agent's tone, format, or process more than once across
  sessions (e.g. "stop using bullet lists", "查歷史再動手", "不要客套"),
  that's a preference worth recording. These directly shape how the agent
  should behave next time — importance 0.85-0.95.
- **Discipline gaps the user flags repeatedly.** Things like "未驗證就回答",
  "未查 context 就動手", "重複早上做過的事" are frustration insights about
  the agent's own behaviour, not about external systems. These are the
  highest-leverage insights because they prevent future trust erosion.
- **Decision-style signatures.** How the user makes calls under ambiguity:
  "prefer direct over indirect routing", "拔掉不再用的 infra 不留以後可能用",
  "選穩定版不追最新". These are rarely stated once but emerge as a shape
  across many sessions.
- **Workflows that succeeded AND the scaffolding that made them succeed.**
  Not just "user did X", but "user's X works because of Y precondition".

If you only surface technical bug frustrations and miss the meta-level
behavioural signal, you've failed at this task — a shallow extractor would
do the same.

## Strict rules
1. Insights must be TRUE ACROSS MULTIPLE SESSIONS (>=2). One-off events don't count.
2. title: <= 80 chars, declarative. The display surface — can be colourful.
3. canonicalClaim: <= 80 chars, DECLARATIVE AND STABLE. The *identity* of this
   insight. No rhetoric, no examples, no time words, no emphasis. If the same
   underlying claim shows up under a different title next run, canonicalClaim
   should be identical. Example: canonicalClaim="mk prefers prose over bullet
   lists", while title could be "散文段落,禁 bullet" or "prose-only formatting".
4. entities: array of proper-noun subjects the claim is ABOUT. Tool names,
   project names, persona names, components. Empty array [] is valid when the
   claim is generic. Example: ["Aquifer", "insights-cron"] or ["Claude Code"].
5. body: 2-4 sentences. Cite the pattern AND the root cause or user motivation,
   not just restate facts.
6. importance: 0..1.
   - 0.85-0.95: meta-level preferences + discipline gaps that directly change
     how the agent should behave (highest leverage — these go here, not lower).
   - 0.65-0.80: stable technical patterns / workflows.
   - 0.45-0.60: useful but lower-leverage observations.
   Don't bunch everything in 0.70-0.85 out of caution — spread the scale.
7. sourceSessionIds: list every session_id that contributes evidence.
   >=2 required; >=3 strongly preferred for meta-level insights.
8. type must be one of: ${typesList}.
9. Do NOT output {"insights":[]} just because you're uncertain on individual
   items. Extract what has clear evidence; omit only what lacks it.

## Output
Single JSON object, no prose, no fence:
{
  "insights": [
    {
      "type": "preference|pattern|frustration|workflow",
      "title": "...",
      "canonicalClaim": "...",
      "entities": ["..."],
      "body": "...",
      "importance": 0.7,
      "sourceSessionIds": ["sess_a", "sess_b"]
    }
  ]
}

## Sessions in window
${sessionsBlock}
`;
}
|
|
188
|
+
|
|
189
|
+
// Best-effort JSON recovery from LLM output: unwrap an optional ``` fence
// (with or without a "json" tag), then parse the outermost {...} span.
// Returns the parsed object, or null when nothing parseable is present.
function extractJsonBlock(text) {
  if (typeof text !== 'string' || !text) return null;
  let candidate = text.trim();
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(candidate);
  if (fenced) candidate = fenced[1].trim();
  const open = candidate.indexOf('{');
  const close = candidate.lastIndexOf('}');
  if (open < 0 || close < open) return null;
  try {
    return JSON.parse(candidate.slice(open, close + 1));
  } catch {
    return null;
  }
}
|
|
198
|
+
|
|
199
|
+
// Orchestrate one extraction run: load the session window from Postgres,
// make a single LLM call, parse its JSON reply, and commit each insight via
// aquifer.insights.commitInsight.
// Exit codes: 0 = success or empty window; 1 = runtime failure (LLM call or
// malformed output); 2 = usage/configuration error.
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help || !args.agent) {
    console.error('Usage: --agent <id> [--days 14] [--max-sessions 50] [--types ...] [--dry-run]');
    process.exit(args.help ? 0 : 2);
  }

  const dbUrl = process.env.DATABASE_URL || process.env.AQUIFER_DB_URL;
  if (!dbUrl) { console.error('DATABASE_URL is required'); process.exit(2); }

  const pool = new Pool({ connectionString: dbUrl });

  // LLM backend: claude CLI when AQUIFER_INSIGHTS_CLI=claude, otherwise an
  // API provider resolved from env (resolveLlmFn yields a falsy value when
  // no provider/key is configured — presumably null; verify in llm-autodetect).
  const useCli = (process.env.AQUIFER_INSIGHTS_CLI || '').toLowerCase() === 'claude';
  const llmFn = useCli
    ? createClaudeCliFn(process.env)
    : resolveLlmFn(null, process.env);
  if (!llmFn) { console.error('AQUIFER_LLM_PROVIDER + key required (or set AQUIFER_INSIGHTS_CLI=claude)'); process.exit(2); }
  console.log('[extract-insights] llm backend:', useCli ? `claude cli (${process.env.AQUIFER_INSIGHTS_CLI_MODEL || 'opus'})` : 'api provider');

  // Identifiers (schema) cannot be bind parameters, so quote them manually;
  // every user-supplied VALUE still goes through $1..$4 binds.
  const qi = (s) => `"${String(s).replace(/"/g, '""')}"`;
  const sessionsRes = await pool.query(
    `SELECT s.session_id, s.started_at, ss.summary_text, ss.structured_summary
     FROM ${qi(args.schema)}.sessions s
     JOIN ${qi(args.schema)}.session_summaries ss ON ss.session_row_id = s.id
     WHERE s.tenant_id = $1
       AND s.agent_id = $2
       AND s.started_at >= now() - ($3 || ' days')::interval
       AND ss.summary_text IS NOT NULL
     ORDER BY s.started_at DESC
     LIMIT $4`,
    [args.tenantId, args.agent, String(args.days), args.maxSessions]
  );

  const sessions = sessionsRes.rows;
  console.log(`[extract-insights] ${sessions.length} sessions in last ${args.days}d for agent=${args.agent}`);
  if (sessions.length === 0) {
    console.log('[extract-insights] nothing to do, exiting clean');
    await pool.end();
    return;
  }

  // Single LLM call over the whole window; a failure here is fatal (exit 1).
  const prompt = buildExtractionPrompt(sessions, args.types);
  console.log('[extract-insights] sending to LLM...');
  let raw;
  try {
    raw = await llmFn(prompt);
  } catch (e) {
    console.error('[extract-insights] llm call failed:', e.message);
    await pool.end();
    process.exit(1);
  }

  // Dump the raw reply on parse failure so the cron log keeps the evidence.
  const parsed = extractJsonBlock(raw);
  if (!parsed || !Array.isArray(parsed.insights)) {
    console.error('[extract-insights] malformed LLM output, dumping raw:\n', raw);
    await pool.end();
    process.exit(1);
  }
  console.log(`[extract-insights] ${parsed.insights.length} insights returned`);

  // --dry-run: print the parsed insights and stop before any write.
  if (args.dryRun) {
    console.log(JSON.stringify(parsed.insights, null, 2));
    await pool.end();
    return;
  }

  // Build embedFn (optional — without it insights still write but recall via
  // semantic query won't work).
  let embedFn = null;
  try {
    const e = createEmbedder({});
    embedFn = (texts) => e.embedBatch(texts);
  } catch {
    console.warn('[extract-insights] embed unavailable, insights will save without vector index entries');
  }

  const aquifer = aquiferIndex.createAquifer({
    db: pool,
    schema: args.schema,
    tenantId: args.tenantId,
    embed: embedFn ? { fn: embedFn } : undefined,
  });

  // Window = oldest..newest source session timestamp (fallback to now).
  const sortedTimes = sessions.map(s => new Date(s.started_at)).sort((a, b) => a - b);
  const windowFrom = sortedTimes[0]?.toISOString() || new Date().toISOString();
  const windowTo = sortedTimes[sortedTimes.length - 1]?.toISOString() || new Date().toISOString();

  // Commit sequentially so per-item failures are isolated and counted.
  let written = 0, duplicates = 0, failed = 0;
  for (const ins of parsed.insights) {
    // Reject items missing a type or carrying a type outside the allow-list.
    if (!ins || !ins.type || !args.types.includes(ins.type)) { failed++; continue; }
    const r = await aquifer.insights.commitInsight({
      agentId: args.agent,
      type: ins.type,
      title: ins.title,
      canonicalClaim: typeof ins.canonicalClaim === 'string' ? ins.canonicalClaim : undefined,
      entities: Array.isArray(ins.entities) ? ins.entities : [],
      body: ins.body,
      sourceSessionIds: Array.isArray(ins.sourceSessionIds) ? ins.sourceSessionIds : [],
      evidenceWindow: { from: windowFrom, to: windowTo },
      importance: ins.importance,
      metadata: { extractor: 'extract-insights-from-recent-sessions', windowDays: args.days },
    });
    if (!r.ok) { failed++; console.warn(` fail ${ins.type}: ${r.error.code} ${r.error.message}`); }
    else if (r.data.duplicate) { duplicates++; console.log(` dup ${ins.type}: ${ins.title}`); }
    else { written++; console.log(` ok ${ins.type} (id=${r.data.insight.id}): ${ins.title}`); }
  }
  console.log(`[extract-insights] written=${written} dup=${duplicates} failed=${failed}`);

  // Best-effort teardown; errors during close should not flip the exit code.
  await aquifer.close?.().catch(() => {});
  await pool.end().catch(() => {});
}

main().catch(err => {
  console.error('[extract-insights] fatal:', err.stack || err.message);
  process.exit(1);
});
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
// Scan a directory tree for files that mention a database URL or the env
// vars that carry one — a quick audit for credential-bearing hints.
// Usage: node scripts/find-dburl-hints.js [root]   (defaults to cwd)
const fs = require('fs');
const path = require('path');

const root = process.argv[2] || process.cwd();
const exts = new Set(['.md', '.js', '.json', '.yaml', '.yml', '.sh', '.sql', '.txt']);
// Fix: the original kept /g regexes and reused them with .test(). A global
// regex advances its lastIndex between calls, so the whole-file check and
// the per-line check both silently skipped matches. All four needles are
// literal strings, so stateless substring checks are correct and simpler.
const needles = ['DATABASE_URL', 'AQUIFER_DB_URL', 'postgresql://', 'postgres://'];

// Depth-first list of every file under dir, skipping VCS and dependency dirs.
function walk(dir, out = []) {
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    if (entry.name === 'node_modules' || entry.name === '.git') continue;
    const p = path.join(dir, entry.name);
    if (entry.isDirectory()) walk(p, out);
    else out.push(p);
  }
  return out;
}

for (const f of walk(root)) {
  if (!exts.has(path.extname(f))) continue;
  let txt;
  // Unreadable files (permissions, races) are skipped, not fatal.
  try { txt = fs.readFileSync(f, 'utf8'); } catch { continue; }
  // Cheap whole-file check first; only line-split files that can match.
  if (!needles.some(n => txt.includes(n))) continue;
  const lines = txt.split('\n');
  lines.forEach((line, i) => {
    if (needles.some(n => line.includes(n))) {
      console.log(`${path.relative(root, f)}:${i + 1}: ${line}`);
    }
  });
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"queries": [
|
|
4
|
+
{
|
|
5
|
+
"id": "q-001",
|
|
6
|
+
"lang": "en",
|
|
7
|
+
"text": "How do I set up Aquifer memory storage with PostgreSQL?"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"id": "q-002",
|
|
11
|
+
"lang": "en",
|
|
12
|
+
"text": "What is the difference between memory_search and session_recall in Aquifer?"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"id": "q-003",
|
|
16
|
+
"lang": "zh",
|
|
17
|
+
"text": "Aquifer 的 session recall 是怎麼做 hybrid 檢索的?"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "q-004",
|
|
21
|
+
"lang": "zh",
|
|
22
|
+
"text": "為什麼 zhcfg 會依賴 jieba 或 zhparser?"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "q-005",
|
|
26
|
+
"lang": "mixed",
|
|
27
|
+
"text": "How to debug fts-zhcfg pipeline 在 jieba migration 後失敗的問題?"
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": "q-006",
|
|
31
|
+
"lang": "mixed",
|
|
32
|
+
"text": "memory_search 找不到結果時,應該先看哪個 log 或 table?"
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"id": "q-007",
|
|
36
|
+
"lang": "en",
|
|
37
|
+
"text": "How does hybrid-rerank differ from hybrid mode in retro recall bench?"
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"id": "q-008",
|
|
41
|
+
"lang": "zh",
|
|
42
|
+
"text": "Aquifer 初始化後要如何驗證 embeddings pipeline 有正常工作?"
|
|
43
|
+
}
|
|
44
|
+
]
|
|
45
|
+
}
|