@shadowforge0/aquifer-memory 1.5.9 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.env.example +23 -0
  2. package/README.md +96 -73
  3. package/README_CN.md +659 -0
  4. package/README_TW.md +680 -0
  5. package/aquifer.config.example.json +34 -0
  6. package/consumers/claude-code.js +11 -11
  7. package/consumers/cli.js +374 -39
  8. package/consumers/codex-handoff.js +152 -0
  9. package/consumers/codex.js +1549 -0
  10. package/consumers/default/daily-entries.js +23 -4
  11. package/consumers/default/index.js +2 -2
  12. package/consumers/default/prompts/summary.js +6 -6
  13. package/consumers/mcp.js +131 -7
  14. package/consumers/openclaw-ext/index.js +0 -1
  15. package/consumers/openclaw-plugin.js +44 -4
  16. package/consumers/shared/config.js +28 -0
  17. package/consumers/shared/factory.js +2 -0
  18. package/consumers/shared/ingest.js +1 -1
  19. package/consumers/shared/normalize.js +14 -3
  20. package/consumers/shared/recall-format.js +53 -0
  21. package/consumers/shared/summary-parser.js +151 -0
  22. package/core/aquifer.js +384 -18
  23. package/core/finalization-review.js +319 -0
  24. package/core/insights.js +210 -58
  25. package/core/mcp-manifest.js +69 -2
  26. package/core/memory-bootstrap.js +188 -0
  27. package/core/memory-consolidation.js +1236 -0
  28. package/core/memory-promotion.js +544 -0
  29. package/core/memory-recall.js +247 -0
  30. package/core/memory-records.js +581 -0
  31. package/core/memory-safety-gate.js +224 -0
  32. package/core/session-finalization.js +350 -0
  33. package/core/storage.js +456 -2
  34. package/docs/getting-started.md +99 -0
  35. package/docs/postprocess-contract.md +2 -2
  36. package/docs/setup.md +51 -2
  37. package/package.json +31 -9
  38. package/pipeline/normalize/adapters/codex.js +106 -0
  39. package/pipeline/normalize/detect.js +3 -2
  40. package/schema/001-base.sql +3 -0
  41. package/schema/007-v1-foundation.sql +273 -0
  42. package/schema/008-session-finalizations.sql +50 -0
  43. package/schema/009-v1-assertion-plane.sql +193 -0
  44. package/schema/010-v1-finalization-review.sql +160 -0
  45. package/schema/011-v1-compaction-claim.sql +46 -0
  46. package/schema/012-v1-compaction-lease.sql +39 -0
  47. package/schema/013-v1-compaction-lineage.sql +193 -0
  48. package/scripts/backfill-canonical-key.js +250 -0
  49. package/scripts/codex-recovery.js +532 -0
  50. package/consumers/miranda/context-inject.js +0 -119
  51. package/consumers/miranda/daily-entries.js +0 -224
  52. package/consumers/miranda/index.js +0 -364
  53. package/consumers/miranda/instance.js +0 -55
  54. package/consumers/miranda/llm.js +0 -99
  55. package/consumers/miranda/profile.json +0 -145
  56. package/consumers/miranda/prompts/summary.js +0 -303
  57. package/consumers/miranda/recall-format.js +0 -76
  58. package/consumers/miranda/render-daily-md.js +0 -186
  59. package/consumers/miranda/workspace-files.js +0 -91
  60. package/scripts/drop-entity-state-history.sql +0 -17
  61. package/scripts/drop-insights.sql +0 -12
  62. package/scripts/install-openclaw.sh +0 -59
  63. package/scripts/queries.json +0 -45
  64. package/scripts/retro-recall-bench.js +0 -409
  65. package/scripts/sample-bench-queries.sql +0 -75
@@ -1,409 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
- /**
5
- * Retro recall bench — runs the same query set across 6 pipelines and
6
- * reports nDCG@5 / MRR / latency / empty-rate. Designed for the post-1.3.0
7
- * Phase 0 audit; see ~/.claude/develop-runs/20260419-142432-aquifer-memory-routes/spec.md.
8
- *
9
- * Pipelines:
10
- * fts-simple storage.searchSessions(ftsConfig='simple')
11
- * fts-zhcfg storage.searchSessions(ftsConfig='zhcfg') [skip if zhcfg missing]
12
- * summary-vector storage.searchSummaryEmbeddings
13
- * turn-only storage.searchTurnEmbeddings
14
- * hybrid aquifer.recall(mode='hybrid', rerank disabled)
15
- * hybrid-rerank aquifer.recall(mode='hybrid', rerank forced)
16
- *
17
- * Usage:
18
- * node scripts/retro-recall-bench.js \
19
- * --query-set queries.json \
20
- * [--judgements judgements.json] \
21
- * [--output report.json] \
22
- * [--markdown summary.md] \
23
- * [--pipelines fts-simple,fts-zhcfg,summary-vector,turn-only,hybrid,hybrid-rerank] \
24
- * [--limit 5] [--warmup 1] [--schema miranda] [--tenant-id default] [--rerank-topk 20]
25
- *
26
- * env:
27
- * DATABASE_URL required
28
- * AQUIFER_SCHEMA default 'miranda'
29
- * EMBED_PROVIDER + key required for vector pipelines
30
- * AQUIFER_LLM_PROVIDER unused here (no enrich)
31
- * OPENROUTER_API_KEY required for hybrid-rerank pipeline
32
- */
33
-
34
- const fs = require('fs');
35
- const { Pool } = require('pg');
36
- const aquiferIndex = require('..');
37
- const storage = require('../core/storage');
38
- const { createEmbedder } = require('..');
39
-
40
/** Names of every pipeline the bench can run; also the default `--pipelines` value. */
const ALL_PIPELINES = [
  'fts-simple', 'fts-zhcfg',
  'summary-vector', 'turn-only',
  'hybrid', 'hybrid-rerank',
];
48
-
49
/**
 * Parse the bench CLI flags into an options object.
 *
 * Defaults: every pipeline in ALL_PIPELINES, limit 5, one warmup run and a
 * rerank top-K of 20. Schema and tenant come from AQUIFER_SCHEMA /
 * AQUIFER_TENANT_ID, falling back to 'miranda' / 'default'. Tokens that are
 * not recognised flags are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed options (querySet, judgements, output, markdown,
 *   pipelines, limit, warmup, schema, tenantId, rerankTopK and, when asked
 *   for, help).
 * @throws {Error} When `--pipelines` has no value or a numeric flag is not an
 *   integer. Previously these produced an opaque TypeError or a silent NaN.
 */
function parseArgs(argv) {
  const args = {
    querySet: null,
    judgements: null,
    output: null,
    markdown: null,
    pipelines: ALL_PIPELINES,
    limit: 5,
    warmup: 1,
    schema: process.env.AQUIFER_SCHEMA || 'miranda',
    tenantId: process.env.AQUIFER_TENANT_ID || 'default',
    rerankTopK: 20,
  };

  // Reject NaN up front: it would otherwise flow into query limits unnoticed.
  const toInteger = (flag, raw) => {
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
      throw new Error(`${flag} expects an integer, got "${String(raw)}"`);
    }
    return parsed;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    const value = argv[i + 1];
    switch (flag) {
      case '--query-set': args.querySet = value; i++; break;
      case '--judgements': args.judgements = value; i++; break;
      case '--output': args.output = value; i++; break;
      case '--markdown': args.markdown = value; i++; break;
      case '--pipelines':
        if (typeof value !== 'string') {
          throw new Error('--pipelines expects a comma-separated list of pipeline names');
        }
        args.pipelines = value.split(',').map((s) => s.trim()).filter(Boolean);
        i++;
        break;
      case '--limit': args.limit = toInteger(flag, value); i++; break;
      case '--warmup': args.warmup = toInteger(flag, value); i++; break;
      case '--schema': args.schema = value; i++; break;
      case '--tenant-id': args.tenantId = value; i++; break;
      case '--rerank-topk': args.rerankTopK = toInteger(flag, value); i++; break;
      case '-h':
      case '--help':
        args.help = true;
        break;
      default:
        // Unknown tokens are ignored, as before.
        break;
    }
  }
  return args;
}
79
-
80
/**
 * Print the usage text, which is this file's leading header comment.
 *
 * The header is located by its closing comment line rather than a fixed line
 * count: the previous fixed slice of 36 lines ran past the end of the comment
 * and printed the first `require` statements as part of the help.
 */
function printHelp() {
  const sourceLines = fs.readFileSync(__filename, 'utf8').split('\n');
  const closerIndex = sourceLines.findIndex((line) => line.trim() === '*/');
  // Keep the old 36-line window only if no comment terminator is found.
  const lineCount = closerIndex < 0 ? 36 : closerIndex + 1;
  console.log(sourceLines.slice(0, lineCount).join('\n'));
}
83
-
84
/**
 * Ask Postgres which of the 'simple' and 'zhcfg' text-search configurations exist.
 *
 * @param {{ query: Function }} pool - Object exposing a promise-returning `query(sql)`.
 * @returns {Promise<Set<string>>} The `cfgname` values found.
 */
async function detectFtsConfigsAvailable(pool) {
  const { rows } = await pool.query(`SELECT cfgname FROM pg_ts_config WHERE cfgname IN ('simple','zhcfg')`);
  const found = new Set();
  for (const { cfgname } of rows) {
    found.add(cfgname);
  }
  return found;
}
88
-
89
/**
 * Run `fn`, timing it with the high-resolution clock and capturing any failure.
 *
 * Never rejects: a thrown or rejected error is returned in `error`, and
 * `result` is then left undefined.
 *
 * @param {Function} fn - Zero-argument function, sync or async.
 * @returns {Promise<{latencyMs: number, result: *, error: *}>}
 */
async function withLatency(fn) {
  const startedAt = process.hrtime.bigint();
  let value;
  let failure = null;
  try {
    value = await fn();
  } catch (caught) {
    failure = caught;
  }
  const elapsedNs = process.hrtime.bigint() - startedAt;
  return {
    latencyMs: Number(elapsedNs) / 1e6, // nanoseconds -> milliseconds
    result: value,
    error: failure,
  };
}
96
-
97
/**
 * Convert raw storage rows into the bench's common hit shape.
 *
 * Accepts snake_case or camelCase id fields. Rank is the 1-based position in
 * `rows`, and missing numeric fields become null.
 *
 * @param {object[]} rows - Rows as returned by a storage search.
 * @param {string} scoreKey - Row property to report as `score`.
 * @returns {object[]} One hit per row, in the same order.
 */
function normalizeHits(rows, scoreKey) {
  const toHit = (row, position) => {
    const { distance, turn_distance: turnDistance } = row;
    return {
      rank: position + 1,
      sessionId: row.session_id || row.sessionId,
      sessionRowId: row.id || row.session_row_id || row.sessionRowId || null,
      agentId: row.agent_id || row.agentId || null,
      score: row[scoreKey] ?? null,
      summaryDistance: distance ?? null,
      turnDistance: turnDistance ?? null,
    };
  };
  return rows.map(toHit);
}
108
-
109
/**
 * Convert `aquifer.recall` results into the bench's common hit shape.
 *
 * The row's `_debug.hybridScore` is reported under `summaryDistance`, and the
 * rerank flags are copied from `_debug` when present.
 *
 * @param {object[]} rows - Results from `aquifer.recall`.
 * @returns {object[]} One hit per row, in the same order.
 */
function normalizeAquiferHits(rows) {
  const hits = [];
  rows.forEach((row, position) => {
    const debug = row._debug ?? {};
    hits.push({
      rank: position + 1,
      sessionId: row.sessionId,
      sessionRowId: null,
      agentId: row.agentId,
      score: row.score ?? null,
      summaryDistance: debug.hybridScore ?? null,
      turnDistance: null,
      rerankApplied: debug.rerankApplied ?? false,
      rerankReason: debug.rerankReason ?? null,
    });
  });
  return hits;
}
122
-
123
/**
 * Execute one named pipeline for one query.
 *
 * @param {string} name - Pipeline name ('fts-simple', 'fts-zhcfg',
 *   'summary-vector', 'turn-only', 'hybrid' or 'hybrid-rerank').
 * @param {object} ctx - Shared context: pool, schema, tenantId, limit,
 *   queryVec, aquifer and ftsAvailable.
 * @param {object} query - Query; `text` and `agentId` are read from it.
 * @returns {Promise<object>} Either `{ skipped: true, reason }` when a
 *   prerequisite is missing or the name is unknown, or the `withLatency`
 *   envelope whose `result` holds the normalized hits.
 */
async function runPipeline(name, ctx, query) {
  const { pool, schema, tenantId, limit, queryVec, aquifer, ftsAvailable } = ctx;
  const skip = (reason) => ({ skipped: true, reason });

  // Full-text search under a given tsconfig; skipped when the config is absent.
  const fullText = (ftsConfig, missingReason) => {
    if (!ftsAvailable.has(ftsConfig)) return skip(missingReason);
    return withLatency(async () => {
      const rows = await storage.searchSessions(pool, query.text, {
        schema, tenantId, agentId: query.agentId, limit, ftsConfig,
      });
      return normalizeHits(rows, 'fts_rank');
    });
  };

  // Vector search through the named storage method; needs a query embedding.
  const vector = (searchMethod, scoreKey) => {
    if (!queryVec) return skip('no embed provider');
    return withLatency(async () => {
      const { rows } = await storage[searchMethod](pool, {
        schema, tenantId, queryVec, agentId: query.agentId, limit,
      });
      return normalizeHits(rows, scoreKey);
    });
  };

  // Hybrid recall through the aquifer facade, with rerank switched on or off.
  const hybrid = (rerank) => withLatency(async () => {
    const rows = await aquifer.recall(query.text, {
      agentId: query.agentId, limit, mode: 'hybrid', rerank,
    });
    return normalizeAquiferHits(rows);
  });

  if (name === 'fts-simple') return fullText('simple', 'simple tsconfig missing');
  if (name === 'fts-zhcfg') return fullText('zhcfg', 'zhcfg tsconfig missing — install zhparser');
  if (name === 'summary-vector') return vector('searchSummaryEmbeddings', 'distance');
  if (name === 'turn-only') return vector('searchTurnEmbeddings', 'turn_distance');
  if (name === 'hybrid') return hybrid(false);
  if (name === 'hybrid-rerank') return hybrid(true);
  return skip(`unknown pipeline ${name}`);
}
182
-
183
/**
 * Discounted cumulative gain: sum of (2^rel - 1) / log2(position + 1) over
 * the list, with positions counted from 1.
 *
 * @param {number[]} rels - Graded relevances in ranked order.
 * @returns {number} The DCG; 0 for an empty list.
 */
function dcg(rels) {
  let total = 0;
  rels.forEach((rel, index) => {
    const gain = Math.pow(2, rel) - 1;
    const discount = Math.log2(index + 2);
    total += gain / discount;
  });
  return total;
}
186
-
187
/**
 * Normalized DCG over the first `k` relevances.
 *
 * The ideal ordering is built by sorting this same list in descending order,
 * so the score measures how well the returned hits are ordered among
 * themselves.
 *
 * @param {number[]} judgedHits - Relevance of each returned hit, in rank order.
 * @param {number} k - Cutoff.
 * @returns {number|null} nDCG@k, or null when the ideal DCG is 0 (no relevant hit).
 */
function nDcgAtK(judgedHits, k) {
  const bestOrder = judgedHits.slice().sort((left, right) => right - left);
  const idealDcg = dcg(bestOrder.slice(0, k));
  if (idealDcg === 0) {
    return null;
  }
  const actualDcg = dcg(judgedHits.slice(0, k));
  return actualDcg / idealDcg;
}
194
-
195
/**
 * Reciprocal rank of the first hit whose relevance is above zero.
 *
 * @param {number[]} judgedHits - Relevance of each returned hit, in rank order.
 * @returns {number} 1 / rank of the first relevant hit, or 0 when there is none.
 */
function reciprocalRank(judgedHits) {
  for (let position = 0; position < judgedHits.length; position++) {
    if (judgedHits[position] > 0) {
      return 1 / (position + 1);
    }
  }
  return 0;
}
199
-
200
/**
 * Aggregate per-run results into one metrics row per pipeline.
 *
 * Runs flagged `skipped` or carrying an `error` are ignored. A hit's relevance
 * is looked up by (queryId, sessionId) and defaults to 0 when unjudged.
 *
 * @param {object[]} runs - Run records with pipeline, queryId, latencyMs and hits.
 * @param {object[]} judgements - Records of { queryId, sessionId, relevance }.
 * @param {number} k - Cutoff passed to nDcgAtK.
 * @returns {object[]} Per pipeline: count, nDCG5, MRR, latencyMs
 *   { mean, p50, p95 }, emptyResultRate and judgeableRate.
 */
function computeMetrics(runs, judgements, k) {
  const relevanceByKey = new Map(
    judgements.map((j) => [`${j.queryId}::${j.sessionId}`, j.relevance]),
  );

  const byPipeline = {};
  for (const run of runs) {
    if (run.skipped || run.error) continue;
    if (!byPipeline[run.pipeline]) {
      byPipeline[run.pipeline] = { judged: [], latency: [], empty: 0, total: 0, judgeable: 0 };
    }
    const bucket = byPipeline[run.pipeline];
    const hits = run.hits || [];

    bucket.total += 1;
    bucket.latency.push(run.latencyMs);
    if (hits.length === 0) bucket.empty += 1;

    const rels = hits.map((hit) => relevanceByKey.get(`${run.queryId}::${hit.sessionId}`) ?? 0);
    if (rels.some((rel) => rel > 0)) bucket.judgeable += 1;
    bucket.judged.push(rels);
  }

  // Arithmetic mean, or null for an empty list.
  const average = (values) => (
    values.length ? values.reduce((sum, value) => sum + value, 0) / values.length : null
  );

  return Object.entries(byPipeline).map(([pipeline, agg]) => {
    // Queries with no relevant hit yield null nDCG and are left out of the mean.
    const ndcgs = agg.judged.map((rels) => nDcgAtK(rels, k)).filter((value) => value !== null);
    const mrrs = agg.judged.map(reciprocalRank);
    const sortedLatency = agg.latency.slice().sort((a, b) => a - b);
    const percentile = (frac) => {
      if (sortedLatency.length === 0) return null;
      const index = Math.min(sortedLatency.length - 1, Math.floor(frac * sortedLatency.length));
      return sortedLatency[index];
    };
    return {
      pipeline,
      count: agg.total,
      nDCG5: average(ndcgs),
      MRR: average(mrrs),
      latencyMs: {
        mean: average(sortedLatency),
        p50: percentile(0.5),
        p95: percentile(0.95),
      },
      emptyResultRate: agg.total ? agg.empty / agg.total : 0,
      judgeableRate: agg.total ? agg.judgeable / agg.total : 0,
    };
  });
}
238
-
239
/**
 * Render the bench report as a markdown summary.
 *
 * Emits a header, the overall metrics table, optional ZH+Mixed and EN subset
 * tables (when present on `report.metrics`) and the list of skipped pipelines.
 *
 * The nDCG column header reflects the actual cutoff `k`. It used to read
 * "nDCG@5" whatever `--limit` was, which mislabeled reports for other
 * cutoffs. Output is unchanged when k is 5.
 *
 * @param {object} report - Report with meta, queries, metrics and skipped.
 * @param {number} k - Cutoff the metrics were computed at.
 * @returns {string} Markdown text ending with a newline.
 */
function renderMarkdown(report, k) {
  const { meta, metrics } = report;
  const ndcgLabel = `nDCG@${k}`;
  const lines = [
    `# Aquifer Retro Recall Bench`,
    '',
    `- Generated: ${meta.generatedAt}`,
    `- Schema: \`${meta.schema}\` / Tenant: \`${meta.tenantId}\``,
    `- Queries: ${report.queries.length} (warmup ${meta.warmup} excluded from metrics)`,
    `- Pipelines: ${meta.pipelines.join(', ')}`,
    '',
    `## Overall (top ${k})`,
    `| Pipeline | ${ndcgLabel} | MRR | Mean ms | p50 | p95 | Empty% | Judgeable% | N |`,
    '|---|---|---|---|---|---|---|---|---|',
  ];
  for (const m of metrics.overall) {
    lines.push(`| ${m.pipeline} | ${fmt(m.nDCG5)} | ${fmt(m.MRR)} | ${fmtMs(m.latencyMs.mean)} | ${fmtMs(m.latencyMs.p50)} | ${fmtMs(m.latencyMs.p95)} | ${pct(m.emptyResultRate)} | ${pct(m.judgeableRate)} | ${m.count} |`);
  }

  // Language subsets share a narrower table and are omitted when absent.
  const pushSubset = (title, rows) => {
    if (!rows) return;
    lines.push('');
    lines.push(`## ${title} (top ${k})`);
    lines.push(`| Pipeline | ${ndcgLabel} | MRR | Empty% | N |`);
    lines.push('|---|---|---|---|---|');
    for (const m of rows) {
      lines.push(`| ${m.pipeline} | ${fmt(m.nDCG5)} | ${fmt(m.MRR)} | ${pct(m.emptyResultRate)} | ${m.count} |`);
    }
  };
  pushSubset('ZH+Mixed subset', metrics.zhMixed);
  pushSubset('EN subset', metrics.en);

  if (report.skipped.length > 0) {
    lines.push('');
    lines.push('## Skipped pipelines');
    for (const s of report.skipped) lines.push(`- \`${s.pipeline}\`: ${s.reason}`);
  }
  return lines.join('\n') + '\n';
}
279
-
280
/** Format a score with three decimals; null/undefined renders as an em dash. */
function fmt(v) {
  if (v == null) return '—';
  return v.toFixed(3);
}
281
/** Format a millisecond value with one decimal; null/undefined renders as an em dash. */
function fmtMs(v) {
  if (v == null) return '—';
  return v.toFixed(1);
}
282
/** Format a 0..1 fraction as a whole-number percentage; null/undefined renders as an em dash. */
function pct(v) {
  if (v == null) return '—';
  const scaled = v * 100;
  return `${scaled.toFixed(0)}%`;
}
283
-
284
/**
 * Bench entry point: load the query set (and optional judgements), run every
 * requested pipeline for every query, then write a JSON report and a markdown
 * summary.
 *
 * Exits with status 2 on usage problems (no query set, empty query set, no
 * database URL) and 0 after printing help.
 *
 * Changes from the earlier version:
 * - The derived markdown path can no longer equal the JSON path. When
 *   `--output` did not end in `.json`, the markdown used to overwrite the
 *   JSON report.
 * - The aquifer instance and the pool are released in a `finally`, so they
 *   are also closed when a pipeline throws, and a non-promise `close()` no
 *   longer raises a TypeError.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help || !args.querySet) {
    printHelp();
    process.exit(args.help ? 0 : 2);
  }

  const querySet = JSON.parse(fs.readFileSync(args.querySet, 'utf8'));
  const queries = querySet.queries || [];
  if (queries.length === 0) {
    console.error('Empty query set');
    process.exit(2);
  }

  const judgements = args.judgements
    ? (JSON.parse(fs.readFileSync(args.judgements, 'utf8')).judgements || [])
    : [];

  const dbUrl = process.env.DATABASE_URL || process.env.AQUIFER_DB_URL;
  if (!dbUrl) {
    console.error('DATABASE_URL is required');
    process.exit(2);
  }

  const pool = new Pool({ connectionString: dbUrl });
  const runs = [];
  const skipped = new Set();
  let ftsAvailable;
  let aquifer;

  try {
    ftsAvailable = await detectFtsConfigsAvailable(pool);

    // Without an embedder the vector pipelines report themselves as skipped.
    const embedFn = (() => {
      try {
        const e = createEmbedder({}); // autodetect via EMBED_PROVIDER
        return (texts) => e.embedBatch(texts);
      } catch (err) {
        console.warn(`[bench] embed unavailable: ${err.message} — vector pipelines will skip`);
        return null;
      }
    })();

    aquifer = aquiferIndex.createAquifer({
      db: pool,
      schema: args.schema,
      tenantId: args.tenantId,
      embed: embedFn ? { fn: embedFn } : undefined,
      rerank: process.env.OPENROUTER_API_KEY
        ? { provider: 'openrouter', openrouterApiKey: process.env.OPENROUTER_API_KEY, topK: args.rerankTopK, autoTrigger: { enabled: false } }
        : null,
    });

    const ctx = {
      pool, schema: args.schema, tenantId: args.tenantId,
      limit: args.limit, queryVec: null, aquifer, ftsAvailable,
    };

    // Warmup: just run hybrid once to prime the pool.
    for (let w = 0; w < args.warmup; w++) {
      try { await aquifer.recall(queries[0].text, { limit: args.limit }); } catch { /* ignore */ }
    }

    for (const q of queries) {
      let queryVec = null;
      if (embedFn) {
        try { queryVec = (await embedFn([q.text]))[0]; }
        catch (err) { console.warn(`[bench] embed failed for "${q.id}": ${err.message}`); }
      }
      ctx.queryVec = queryVec;

      for (const pipeline of args.pipelines) {
        const r = await runPipeline(pipeline, ctx, q);
        if (r.skipped) {
          // Serialised so the Set dedupes identical (pipeline, reason) pairs.
          skipped.add(JSON.stringify({ pipeline, reason: r.reason }));
          continue;
        }
        runs.push({
          queryId: q.id,
          pipeline,
          latencyMs: r.latencyMs,
          empty: !r.result || r.result.length === 0,
          error: r.error ? { code: r.error.code || 'ERR', message: r.error.message } : null,
          hits: r.result || [],
        });
      }
    }
  } finally {
    // Best-effort cleanup: a failure to close must not mask the bench outcome.
    try { await aquifer?.close?.(); } catch { /* ignore */ }
    await pool.end().catch(() => {});
  }

  const overallRuns = runs;
  const zhMixedQueries = new Set(queries.filter(q => q.lang === 'zh' || q.lang === 'mixed').map(q => q.id));
  const enQueries = new Set(queries.filter(q => q.lang === 'en').map(q => q.id));
  const subset = (set) => runs.filter(r => set.has(r.queryId));

  const report = {
    meta: {
      generatedAt: new Date().toISOString(),
      schema: args.schema,
      tenantId: args.tenantId,
      limit: args.limit,
      warmup: args.warmup,
      pipelines: args.pipelines,
      ftsConfigsAvailable: [...ftsAvailable],
    },
    queries,
    judgements,
    runs,
    metrics: {
      overall: computeMetrics(overallRuns, judgements, args.limit),
      zhMixed: zhMixedQueries.size > 0 ? computeMetrics(subset(zhMixedQueries), judgements, args.limit) : null,
      en: enQueries.size > 0 ? computeMetrics(subset(enQueries), judgements, args.limit) : null,
    },
    skipped: [...skipped].map(s => JSON.parse(s)),
  };

  const outPath = args.output || `bench-report-${Date.now()}.json`;
  fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
  console.log(`Wrote ${outPath}`);

  const md = renderMarkdown(report, args.limit);
  // Swap a trailing .json for .md, otherwise append .md, so the two outputs
  // never share a path.
  const derivedMdPath = /\.json$/.test(outPath)
    ? outPath.replace(/\.json$/, '.md')
    : `${outPath}.md`;
  const mdPath = args.markdown || derivedMdPath;
  fs.writeFileSync(mdPath, md);
  console.log(`Wrote ${mdPath}`);

  if (judgements.length === 0) {
    console.log('\nNo judgements provided — metrics are coverage/latency only.');
    console.log('Edit the JSON output to add judgements like:');
    console.log(' "judgements": [{"queryId":"q-001","sessionId":"sess_abc","relevance":3}, ...]');
    console.log('then re-run with --judgements <path>.');
  }
}

// Any uncaught failure is reported with its stack and a non-zero exit status.
main().catch(err => {
  console.error('[bench] fatal:', err.stack || err.message);
  process.exit(1);
});
@@ -1,75 +0,0 @@
1
-- Sample real user "first question" turns from sessions for retro recall bench.
-- Output: up to 25 rows, capped per agent (cc/main/life) x language (zh/en/mixed)
-- bucket by the limits in the final WHERE clause; rows within a bucket are picked
-- at random.
-- Usage:
-- psql $DATABASE_URL -f scripts/sample-bench-queries.sql --csv > queries.csv
-- then convert to JSON via:
-- node -e "const fs=require('fs');const lines=fs.readFileSync('queries.csv','utf8').trim().split('\n').slice(1);const out={version:1,queries:lines.map((l,i)=>{const[id,sid,ag,src,lang,qt]=l.split(','); return {id:'q-sql-'+(i+1),sessionRowId:Number(id),sessionId:sid,agentId:ag,source:src,sourceKind:'sql-sampled',lang,text:qt.replace(/^\"|\"$/g,'').replace(/\"\"/g,'\"')}})};fs.writeFileSync('queries.json',JSON.stringify(out,null,2))"
-- Caveat: that one-liner splits each CSV line on every comma and each row on
-- every newline, so a query text containing a comma or a line break is
-- truncated or mis-parsed. Use a real CSV parser if the sample has such rows.

-- Override with `psql -v schema=aquifer -f sample-bench-queries.sql`
\if :{?schema}
\else
\set schema 'miranda'
\endif

WITH raw_turns AS (
  -- One row per message of every session belonging to the three sampled agents.
  SELECT
    s.id AS session_row_id,
    s.session_id,
    s.agent_id,
    s.source,
    s.started_at,
    m.ordinality AS turn_ordinal,
    m.msg->>'role' AS role,
    m.msg->>'content' AS content
  FROM :"schema".sessions s
  CROSS JOIN LATERAL jsonb_array_elements(
    COALESCE(s.messages->'normalized', s.messages)
  ) WITH ORDINALITY AS m(msg, ordinality)
  WHERE s.agent_id IN ('main', 'life', 'cc')
    AND s.user_count > 0
),
question_turns AS (
  -- Short user turns that look like questions, tagged with a language bucket
  -- and numbered by their order within the session.
  SELECT *,
    CASE
      WHEN content ~ '[\u4e00-\u9fff]' AND content ~ '[A-Za-z]' THEN 'mixed'
      WHEN content ~ '[\u4e00-\u9fff]' THEN 'zh'
      WHEN content ~ '[A-Za-z]' THEN 'en'
      ELSE 'other'
    END AS lang_bucket,
    row_number() OVER (PARTITION BY session_row_id ORDER BY turn_ordinal) AS seq_in_session
  FROM raw_turns
  WHERE role = 'user'
    AND content IS NOT NULL
    AND length(content) BETWEEN 4 AND 200
    -- NOTE(review): the bracket expression lists '?' twice; the second was
    -- presumably a fullwidth question mark lost in transcription. Confirm.
    -- The keyword match is case-insensitive (~*) so capitalised "How", "What",
    -- "Why" and "Can you" qualify without a question mark too.
    AND (content ~ '[??]' OR content ~* '(嗎|呢|怎麼|如何|why|how|what|can you|could you)')
),
first_questions AS (
  -- Keep only the first question-like turn of each session.
  SELECT *
  FROM question_turns
  WHERE seq_in_session = 1
    AND lang_bucket IN ('mixed', 'zh', 'en')
),
balanced AS (
  -- Random rank within each (agent, language) bucket, used for the caps below.
  SELECT *,
    row_number() OVER (PARTITION BY agent_id, lang_bucket ORDER BY random()) AS bucket_rn
  FROM first_questions
)
SELECT
  session_row_id,
  session_id,
  agent_id,
  source,
  lang_bucket AS lang,
  content AS query_text
FROM balanced
WHERE
  (agent_id = 'main' AND lang_bucket = 'mixed' AND bucket_rn <= 4) OR
  (agent_id = 'main' AND lang_bucket = 'zh' AND bucket_rn <= 3) OR
  (agent_id = 'main' AND lang_bucket = 'en' AND bucket_rn <= 2) OR
  (agent_id = 'life' AND lang_bucket = 'mixed' AND bucket_rn <= 3) OR
  (agent_id = 'life' AND lang_bucket = 'zh' AND bucket_rn <= 2) OR
  (agent_id = 'life' AND lang_bucket = 'en' AND bucket_rn <= 2) OR
  (agent_id = 'cc' AND lang_bucket = 'mixed' AND bucket_rn <= 4) OR
  (agent_id = 'cc' AND lang_bucket = 'zh' AND bucket_rn <= 3) OR
  (agent_id = 'cc' AND lang_bucket = 'en' AND bucket_rn <= 2)
ORDER BY agent_id, lang;