metame-cli 1.6.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +4 -1
- package/package.json +1 -1
- package/scripts/core/chunker.js +100 -0
- package/scripts/core/embedding.js +225 -0
- package/scripts/core/hybrid-search.js +296 -0
- package/scripts/core/wiki-db.js +144 -3
- package/scripts/daemon-command-router.js +25 -1
- package/scripts/daemon-default.yaml +31 -0
- package/scripts/daemon-embedding.js +162 -0
- package/scripts/daemon-engine-runtime.js +1 -1
- package/scripts/daemon-health-scan.js +185 -0
- package/scripts/daemon-runtime-lifecycle.js +1 -1
- package/scripts/daemon-task-scheduler.js +5 -3
- package/scripts/daemon-wiki.js +126 -4
- package/scripts/daemon.js +4 -2
- package/scripts/feishu-adapter.js +25 -0
- package/scripts/memory-backfill-chunks.js +92 -0
- package/scripts/memory-search.js +43 -15
- package/scripts/memory-wiki-schema.js +161 -2
- package/scripts/memory.js +15 -0
- package/scripts/wiki-cluster.js +121 -0
- package/scripts/wiki-extract.js +171 -0
- package/scripts/wiki-facts.js +351 -0
- package/scripts/wiki-import.js +256 -0
- package/scripts/wiki-reflect-build.js +352 -28
- package/scripts/wiki-reflect-export.js +115 -0
- package/scripts/wiki-reflect.js +34 -1
- package/scripts/wiki-synthesis.js +224 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('node:fs');
|
|
4
|
+
const path = require('node:path');
|
|
5
|
+
|
|
6
|
+
const { extractText, slugFromFilename, sha256 } = require('./wiki-extract');
|
|
7
|
+
const { buildConnectedComponents, getDocEmbeddings } = require('./wiki-cluster');
|
|
8
|
+
const { buildDocWikiPage, buildTopicClusterPage } = require('./wiki-reflect-build');
|
|
9
|
+
const {
|
|
10
|
+
upsertDocSource, getDocSourceByPath, listStaleDocSources,
|
|
11
|
+
markDocSourcesMissing, getClusterMemberIds,
|
|
12
|
+
listClusterPages,
|
|
13
|
+
} = require('./core/wiki-db');
|
|
14
|
+
|
|
15
|
+
const SUPPORTED_EXTS = new Set(['.md', '.txt', '.pdf']);
|
|
16
|
+
const DRAIN_POLL_MS = 5000;
|
|
17
|
+
const DRAIN_TIMEOUT_MS = 5 * 60 * 1000;
|
|
18
|
+
|
|
19
|
+
/**
 * Collect the supported files reachable from `inputPath`.
 *
 * A regular file is returned as a one-element list when its extension is in
 * SUPPORTED_EXTS, otherwise an empty list. A directory yields its direct
 * children with supported extensions (non-recursive). All returned paths are
 * resolved through realpathSync so symlinked inputs dedupe cleanly.
 *
 * @param {string} inputPath - file or directory to scan
 * @returns {string[]} absolute, symlink-resolved paths
 */
function scanFiles(inputPath) {
  const resolved = fs.realpathSync(inputPath);
  if (fs.statSync(resolved).isFile()) {
    const supported = SUPPORTED_EXTS.has(path.extname(resolved).toLowerCase());
    return supported ? [resolved] : [];
  }
  const matches = [];
  for (const entry of fs.readdirSync(resolved)) {
    if (SUPPORTED_EXTS.has(path.extname(entry).toLowerCase())) {
      matches.push(fs.realpathSync(path.join(resolved, entry)));
    }
  }
  return matches;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Return a slug not yet used by either wiki_pages or doc_sources.
 * Tries `base` first, then `base-2`, `base-3`, … until a free one is found.
 *
 * @param {object} db - DatabaseSync instance
 * @param {string} base - preferred slug
 * @returns {string} a slug guaranteed unique across both tables at call time
 */
function generateUniqueSlug(db, base) {
  const wikiTaken = db.prepare('SELECT 1 FROM wiki_pages WHERE slug=?');
  const docTaken = db.prepare('SELECT 1 FROM doc_sources WHERE slug=?');
  const inUse = (s) => Boolean(wikiTaken.get(s) || docTaken.get(s));

  let slug = base;
  let suffix = 2;
  while (inUse(slug)) {
    slug = `${base}-${suffix}`;
    suffix += 1;
  }
  return slug;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Poll embedding_queue until none of the given chunk ids remain queued.
 *
 * Polls every DRAIN_POLL_MS and gives up after DRAIN_TIMEOUT_MS so an
 * unhealthy embedding daemon cannot block the import forever.
 *
 * @param {object} db - DatabaseSync instance
 * @param {string[]} chunkIds - chunk ids whose embeddings we are waiting on
 * @param {Function} [log] - progress sink (no-op by default)
 * @returns {Promise<boolean>} true when drained (or nothing to wait on), false on timeout
 */
async function waitForEmbeddingDrain(db, chunkIds, log = () => {}) {
  if (chunkIds.length === 0) return true;

  const placeholders = Array(chunkIds.length).fill('?').join(',');
  const pending = db.prepare(
    `SELECT COUNT(*) as cnt FROM embedding_queue WHERE item_type='chunk' AND item_id IN (${placeholders})`
  );

  const giveUpAt = Date.now() + DRAIN_TIMEOUT_MS;
  while (Date.now() < giveUpAt) {
    const { cnt } = pending.get(...chunkIds);
    if (cnt === 0) return true;
    log(`[wiki-import] waiting for ${cnt} embeddings to drain...`);
    await new Promise((resolve) => setTimeout(resolve, DRAIN_POLL_MS));
  }

  log('[wiki-import] WARNING: embedding drain timed out, skipping clustering');
  return false;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Import documents from a file or directory into the wiki.
 *
 * Pipeline:
 *   Phase 0 — extract text, hash, and upsert doc_sources (unchanged files skipped)
 *   Phase 1 — build Tier 1 wiki pages for stale doc_sources
 *   Phase 2 — mark cluster pages that contain rebuilt docs as stale
 *   Phase 3 — wait for chunk embeddings to drain, then rebuild topic clusters
 *
 * Fix: `allowedSlugs` was recomputed inside the Phase 1 loop although it only
 * depends on `files`; it is now hoisted out of the loop.
 *
 * @param {object} db - DatabaseSync instance with the wiki schema applied
 * @param {string} inputPath - file or directory to import
 * @param {{ providers?: object, noCluster?: boolean, log?: Function }} [opts]
 * @returns {Promise<{imported: number, skipped: number, failed: number, clusters: number}>}
 */
async function runWikiImport(db, inputPath, { providers, noCluster = false, log = () => {} } = {}) {
  if (!fs.existsSync(inputPath)) {
    log(`[wiki-import] ERROR: path does not exist: ${inputPath}`);
    return { imported: 0, skipped: 0, failed: 0, clusters: 0 };
  }

  log(`[wiki-import] scanning: ${inputPath}`);
  const files = scanFiles(inputPath);
  log(`[wiki-import] found ${files.length} supported files`);

  const seenPaths = [];
  const stats = { imported: 0, skipped: 0, failed: 0, clusters: 0 };
  const extractedTexts = new Map();

  // Phase 0: Extract + hash check + upsert doc_sources
  for (const filePath of files) {
    seenPaths.push(filePath);
    try {
      const stat = fs.statSync(filePath);
      const existing = getDocSourceByPath(db, filePath);

      // Unchanged on disk and not flagged stale → nothing to do.
      if (existing && existing.mtime_ms === stat.mtimeMs && existing.size_bytes === stat.size) {
        if (!existing.content_stale) { stats.skipped++; continue; }
      }

      const { text, title, extractor, extractStatus, errorMessage } = await extractText(filePath);
      if (text) extractedTexts.set(filePath, text);
      const fileHash = sha256(fs.readFileSync(filePath));
      const extractedTextHash = text ? sha256(text) : null;
      const baseSlug = slugFromFilename(filePath);
      // Keep the existing slug on re-import so inbound wikilinks stay valid.
      const slug = existing ? existing.slug : generateUniqueSlug(db, baseSlug);

      upsertDocSource(db, {
        filePath, fileHash,
        mtimeMs: stat.mtimeMs, sizeBytes: stat.size,
        extractedTextHash, fileType: path.extname(filePath).slice(1).toLowerCase(),
        extractor, extractStatus, title, slug,
      });

      if (extractStatus !== 'ok') {
        log(`[wiki-import] SKIP ${path.basename(filePath)}: ${errorMessage || extractStatus}`);
      }
    } catch (err) {
      log(`[wiki-import] Phase 0 error for ${path.basename(filePath)}: ${err.message}`);
    }
  }

  // Flag doc_sources whose files no longer exist under this input path.
  markDocSourcesMissing(db, seenPaths);

  // Phase 1: Build Tier 1 pages for stale docs
  const stale = listStaleDocSources(db);
  const builtSlugs = [];
  const allChunkIds = [];
  // Hoisted (fix): the allowed wikilink targets are identical for every doc
  // in this run — previously recomputed on each loop iteration.
  const allowedSlugs = files.map(f => slugFromFilename(f));

  for (const docSrc of stale) {
    if (docSrc.extract_status !== 'ok') {
      // Nothing buildable; clear the stale flag so we don't retry every run.
      db.prepare("UPDATE doc_sources SET content_stale=0 WHERE id=?").run(docSrc.id);
      continue;
    }
    try {
      const text = extractedTexts.get(docSrc.file_path) || '';
      const result = await buildDocWikiPage(db, docSrc, text, { allowedSlugs, providers });
      if (result) {
        db.prepare("UPDATE doc_sources SET content_stale=0, built_at=? WHERE id=?")
          .run(new Date().toISOString(), docSrc.id);
        builtSlugs.push(docSrc.slug);
        const chunks = db.prepare("SELECT id FROM content_chunks WHERE page_slug=?").all(docSrc.slug);
        allChunkIds.push(...chunks.map(c => c.id));
        stats.imported++;
        log(`[wiki-import] built: ${docSrc.slug}`);
      }
    } catch (err) {
      db.prepare("UPDATE doc_sources SET error_message=? WHERE id=?").run(err.message, docSrc.id);
      stats.failed++;
      log(`[wiki-import] FAILED ${docSrc.slug}: ${err.message}`);
    }
  }

  // Phase 2: Cascade stale cluster pages — any cluster containing a rebuilt
  // doc must be re-synthesized.
  if (builtSlugs.length > 0) {
    const affected = db.prepare(`
      SELECT DISTINCT page_slug FROM wiki_page_doc_sources
      WHERE role='cluster_member'
        AND doc_source_id IN (SELECT id FROM doc_sources WHERE slug IN (${builtSlugs.map(() => '?').join(',')}))
    `).all(...builtSlugs).map(r => r.page_slug);
    if (affected.length > 0) {
      const ph = affected.map(() => '?').join(',');
      db.prepare(`UPDATE wiki_pages SET staleness=1 WHERE slug IN (${ph})`).run(...affected);
    }
  }

  // Phase 3: Clustering (Tier 2) — requires the new chunks to be embedded first.
  if (!noCluster) {
    const drained = await waitForEmbeddingDrain(db, allChunkIds, log);
    if (drained) {
      const allDocSlugs = db.prepare("SELECT slug FROM doc_sources WHERE status='active' AND extract_status='ok'").all().map(r => r.slug);
      const embeddings = getDocEmbeddings(db, allDocSlugs);

      // embeddings is Array<{ slug, vector: Float32Array }>, use .length
      if (embeddings.length >= 3) {
        const clusters = buildConnectedComponents(embeddings, { threshold: 0.75, minSize: 3 });
        const existingClusters = listClusterPages(db).map(cp => ({
          slug: cp.slug,
          memberIds: getClusterMemberIds(db, cp.slug),
        }));

        const getDocBySlug = db.prepare("SELECT * FROM doc_sources WHERE slug=?");
        for (const memberSlugs of clusters) {
          const docRows = memberSlugs.map(s => getDocBySlug.get(s)).filter(Boolean);
          try {
            // buildTopicClusterPage returns { slug, strippedLinks } or null
            const clusterResult = await buildTopicClusterPage(db, docRows, {
              allowedSlugs: allDocSlugs, providers, existingClusters,
            });
            if (clusterResult) {
              stats.clusters++;
              log(`[wiki-import] cluster: ${clusterResult.slug}`);
            }
          } catch (err) {
            log(`[wiki-import] cluster FAILED: ${err.message}`);
          }
        }
      } else {
        log('[wiki-import] not enough embedded docs for clustering yet');
      }
    }
  }

  log(`[wiki-import] done — imported: ${stats.imported}, skipped: ${stats.skipped}, failed: ${stats.failed}, clusters: ${stats.clusters}`);
  return stats;
}
|
|
190
|
+
|
|
191
|
+
module.exports = { runWikiImport, scanFiles, generateUniqueSlug, waitForEmbeddingDrain };

// ── CLI entry point ──────────────────────────────────────────────────────────
// Usage: node wiki-import.js <path> [--no-cluster]
//
//   <path>         File or directory to import (.pdf / .md / .txt)
//   --no-cluster   Skip Tier 2 clustering step
//
// Output: JSON on stdout, progress on stderr
//   {"imported":3,"skipped":1,"failed":0,"clusters":1}
//
// Fix: the previous version called process.exit() inside the try/catch, which
// terminates the process before `finally` runs, so db.close() was never
// reached on either the success or the error path. We now set
// process.exitCode and let the process exit naturally after cleanup.
if (require.main === module) {
  (async () => {
    const { DatabaseSync } = require('node:sqlite');
    const { DB_PATH } = require('./memory.js');
    const { applyWikiSchema } = require('./memory-wiki-schema.js');
    const { callHaiku, buildDistillEnv } = require('./providers.js');

    const args = process.argv.slice(2);
    if (args.length === 0 || args[0] === '--help' || args[0] === '-h') {
      process.stderr.write([
        'Usage: node wiki-import.js <path> [--no-cluster]',
        '',
        '  <path>         File or directory (.pdf / .md / .txt)',
        '  --no-cluster   Skip Tier 2 embedding-based clustering',
        '',
        'Examples:',
        '  node ~/.metame/wiki-import.js ~/papers/',
        '  node ~/.metame/wiki-import.js ~/papers/paper.pdf',
      ].join('\n') + '\n');
      process.exit(0); // nothing opened yet — immediate exit is safe here
    }

    const inputPath = args.find(a => !a.startsWith('--'));
    const noCluster = args.includes('--no-cluster');

    if (!inputPath) {
      process.stderr.write('Error: path argument required\n');
      process.exit(1); // nothing opened yet — immediate exit is safe here
    }

    let db;
    try {
      db = new DatabaseSync(DB_PATH);
      applyWikiSchema(db);
    } catch (err) {
      process.stderr.write(`Error: failed to open DB at ${DB_PATH}: ${err.message}\n`);
      process.exit(1);
    }

    try {
      const stats = await runWikiImport(db, inputPath, {
        providers: { callHaiku, buildDistillEnv },
        noCluster,
        log: (msg) => process.stderr.write(msg + '\n'),
      });
      process.stdout.write(JSON.stringify(stats) + '\n');
      process.exitCode = 0;
    } catch (err) {
      process.stderr.write(`Error: ${err.message}\n`);
      process.exitCode = 1;
    } finally {
      // Runs on both paths now that we no longer process.exit() above.
      try { db.close(); } catch { /* ignore */ }
    }
  })();
}
|
|
@@ -10,31 +10,35 @@
|
|
|
10
10
|
* Exports:
|
|
11
11
|
* buildWikiPage(db, topic, queryResult, { allowedSlugs, providers })
|
|
12
12
|
* → { slug, content, strippedLinks, rawSourceIds } | null
|
|
13
|
+
* generateWikiContent(prompt, providers, allowedSlugs)
|
|
14
|
+
* → { content, strippedLinks } | null
|
|
15
|
+
* writeWikiPageWithChunks(db, pageSpec, content, { docSourceIds, role })
|
|
16
|
+
* → void
|
|
13
17
|
*/
|
|
14
18
|
|
|
19
|
+
const crypto = require('node:crypto');
|
|
15
20
|
const { buildWikiPrompt, validateWikilinks } = require('./core/wiki-prompt');
|
|
16
|
-
const {
|
|
21
|
+
const { extractText, extractSections } = require('./wiki-extract');
|
|
22
|
+
const { extractPaperFacts, buildTier1Prompt } = require('./wiki-facts');
|
|
23
|
+
const { buildComparisonMatrix, buildTimeline, detectContradictions, buildCoverageReport } = require('./wiki-synthesis');
|
|
24
|
+
const { upsertWikiPage, resetPageStaleness, appendWikiTimeline } = require('./core/wiki-db');
|
|
25
|
+
const { chunkText } = require('./core/chunker');
|
|
26
|
+
const { membershipHash, findMatchingCluster } = require('./wiki-cluster');
|
|
17
27
|
|
|
18
28
|
const LLM_TIMEOUT_MS = 60000; // Sonnet needs more time than Haiku
|
|
19
29
|
|
|
20
30
|
/**
|
|
21
|
-
*
|
|
31
|
+
* Call the LLM with a prompt and validate [[wikilinks]] in the response.
|
|
22
32
|
*
|
|
23
|
-
* @param {
|
|
24
|
-
* @param {{
|
|
25
|
-
* @param {
|
|
26
|
-
* @
|
|
27
|
-
*
|
|
28
|
-
* Returns null on LLM failure (caller enqueues for retry). DB write failure throws.
|
|
33
|
+
* @param {string} prompt
|
|
34
|
+
* @param {{ callHaiku: Function, buildDistillEnv: Function }} providers
|
|
35
|
+
* @param {string[]} allowedSlugs
|
|
36
|
+
* @returns {{ content: string, strippedLinks: string[] } | null}
|
|
37
|
+
* Returns null on LLM failure or empty response (caller enqueues for retry).
|
|
29
38
|
*/
|
|
30
|
-
async function
|
|
39
|
+
async function generateWikiContent(prompt, providers, allowedSlugs) {
|
|
31
40
|
const { callHaiku, buildDistillEnv } = providers;
|
|
32
|
-
const { totalCount, facts, capsuleExcerpts } = queryResult;
|
|
33
41
|
|
|
34
|
-
// Build prompt
|
|
35
|
-
const prompt = buildWikiPrompt(topic, facts, capsuleExcerpts, allowedSlugs);
|
|
36
|
-
|
|
37
|
-
// Call LLM — return null on failure so caller can schedule exponential-backoff retry
|
|
38
42
|
let rawContent;
|
|
39
43
|
try {
|
|
40
44
|
const env = buildDistillEnv();
|
|
@@ -49,34 +53,141 @@ async function buildWikiPage(db, topic, queryResult, { allowedSlugs = [], provid
|
|
|
49
53
|
|
|
50
54
|
// Validate and strip illegal [[wikilinks]]
|
|
51
55
|
const { content, stripped: strippedLinks } = validateWikilinks(rawContent.trim(), allowedSlugs);
|
|
56
|
+
return { content, strippedLinks };
|
|
57
|
+
}
|
|
52
58
|
|
|
53
|
-
|
|
54
|
-
|
|
59
|
+
/**
|
|
60
|
+
* Atomic DB write: upsert wiki_page, reset staleness, replace chunks, enqueue
|
|
61
|
+
* embeddings, and optionally link doc_sources. All inside a single transaction.
|
|
62
|
+
*
|
|
63
|
+
* @param {object} db - DatabaseSync instance
|
|
64
|
+
* @param {{ slug: string, title: string, primary_topic: string, source_type?: string,
|
|
65
|
+
* raw_source_ids?: string, capsule_refs?: string, raw_source_count?: number,
|
|
66
|
+
* topic_tags?: string, word_count?: number, membership_hash?: string,
|
|
67
|
+
* cluster_size?: number }} pageSpec
|
|
68
|
+
* @param {string} content
|
|
69
|
+
* @param {{ docSourceIds?: number[], role?: string }} opts
|
|
70
|
+
*/
|
|
71
|
+
/**
 * Atomically persist a wiki page together with its derived chunks.
 *
 * Inside ONE transaction: upsert the wiki_pages row, reset its staleness
 * counters, drop the page's old chunks (and their pending embedding_queue
 * entries), insert fresh chunks + enqueue their embeddings, and optionally
 * link doc_sources rows to the page.
 *
 * @param {object} db - DatabaseSync instance
 * @param {{ slug: string, title: string, primary_topic: string, source_type?: string,
 *           raw_source_ids?: string, capsule_refs?: string, raw_source_count?: number,
 *           topic_tags?: string, word_count?: number, membership_hash?: string,
 *           cluster_size?: number }} pageSpec
 * @param {string} content - final page markdown
 * @param {{ docSourceIds?: number[], role?: string }} [opts]
 * @throws rethrows any DB error after rolling the transaction back
 */
function writeWikiPageWithChunks(db, pageSpec, content, { docSourceIds = [], role } = {}) {
  const {
    slug,
    title,
    primary_topic,
    source_type = 'memory',
    raw_source_ids,
    capsule_refs,
    raw_source_count = 0,
    topic_tags,
    membership_hash,
    cluster_size,
  } = pageSpec;

  // word_count is always derived from the content, never trusted from pageSpec.
  const wordCount = content.split(/\s+/).filter(Boolean).length;

  // JSON-text columns default to '[]'; hash/size columns default to NULL.
  const sourceIdsJson = raw_source_ids !== undefined ? raw_source_ids : '[]';
  const capsuleRefsJson = capsule_refs !== undefined ? capsule_refs : '[]';
  const topicTagsJson = topic_tags !== undefined ? topic_tags : '[]';

  db.prepare('BEGIN').run();
  try {
    upsertWikiPage(db, {
      slug,
      primary_topic,
      title,
      content,
      raw_source_ids: sourceIdsJson,
      capsule_refs: capsuleRefsJson,
      raw_source_count,
      topic_tags: topicTagsJson,
      word_count: wordCount,
      source_type,
      membership_hash: membership_hash !== undefined ? membership_hash : null,
      cluster_size: cluster_size !== undefined ? cluster_size : null,
    });

    // Canonical helper: staleness=0, last_built_at=now.
    resetPageStaleness(db, slug, raw_source_count);

    // ── Chunk content + enqueue embeddings ──────────────────────────────────
    // Remove queue entries that point at chunks we are about to delete.
    const staleChunkIds = db
      .prepare('SELECT id FROM content_chunks WHERE page_slug = ?')
      .all(slug)
      .map((row) => row.id);
    if (staleChunkIds.length > 0) {
      const ph = staleChunkIds.map(() => '?').join(', ');
      db.prepare(`DELETE FROM embedding_queue WHERE item_type = 'chunk' AND item_id IN (${ph})`).run(...staleChunkIds);
    }
    db.prepare('DELETE FROM content_chunks WHERE page_slug = ?').run(slug);

    // Insert fresh chunks and queue each one for embedding.
    const pieces = chunkText(content, { targetWords: 300 });
    const insertChunk = db.prepare(
      'INSERT INTO content_chunks (id, page_slug, chunk_text, chunk_idx) VALUES (?, ?, ?, ?)',
    );
    const enqueueChunk = db.prepare(
      "INSERT INTO embedding_queue (item_type, item_id) VALUES ('chunk', ?)",
    );
    pieces.forEach((piece, idx) => {
      const chunkId = `ck_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
      insertChunk.run(chunkId, slug, piece, idx);
      enqueueChunk.run(chunkId);
    });

    // Optional page ↔ doc_sources links.
    if (docSourceIds.length > 0) {
      const linkStmt = db.prepare(
        'INSERT OR IGNORE INTO wiki_page_doc_sources (page_slug, doc_source_id, role) VALUES (?, ?, ?)',
      );
      const effectiveRole = role || 'primary';
      for (const docId of docSourceIds) {
        linkStmt.run(slug, docId, effectiveRole);
      }
    }

    db.prepare('COMMIT').run();
  } catch (err) {
    try { db.prepare('ROLLBACK').run(); } catch { /* ignore */ }
    throw err; // propagate DB errors to caller
  }
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Build a wiki page: call LLM, validate links, write to DB.
|
|
153
|
+
*
|
|
154
|
+
* @param {object} db - DatabaseSync instance
|
|
155
|
+
* @param {{ tag: string, slug: string, label: string }} topic
|
|
156
|
+
* @param {{ totalCount: number, facts: object[], capsuleExcerpts: string }} queryResult
|
|
157
|
+
* @param {{ allowedSlugs: string[], providers: { callHaiku: Function, buildDistillEnv: Function } }} opts
|
|
158
|
+
* @returns {{ slug: string, content: string, strippedLinks: string[], rawSourceIds: string[] } | null}
|
|
159
|
+
* Returns null on LLM failure (caller enqueues for retry). DB write failure throws.
|
|
160
|
+
*/
|
|
161
|
+
/**
 * Build a memory-backed wiki page: prompt the LLM, validate links, persist.
 *
 * @param {object} db - DatabaseSync instance
 * @param {{ tag: string, slug: string, label: string }} topic
 * @param {{ totalCount: number, facts: object[], capsuleExcerpts: string }} queryResult
 * @param {{ allowedSlugs: string[], providers: { callHaiku: Function, buildDistillEnv: Function } }} opts
 * @returns {Promise<{ slug: string, content: string, strippedLinks: string[], rawSourceIds: string[] } | null>}
 *   null on LLM failure (caller enqueues for retry); DB write failure throws.
 */
async function buildWikiPage(db, topic, queryResult, { allowedSlugs = [], providers }) {
  const { totalCount, facts, capsuleExcerpts } = queryResult;

  const prompt = buildWikiPrompt(topic, facts, capsuleExcerpts, allowedSlugs);

  // LLM call — null means the caller should schedule an exponential-backoff retry.
  const generated = await generateWikiContent(prompt, providers, allowedSlugs);
  if (!generated) return null;
  const { content, strippedLinks } = generated;

  // Facts that directly back this page.
  const rawSourceIds = facts.map((f) => f.id).filter(Boolean);

  // Persist page + chunks atomically.
  writeWikiPageWithChunks(db, {
    slug: topic.slug,
    primary_topic: topic.tag,
    title: topic.label || topic.tag,
    raw_source_ids: JSON.stringify(rawSourceIds),
    capsule_refs: '[]',
    raw_source_count: totalCount,
    topic_tags: JSON.stringify([topic.tag]),
  }, content, { docSourceIds: [] });

  // Append evidence to timeline (compiled truth was just rewritten above).
  const chunks = chunkText(content, { targetWords: 300 });
  appendWikiTimeline(db, topic.slug, `基于 ${totalCount} 条 facts 重建 (${rawSourceIds.length} 条直接引用, ${chunks.length} chunks)`);

  return { slug: topic.slug, content, strippedLinks, rawSourceIds };
}
|
|
@@ -114,4 +225,217 @@ function buildFallbackWikiContent(topic, queryResult) {
|
|
|
114
225
|
return lines.join('\n').trim();
|
|
115
226
|
}
|
|
116
227
|
|
|
117
|
-
|
|
228
|
+
/**
|
|
229
|
+
* Build a Tier 1 wiki page from pre-extracted facts.
|
|
230
|
+
* LLM call is outside any DB transaction.
|
|
231
|
+
*
|
|
232
|
+
* @param {object} db
|
|
233
|
+
* @param {object} docSource — row from doc_sources
|
|
234
|
+
* @param {object[]} facts — rows from paper_facts (already written to DB)
|
|
235
|
+
* @param {{ allowedSlugs: string[], providers: object }} opts
|
|
236
|
+
* @returns {Promise<{slug, content, strippedLinks}|null>}
|
|
237
|
+
*/
|
|
238
|
+
/**
 * Build a Tier 1 wiki page from pre-extracted paper facts.
 * The single LLM call happens before any DB transaction is opened.
 *
 * @param {object} db
 * @param {object} docSource - row from doc_sources
 * @param {object[]} facts - rows from paper_facts (already persisted)
 * @param {{ allowedSlugs: string[], providers: object }} opts
 * @returns {Promise<{slug, content, strippedLinks}|null>} null on LLM failure
 */
async function buildTier1Page(db, docSource, facts, { allowedSlugs, providers }) {
  const { slug, title, id: docSourceId } = docSource;
  const pageTitle = title || slug;

  // Evidence-grounded prompt — no raw-text truncation involved.
  const prompt = buildTier1Prompt(pageTitle, facts);
  const generated = await generateWikiContent(prompt, providers, allowedSlugs);
  if (!generated) return null;

  writeWikiPageWithChunks(db, {
    slug,
    title: pageTitle,
    primary_topic: slug,
    source_type: 'doc',
    raw_source_ids: '[]',
    topic_tags: '[]',
    raw_source_count: facts.length,
  }, generated.content, { docSourceIds: [docSourceId], role: 'primary' });

  return { slug, content: generated.content, strippedLinks: generated.strippedLinks };
}
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* Build a Tier 1 wiki page from a document source.
|
|
264
|
+
*
|
|
265
|
+
* New flow (evidence-first):
|
|
266
|
+
* 1. extractSections(text) — wiki-extract.js
|
|
267
|
+
* 2. extractPaperFacts(sections) — wiki-facts.js (writes paper_facts, all LLM calls here)
|
|
268
|
+
* 3. buildTier1Page(facts) — generates wiki page from evidence
|
|
269
|
+
*
|
|
270
|
+
* Falls back to null if text is unavailable (scanned PDF).
|
|
271
|
+
*
|
|
272
|
+
* @param {object} db
|
|
273
|
+
* @param {object} docSource — row from doc_sources
|
|
274
|
+
* @param {string} extractedText — full text (may be empty for stale re-runs)
|
|
275
|
+
* @param {{ allowedSlugs: string[], providers: object }} opts
|
|
276
|
+
* @returns {Promise<{slug, content, strippedLinks}|null>}
|
|
277
|
+
*/
|
|
278
|
+
/**
 * Build a Tier 1 wiki page from a document source (evidence-first flow):
 *   1. extractSections(text)            — wiki-extract.js
 *   2. extractPaperFacts(sections)      — wiki-facts.js (writes paper_facts, all LLM calls)
 *   3. buildTier1Page(facts)            — generates the page from evidence
 *
 * Returns null when no text is available (e.g. scanned PDF).
 *
 * @param {object} db
 * @param {object} docSource - row from doc_sources
 * @param {string} extractedText - full text (may be empty on stale re-runs)
 * @param {{ allowedSlugs: string[], providers: object }} opts
 * @returns {Promise<{slug, content, strippedLinks}|null>}
 */
async function buildDocWikiPage(db, docSource, extractedText, { allowedSlugs, providers }) {
  // FLAG-5 fix: stale re-runs may pass an empty string — re-extract from disk.
  let text = extractedText;
  if (!text || !text.trim()) {
    const reread = await extractText(docSource.file_path).catch(() => ({ text: '' }));
    text = reread.text || '';
  }
  if (!text || !text.trim()) return null; // scanned PDF or missing file — skip

  // Structured section split, then per-section fact extraction.
  const sections = extractSections(text);
  const facts = await extractPaperFacts(db, docSource, sections, providers);

  return buildTier1Page(db, docSource, facts, { allowedSlugs, providers });
}
|
|
297
|
+
|
|
298
|
+
/**
 * Build a Tier 4 topic-cluster survey page for a set of doc_sources rows.
 *
 * Evidence (comparison matrix, timeline, contradictions, coverage) is gathered
 * with synchronous DB reads first; the single LLM synthesis call happens
 * outside any DB transaction; the write is atomic via writeWikiPageWithChunks.
 *
 * Fix: removed the dead `staleness: 0.0` entry from the pageSpec —
 * writeWikiPageWithChunks destructures a fixed set of keys and never reads it
 * (staleness is reset internally via resetPageStaleness), so the property was
 * silently dropped and only misled readers.
 *
 * @param {object} db
 * @param {object[]} docSourceRows - rows from doc_sources forming the cluster
 * @param {{ allowedSlugs: string[], providers: object, existingClusters?: {slug, memberIds}[] }} opts
 * @returns {Promise<{slug: string, strippedLinks: string[]}|null>} null on LLM failure or empty input
 */
async function buildTopicClusterPage(db, docSourceRows, { allowedSlugs, providers, existingClusters = [] }) {
  if (!docSourceRows || docSourceRows.length === 0) return null;

  const memberIds = docSourceRows.map(r => r.id);
  const memberSlugs = docSourceRows.map(r => r.slug);
  const mHash = membershipHash(memberSlugs);

  // Reuse the slug of an existing cluster with matching membership; otherwise mint one.
  const match = findMatchingCluster(existingClusters, memberIds);
  const clusterSlug = match ? match.slug : 'cluster-' + crypto.randomBytes(4).toString('hex');

  // ── Gather evidence (all sync DB reads, no LLM yet) ──────────────────────
  const matrix = buildComparisonMatrix(db, memberIds);
  const timeline = buildTimeline(db, memberIds);
  const contradictions = detectContradictions(db, memberIds);
  const coverage = buildCoverageReport(db, memberIds);

  // Total facts referenced in this cluster.
  const factsRow = db.prepare(
    `SELECT COUNT(*) as n FROM paper_facts WHERE doc_source_id IN (${memberIds.map(() => '?').join(',')})`
  ).get(...memberIds);
  const totalFacts = factsRow ? factsRow.n : 0;

  // ── LLM synthesis (outside any DB transaction) ───────────────────────────
  const prompt = buildEvidenceClusterPrompt(docSourceRows, {
    matrix, timeline, contradictions, coverage, allowedSlugs,
  });
  const result = await generateWikiContent(prompt, providers, allowedSlugs);
  if (!result) return null;
  const { content, strippedLinks: clusterStrippedLinks } = result;

  const clusterLabel = inferClusterLabel(docSourceRows.map(r => r.title || r.slug));

  writeWikiPageWithChunks(db, {
    slug: clusterSlug,
    title: clusterLabel,
    primary_topic: clusterSlug,
    source_type: 'topic_cluster',
    raw_source_ids: '[]',
    raw_source_count: totalFacts,
    topic_tags: '[]',
    membership_hash: mHash,
    cluster_size: memberIds.length,
  }, content, { docSourceIds: memberIds, role: 'cluster_member' });

  return { slug: clusterSlug, strippedLinks: clusterStrippedLinks || [] };
}
|
|
346
|
+
|
|
347
|
+
/**
 * Derive a human-readable cluster label from member titles: pick the one or
 * two words (>3 chars) that recur across titles, e.g. "deep & learning cluster".
 * Falls back to 'Document Cluster' when no word repeats.
 *
 * Fix: frequencies are counted in a Map instead of a plain object, so words
 * that collide with Object.prototype keys ('constructor', 'valueOf', …) are
 * counted correctly instead of being corrupted by inherited properties and
 * silently dropped from the label.
 *
 * @param {string[]} titles
 * @returns {string}
 */
function inferClusterLabel(titles) {
  const words = titles.flatMap(t => t.toLowerCase().split(/\W+/).filter(w => w.length > 3));
  const freq = new Map();
  for (const w of words) freq.set(w, (freq.get(w) || 0) + 1);
  // Map preserves insertion order and sort is stable, so ties keep first-seen order.
  const top = [...freq.entries()].filter(([, c]) => c > 1).sort((a, b) => b[1] - a[1]).slice(0, 2);
  return top.length ? top.map(([w]) => w).join(' & ') + ' cluster' : 'Document Cluster';
}
|
|
354
|
+
|
|
355
|
+
/**
|
|
356
|
+
* Evidence-based cluster prompt — uses synthesis intermediates from wiki-synthesis.js.
|
|
357
|
+
* Produces a structured Tier 4 survey page (600–1500 words).
|
|
358
|
+
*/
|
|
359
|
+
/**
 * Render the evidence-grounded Tier 4 cluster prompt from the synthesis
 * intermediates produced by wiki-synthesis.js. The resulting prompt asks for
 * a structured survey page (600–1500 words) with eight fixed sections.
 * Every variable-length section is clipped to keep the prompt under ~3000 tokens.
 */
function buildEvidenceClusterPrompt(docSourceRows, { matrix, timeline, contradictions, coverage, allowedSlugs }) {
  // Clip a section to at most `max` chars, marking the cut.
  const clip = (s, max) => (s.length > max ? s.slice(0, max) + '\n...[truncated]' : s);

  const memberLinks = docSourceRows
    .map((r) => {
      const safeTitle = (r.title || r.slug || '').slice(0, 100).replace(/[\r\n]/g, ' ');
      return `- [[${r.slug}]] — ${safeTitle}`;
    })
    .join('\n');

  // Up to 5 contradiction pairs to stay within token budget.
  let contradictionText;
  if (contradictions.length === 0) {
    contradictionText = 'No contradictions detected in current evidence.';
  } else {
    contradictionText = contradictions
      .slice(0, 5)
      .map((c, i) =>
        `${i + 1}. **"${c.factA.subject} ${c.factA.predicate}"** differs:\n` +
        ` - [[${c.slugA}]]: ${c.factA.object}\n` +
        ` - [[${c.slugB}]]: ${c.factB.object}`)
      .join('\n');
  }

  return `You are writing a Tier 4 survey wiki page that synthesizes evidence from ${docSourceRows.length} related academic papers.

Member papers:
${clip(memberLinks, 800)}

## Comparison Matrix (auto-generated)
${clip(matrix, 2000) || '(no result/metric facts available)'}

## Timeline (auto-generated)
${clip(timeline, 1000) || '(no year data available)'}

## Contradictions (auto-detected)
${clip(contradictionText, 600)}

## Coverage Report (auto-generated)
${clip(coverage, 500) || '(no coverage data)'}

Write a survey page with EXACTLY these eight sections in order:
## Scope
## Method Families
## Comparison Matrix
## Timeline
## Agreements
## Contradictions
## Open Questions / Gaps
## Source Papers

Rules:
- For "## Comparison Matrix": reproduce or improve on the auto-generated table above using exact evidence
- For "## Timeline": reproduce or improve the auto-generated timeline
- For "## Contradictions": explain the contradictions above in plain language; write "None detected." if empty
- For "## Agreements": summarize what all papers agree on
- For "## Open Questions / Gaps": derive from the coverage report above — what questions remain unanswered?
- For "## Source Papers": list all members as [[wikilinks]]
- Ground every claim in the evidence above — do not hallucinate
- Use [[wikilink]] syntax when referencing member papers by slug
- 600–1500 words total
- Respond with only the wiki page content`;
}
|
|
420
|
+
|
|
421
|
+
// @deprecated — use buildEvidenceClusterPrompt for evidence-grounded Tier 4 pages
|
|
422
|
+
/**
 * @deprecated Use buildEvidenceClusterPrompt for evidence-grounded Tier 4 pages.
 * Legacy prompt: a short overview page synthesized from member titles/slugs only.
 */
function buildClusterPrompt(titles, slugs) {
  const bulletFor = (slug, i) => {
    const safeTitle = (titles[i] || '').slice(0, 120).replace(/[\r\n]/g, ' ');
    return `- [[${slug}]] — ${safeTitle}`;
  };
  const links = slugs.map(bulletFor).join('\n');

  return `You are writing a wiki overview page that synthesizes multiple related documents.

Member documents:
${links}

Write a concise wiki overview page (150–300 words) that:
- Opens with a paragraph explaining what these documents share in common
- Briefly notes what each document covers (1 sentence each)
- Uses [[wikilink]] syntax when referencing the member documents by slug
- Ends with a "## See Also" section listing all members as [[wikilinks]]

Respond with only the wiki page content.`;
}
|
|
440
|
+
|
|
441
|
+
// Public API: page builders plus the LLM + atomic-DB-write helpers they share.
// buildDocWikiPage / buildTopicClusterPage are consumed by wiki-import.js.
module.exports = { buildWikiPage, buildFallbackWikiContent, generateWikiContent, writeWikiPageWithChunks, buildDocWikiPage, buildTier1Page, buildTopicClusterPage };
|