metame-cli 1.6.0 → 1.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * wiki-facts.js — Paper fact extraction, persistence, and entity registration
5
+ *
6
+ * Single responsibility: turn structured paper sections into atomic facts
7
+ * stored in paper_facts + research_entities tables.
8
+ *
9
+ * Exports:
10
+ * extractPaperFacts(db, docSource, sections, providers, opts)
11
+ * → Promise<fact[]> all facts written to DB for this doc
12
+ *
13
+ * writeFacts(db, docSourceId, facts)
14
+ * → void idempotent batch INSERT (conflict ignore by id)
15
+ *
16
+ * registerEntities(db, facts)
17
+ * → void INSERT OR IGNORE entities inferred from subject/object fields
18
+ *
19
+ * buildTier1Prompt(title, facts)
20
+ * → string LLM prompt for Tier 1 wiki page generation
21
+ */
22
+
23
+ const crypto = require('node:crypto');
24
+
25
// ── Concurrency helper ────────────────────────────────────────────────────────
// Minimal worker-pool semaphore — no external dependencies.
// Runs async task factories with at most `limit` in flight at once.
// Results preserve task order; a rejected task yields { error: message }.
async function withConcurrency(tasks, limit) {
  const results = new Array(tasks.length);
  let cursor = 0;

  // Each worker repeatedly claims the next unclaimed index. The claim
  // (read + increment of `cursor`) happens synchronously before any await,
  // so two workers can never grab the same task.
  const worker = async () => {
    while (cursor < tasks.length) {
      const idx = cursor;
      cursor += 1;
      try {
        results[idx] = await tasks[idx]();
      } catch (err) {
        results[idx] = { error: err.message };
      }
    }
  };

  const poolSize = Math.min(limit, tasks.length);
  const pool = [];
  for (let i = 0; i < poolSize; i += 1) {
    pool.push(worker());
  }
  await Promise.all(pool);
  return results;
}
45
+
46
// ── Section → LLM prompt ──────────────────────────────────────────────────────

// Allowed fact_type values advertised to the LLM (mirrors the DB CHECK enum).
const FACT_TYPES = [
  'problem', 'method', 'claim', 'assumption',
  'dataset', 'metric', 'result', 'baseline',
  'limitation', 'future_work', 'contradiction_note',
];

// Cap on section text sent to the model; longer sections are cut with a marker.
const MAX_SECTION_CHARS = 4000;

/**
 * Build the per-section fact-extraction prompt.
 *
 * @param {string} sectionName - e.g. "method", "results"
 * @param {string} sectionText - raw section body (truncated to 4000 chars)
 * @param {string} paperTitle
 * @returns {string} prompt instructing the LLM to emit a JSON array of facts
 */
function buildSectionFactPrompt(sectionName, sectionText, paperTitle) {
  let truncated = sectionText;
  if (truncated.length > MAX_SECTION_CHARS) {
    truncated = truncated.slice(0, MAX_SECTION_CHARS) + '\n[...truncated]';
  }

  return `You are extracting structured facts from a section of an academic paper.

Paper title: ${paperTitle}
Section: ${sectionName}

Section text:
${truncated}

Extract all atomic, verifiable facts from this section. For each fact output a JSON object with these fields:
- fact_type: one of ${FACT_TYPES.join(', ')}
- subject: the primary entity (model name, method name, system name, etc.)
- predicate: a short verb phrase (achieves, outperforms, requires, assumes, proposes, uses, ...)
- object: what the subject does/has/achieves (metric value, baseline name, dataset name, ...)
- value: numeric value if any (e.g. "0.87")
- unit: unit if any (e.g. "%", "ms", "F1")
- context: conditions under which this holds (e.g. "on FORCE 2020 dataset", "with 5-fold CV")
- evidence_text: exact quote from the section (≤400 characters) that supports this fact
- confidence: 0.0–1.0 reflecting how clearly stated this fact is

Return ONLY a valid JSON array of fact objects. No explanation, no markdown. Empty array [] if no facts found.`;
}
79
+
80
// ── LLM response parser ───────────────────────────────────────────────────────
/**
 * Parse a raw LLM response into sanitized fact objects.
 *
 * Accepts a JSON array, optionally wrapped in a markdown code fence.
 * Facts missing fact_type or evidence_text are dropped; string fields are
 * length-capped; confidence is clamped to [0, 1].
 *
 * @param {string} raw - LLM output
 * @param {string} sectionName - section label attached to every fact
 * @returns {object[]} sanitized facts; [] on any parse failure
 */
function parseFacts(raw, sectionName) {
  if (!raw || typeof raw !== 'string') return [];
  const trimmed = raw.trim();
  // Strip possible markdown code fences
  const jsonStr = trimmed.startsWith('```')
    ? trimmed.replace(/^```[^\n]*\n?/, '').replace(/\n?```$/, '')
    : trimmed;
  try {
    const parsed = JSON.parse(jsonStr);
    if (!Array.isArray(parsed)) return [];
    return parsed
      .filter(f => f && typeof f === 'object' && f.fact_type && f.evidence_text)
      .map(f => ({
        fact_type: String(f.fact_type || 'claim'),
        subject: f.subject ? String(f.subject).slice(0, 200) : null,
        predicate: f.predicate ? String(f.predicate).slice(0, 100) : null,
        object: f.object ? String(f.object).slice(0, 300) : null,
        // Fix: the previous truthiness check (`f.value ?`) silently dropped a
        // legitimate numeric zero ("value": 0) to null. Explicit null/empty
        // check keeps real zero-valued metrics.
        value: f.value != null && f.value !== '' ? String(f.value).slice(0, 50) : null,
        unit: f.unit ? String(f.unit).slice(0, 20) : null,
        context: f.context ? String(f.context).slice(0, 300) : null,
        evidence_text: String(f.evidence_text || '').slice(0, 400),
        section: sectionName,
        // Number.isFinite also rejects NaN, which `typeof === 'number'`
        // let straight through the Math.min/max clamp.
        confidence: Number.isFinite(f.confidence)
          ? Math.min(1, Math.max(0, f.confidence))
          : 0.7,
      }));
  } catch {
    // Unparseable output is treated as "no facts" — the caller already
    // tolerates empty sections.
    return [];
  }
}
111
+
112
// ── DB write helpers ──────────────────────────────────────────────────────────

// Valid fact_type values matching the paper_facts CHECK constraint
const VALID_FACT_TYPES = new Set([
  'problem', 'method', 'claim', 'assumption',
  'dataset', 'metric', 'result', 'baseline',
  'limitation', 'future_work', 'contradiction_note',
]);

/**
 * Idempotent batch insert of facts into paper_facts.
 *
 * The primary key is a deterministic sha256 of (doc_source_id, section,
 * evidence_text), so re-running extraction on the same document produces
 * identical ids and INSERT OR IGNORE dedupes for free.
 *
 * fact_type is validated against the schema CHECK enum before insert;
 * unknown values fall back to 'claim' so one bad row cannot abort the
 * whole transaction.
 *
 * @param {object} db - DatabaseSync instance
 * @param {number} docSourceId
 * @param {object[]} facts - sanitized facts from parseFacts()
 */
function writeFacts(db, docSourceId, facts) {
  if (facts.length === 0) return;

  const insertStmt = db.prepare(`
    INSERT OR IGNORE INTO paper_facts
    (id, doc_source_id, fact_type, subject, predicate, object,
     value, unit, context, evidence_text, section,
     extraction_source, confidence)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'pdf_llm_section', ?)
  `);
  const begin = db.prepare('BEGIN');
  const commit = db.prepare('COMMIT');
  const rollback = db.prepare('ROLLBACK');

  // Deterministic id for dedupe on re-run.
  const factId = (f) => {
    const seed = `${docSourceId}:${f.section || ''}:${f.evidence_text || ''}`;
    return 'pf_' + crypto.createHash('sha256').update(seed).digest('hex').slice(0, 16);
  };

  begin.run();
  try {
    for (const f of facts) {
      const factType = VALID_FACT_TYPES.has(f.fact_type) ? f.fact_type : 'claim';
      insertStmt.run(
        factId(f), docSourceId, factType,
        f.subject, f.predicate, f.object,
        f.value, f.unit, f.context,
        f.evidence_text, f.section, f.confidence,
      );
    }
    commit.run();
  } catch (err) {
    try { rollback.run(); } catch { /* ignore */ }
    throw err;
  }
}
163
+
164
// Simple entity_type inference from fact fields.
// Checked in insertion order; first matching pattern wins.
const ENTITY_HINTS = {
  dataset: /\b(dataset|corpus|benchmark|collection)\b/i,
  metric: /\b(accuracy|f1|precision|recall|auc|mse|rmse|bleu|rouge|map|ndcg)\b/i,
  method_family: /\b(transformer|cnn|rnn|lstm|gru|bert|gpt|attention|svm|xgboost|random.?forest)\b/i,
  problem: /\b(classification|regression|detection|segmentation|prediction|recognition)\b/i,
};

// Map an entity name to a coarse type; 'concept' when nothing matches.
function inferEntityType(text) {
  if (!text) return 'concept';
  const hit = Object.entries(ENTITY_HINTS).find(([, re]) => re.test(text));
  return hit ? hit[0] : 'concept';
}

/**
 * Register unique entities inferred from subject/object fields.
 * INSERT OR IGNORE — safe to call multiple times (the random id is never
 * the dedupe key; presumably `name` carries a unique constraint — confirm
 * against the research_entities schema).
 *
 * @param {object} db - DatabaseSync instance
 * @param {object[]} facts
 */
function registerEntities(db, facts) {
  // Collect unique subject/object strings of plausible entity length.
  // Set preserves first-seen order, matching the original insert order.
  const names = new Set();
  for (const f of facts) {
    for (const field of [f.subject, f.object]) {
      if (field && field.length >= 2 && field.length <= 100) {
        names.add(field);
      }
    }
  }
  if (names.size === 0) return;

  const insertStmt = db.prepare(`
    INSERT OR IGNORE INTO research_entities (id, entity_type, name)
    VALUES (?, ?, ?)
  `);
  const begin = db.prepare('BEGIN');
  const commit = db.prepare('COMMIT');
  const rollback = db.prepare('ROLLBACK');

  begin.run();
  try {
    for (const name of names) {
      const id = 'ent_' + crypto.randomBytes(6).toString('hex');
      insertStmt.run(id, inferEntityType(name), name);
    }
    commit.run();
  } catch (err) {
    try { rollback.run(); } catch { /* ignore */ }
    throw err;
  }
}
217
+
218
// ── Main extraction entry point ───────────────────────────────────────────────

/**
 * Extract structured facts from paper sections using per-section LLM calls.
 * Writes results to paper_facts and research_entities tables.
 *
 * @param {object} db - DatabaseSync instance
 * @param {{ id: number, title: string, slug: string }} docSource
 * @param {{ abstract, introduction, method, experiments, results,
 *           discussion, conclusion, _fallback: boolean }} sections
 * @param {{ callHaiku: Function, buildDistillEnv: Function }} providers
 * @param {{ concurrency?: number }} opts
 * @returns {Promise<object[]>} all facts written to DB
 */
async function extractPaperFacts(db, docSource, sections, providers, { concurrency = 3 } = {}) {
  const { callHaiku, buildDistillEnv } = providers;
  const title = docSource.title || docSource.slug;

  // Skip references and the plain-text fallback key; ignore tiny sections.
  const SKIP_SECTIONS = new Set(['references', '_fallback']);
  const MIN_SECTION_LEN = 100;

  const isUsable = ([key, text]) =>
    !SKIP_SECTIONS.has(key) &&
    typeof text === 'string' &&
    text.trim().length >= MIN_SECTION_LEN;

  // One task factory per usable section; a failed LLM call yields [] so a
  // single bad section never aborts the whole document.
  const toTask = ([sectionName, sectionText]) => async () => {
    const prompt = buildSectionFactPrompt(sectionName, sectionText, title);
    let raw;
    try {
      const env = buildDistillEnv();
      raw = await callHaiku(prompt, env, 60000, { model: 'sonnet' });
    } catch {
      return [];
    }
    return parseFacts(raw, sectionName);
  };

  const tasks = Object.entries(sections).filter(isUsable).map(toTask);
  if (tasks.length === 0) return [];

  // All LLM calls run here, with a concurrency cap and OUTSIDE any DB
  // transaction; DB writes happen only after every call has settled.
  const perSection = await withConcurrency(tasks, concurrency);
  const allFacts = perSection.flat().filter(f => f && !f.error);
  if (allFacts.length === 0) return [];

  writeFacts(db, docSource.id, allFacts);
  registerEntities(db, allFacts);

  return allFacts;
}
272
+
273
// ── Tier 1 wiki prompt builder ────────────────────────────────────────────────

/**
 * Build a prompt for generating a Tier 1 wiki page from extracted facts.
 * The page uses a fixed 7-section structure for downstream synthesis.
 *
 * @param {string} title
 * @param {object[]} facts - from paper_facts table
 * @returns {string}
 */
function buildTier1Prompt(title, facts) {
  // Bucket facts by fact_type once so each section render is a cheap lookup.
  const byType = new Map();
  for (const fact of facts) {
    const bucket = byType.get(fact.fact_type);
    if (bucket) bucket.push(fact);
    else byType.set(fact.fact_type, [fact]);
  }

  // Render up to 12 facts drawn from the given types, in type order.
  const renderFacts = (types) => {
    const lines = [];
    for (const t of types) {
      for (const f of byType.get(t) || []) {
        if (lines.length >= 12) return lines.join('\n');
        const parts = [f.subject, f.predicate, f.object].filter(Boolean).join(' ');
        const ctx = f.context ? ` (${f.context})` : '';
        const ev = f.evidence_text ? `\n  Evidence: "${f.evidence_text}"` : '';
        lines.push(`- ${parts}${ctx}${ev}`);
      }
    }
    return lines.join('\n');
  };

  const problemFacts = renderFacts(['problem', 'assumption']);
  const methodFacts = renderFacts(['method', 'claim']);
  const resultFacts = renderFacts(['result', 'metric', 'baseline']);
  const datasetFacts = renderFacts(['dataset']);
  const limitFacts = renderFacts(['limitation', 'future_work']);
  const allFactCount = facts.length;

  return `You are writing a Tier 1 wiki page for an academic paper knowledge base.

Paper: ${title}
Total extracted facts: ${allFactCount}

Extracted evidence:

## Problems / Assumptions
${problemFacts || '(none extracted)'}

## Methods / Claims
${methodFacts || '(none extracted)'}

## Results / Metrics / Baselines
${resultFacts || '(none extracted)'}

## Datasets
${datasetFacts || '(none extracted)'}

## Limitations
${limitFacts || '(none extracted)'}

Write a wiki page with EXACTLY these seven sections in order:
## Summary
## Problem Addressed
## Method
## Key Results
## Datasets Used
## Limitations
## Relation to This Project

Rules:
- Ground every claim in the extracted evidence above
- Include specific numbers, model names, and dataset names when available
- "Relation to This Project" should note methodological connections and potential challenges — leave placeholder text "[To be filled by paper-reader-lab]" if unknown
- Use [[wikilink]] syntax for concepts that deserve their own pages
- 300–600 words total
- Respond with only the wiki page content`;
}
350
+
351
+ module.exports = { extractPaperFacts, writeFacts, registerEntities, buildTier1Prompt };
@@ -0,0 +1,256 @@
1
+ 'use strict';
2
+
3
+ const fs = require('node:fs');
4
+ const path = require('node:path');
5
+
6
+ const { extractText, slugFromFilename, sha256 } = require('./wiki-extract');
7
+ const { buildConnectedComponents, getDocEmbeddings } = require('./wiki-cluster');
8
+ const { buildDocWikiPage, buildTopicClusterPage } = require('./wiki-reflect-build');
9
+ const {
10
+ upsertDocSource, getDocSourceByPath, listStaleDocSources,
11
+ markDocSourcesMissing, getClusterMemberIds,
12
+ listClusterPages,
13
+ } = require('./core/wiki-db');
14
+
15
// File types the extractor understands.
const SUPPORTED_EXTS = new Set(['.md', '.txt', '.pdf']);
// Embedding-drain polling cadence and overall cap.
const DRAIN_POLL_MS = 5000;
const DRAIN_TIMEOUT_MS = 5 * 60 * 1000;

/**
 * Resolve an input path to the list of importable files (realpaths).
 * A single file returns [file] when its extension is supported, else [].
 * A directory is scanned one level deep (non-recursive).
 * NOTE(review): directory entries are filtered by extension only — an entry
 * named like "x.md" that is itself a directory would slip through; confirm
 * callers only pass flat document folders.
 *
 * @param {string} inputPath - file or directory
 * @returns {string[]} absolute real paths of supported files
 */
function scanFiles(inputPath) {
  const real = fs.realpathSync(inputPath);
  if (fs.statSync(real).isFile()) {
    return SUPPORTED_EXTS.has(path.extname(real).toLowerCase()) ? [real] : [];
  }
  const supported = fs.readdirSync(real)
    .filter((name) => SUPPORTED_EXTS.has(path.extname(name).toLowerCase()));
  return supported.map((name) => fs.realpathSync(path.join(real, name)));
}
29
+
30
/**
 * Return `base` if free, otherwise the first of `base-2`, `base-3`, ...
 * not already used by a wiki page or a doc source.
 *
 * @param {object} db - DatabaseSync instance
 * @param {string} base - candidate slug
 * @returns {string} unique slug
 */
function generateUniqueSlug(db, base) {
  const wikiTaken = db.prepare('SELECT 1 FROM wiki_pages WHERE slug=?');
  const docTaken = db.prepare('SELECT 1 FROM doc_sources WHERE slug=?');
  const inUse = (slug) => Boolean(wikiTaken.get(slug) || docTaken.get(slug));

  let suffix = 2;
  let candidate = base;
  while (inUse(candidate)) {
    candidate = `${base}-${suffix}`;
    suffix += 1;
  }
  return candidate;
}
40
+
41
/**
 * Poll embedding_queue until none of `chunkIds` remain queued, or until
 * DRAIN_TIMEOUT_MS elapses.
 *
 * @param {object} db - DatabaseSync instance
 * @param {(string|number)[]} chunkIds - chunk ids to wait on
 * @param {Function} [log] - progress sink
 * @returns {Promise<boolean>} true when drained (or nothing to wait for),
 *   false on timeout
 */
async function waitForEmbeddingDrain(db, chunkIds, log = () => {}) {
  // Nothing queued → nothing to wait for.
  if (chunkIds.length === 0) return true;

  const placeholders = chunkIds.map(() => '?').join(',');
  const pendingCount = db.prepare(
    `SELECT COUNT(*) as cnt FROM embedding_queue WHERE item_type='chunk' AND item_id IN (${placeholders})`
  );

  const deadline = Date.now() + DRAIN_TIMEOUT_MS;
  while (Date.now() < deadline) {
    const { cnt } = pendingCount.get(...chunkIds);
    if (cnt === 0) return true;
    log(`[wiki-import] waiting for ${cnt} embeddings to drain...`);
    await new Promise((resolve) => setTimeout(resolve, DRAIN_POLL_MS));
  }
  log('[wiki-import] WARNING: embedding drain timed out, skipping clustering');
  return false;
}
57
+
58
/**
 * Import files into the wiki knowledge base in four sequential phases.
 *
 * Phase 0 — extract text, hash, and upsert one doc_sources row per file;
 *           files with unchanged mtime+size that are not content_stale are
 *           skipped. Per-file errors are logged, never thrown.
 * Phase 1 — build a Tier 1 wiki page for each stale doc source.
 * Phase 2 — mark Tier 2 cluster pages containing rebuilt docs as stale.
 * Phase 3 — wait for chunk embeddings to drain, then rebuild topic-cluster
 *           pages (skipped entirely when noCluster is true).
 *
 * @param {object} db - DatabaseSync instance
 * @param {string} inputPath - file or directory to import
 * @param {{ providers?: object, noCluster?: boolean, log?: Function }} [opts]
 * @returns {Promise<{imported:number, skipped:number, failed:number, clusters:number}>}
 */
async function runWikiImport(db, inputPath, { providers, noCluster = false, log = () => {} } = {}) {
  if (!fs.existsSync(inputPath)) {
    log(`[wiki-import] ERROR: path does not exist: ${inputPath}`);
    return { imported: 0, skipped: 0, failed: 0, clusters: 0 };
  }

  log(`[wiki-import] scanning: ${inputPath}`);
  const files = scanFiles(inputPath);
  log(`[wiki-import] found ${files.length} supported files`);

  const seenPaths = [];
  const stats = { imported: 0, skipped: 0, failed: 0, clusters: 0 };
  // Cache extracted text keyed by path so Phase 1 need not re-extract.
  const extractedTexts = new Map();

  // Phase 0: Extract + hash check + upsert doc_sources
  for (const filePath of files) {
    seenPaths.push(filePath);
    try {
      const stat = fs.statSync(filePath);
      const existing = getDocSourceByPath(db, filePath);

      // Unchanged file AND not flagged stale → nothing to do.
      // A content_stale row falls through to re-extraction even if unchanged.
      if (existing && existing.mtime_ms === stat.mtimeMs && existing.size_bytes === stat.size) {
        if (!existing.content_stale) { stats.skipped++; continue; }
      }

      const { text, title, extractor, extractStatus, errorMessage } = await extractText(filePath);
      if (text) extractedTexts.set(filePath, text);
      const fileHash = sha256(fs.readFileSync(filePath));
      const extractedTextHash = text ? sha256(text) : null;
      const baseSlug = slugFromFilename(filePath);
      // Keep the existing slug stable across re-imports; only new docs get
      // a fresh (collision-free) slug.
      const slug = existing ? existing.slug : generateUniqueSlug(db, baseSlug);

      upsertDocSource(db, {
        filePath, fileHash,
        mtimeMs: stat.mtimeMs, sizeBytes: stat.size,
        extractedTextHash, fileType: path.extname(filePath).slice(1).toLowerCase(),
        extractor, extractStatus, title, slug,
      });

      if (extractStatus !== 'ok') {
        log(`[wiki-import] SKIP ${path.basename(filePath)}: ${errorMessage || extractStatus}`);
      }
    } catch (err) {
      // Per-file isolation: one unreadable file must not abort the batch.
      log(`[wiki-import] Phase 0 error for ${path.basename(filePath)}: ${err.message}`);
    }
  }

  // Flag doc_sources rows whose files were not seen in this scan.
  markDocSourcesMissing(db, seenPaths);

  // Phase 1: Build Tier 1 pages for stale docs
  const stale = listStaleDocSources(db);
  const builtSlugs = [];
  const allChunkIds = [];

  for (const docSrc of stale) {
    if (docSrc.extract_status !== 'ok') {
      // Clear the stale flag so broken docs are not retried every run.
      db.prepare("UPDATE doc_sources SET content_stale=0 WHERE id=?").run(docSrc.id);
      continue;
    }
    try {
      // NOTE(review): a stale doc whose file was NOT re-extracted this run
      // gets text='' here — confirm buildDocWikiPage tolerates empty text.
      const text = extractedTexts.get(docSrc.file_path) || '';
      // Wikilink targets are restricted to slugs from the current scan.
      const allowedSlugs = files.map(f => slugFromFilename(f));
      const result = await buildDocWikiPage(db, docSrc, text, { allowedSlugs, providers });
      if (result) {
        db.prepare("UPDATE doc_sources SET content_stale=0, built_at=? WHERE id=?")
          .run(new Date().toISOString(), docSrc.id);
        builtSlugs.push(docSrc.slug);
        // Collect the page's chunk ids so Phase 3 can wait on their embeddings.
        const chunks = db.prepare("SELECT id FROM content_chunks WHERE page_slug=?").all(docSrc.slug);
        allChunkIds.push(...chunks.map(c => c.id));
        stats.imported++;
        log(`[wiki-import] built: ${docSrc.slug}`);
      }
    } catch (err) {
      db.prepare("UPDATE doc_sources SET error_message=? WHERE id=?").run(err.message, docSrc.id);
      stats.failed++;
      log(`[wiki-import] FAILED ${docSrc.slug}: ${err.message}`);
    }
  }

  // Phase 2: Cascade stale cluster pages
  // Any Tier 2 page that includes a rebuilt doc as a cluster member must be
  // re-synthesized; mark it stale rather than rebuilding inline.
  if (builtSlugs.length > 0) {
    const affected = db.prepare(`
      SELECT DISTINCT page_slug FROM wiki_page_doc_sources
      WHERE role='cluster_member'
      AND doc_source_id IN (SELECT id FROM doc_sources WHERE slug IN (${builtSlugs.map(() => '?').join(',')}))
    `).all(...builtSlugs).map(r => r.page_slug);
    if (affected.length > 0) {
      const ph = affected.map(() => '?').join(',');
      db.prepare(`UPDATE wiki_pages SET staleness=1 WHERE slug IN (${ph})`).run(...affected);
    }
  }

  // Phase 3: Clustering (Tier 2)
  if (!noCluster) {
    // Clustering needs the new chunks embedded; on timeout we skip rather
    // than cluster on partial vectors.
    const drained = await waitForEmbeddingDrain(db, allChunkIds, log);
    if (drained) {
      const allDocSlugs = db.prepare("SELECT slug FROM doc_sources WHERE status='active' AND extract_status='ok'").all().map(r => r.slug);
      const embeddings = getDocEmbeddings(db, allDocSlugs);

      // embeddings is Array<{ slug, vector: Float32Array }>, use .length
      if (embeddings.length >= 3) {
        const clusters = buildConnectedComponents(embeddings, { threshold: 0.75, minSize: 3 });
        // Pass current cluster membership so the builder can reuse/merge
        // existing cluster pages instead of always creating new ones.
        const existingClusters = listClusterPages(db).map(cp => ({
          slug: cp.slug,
          memberIds: getClusterMemberIds(db, cp.slug),
        }));

        const getDocBySlug = db.prepare("SELECT * FROM doc_sources WHERE slug=?");
        for (const memberSlugs of clusters) {
          const docRows = memberSlugs.map(s => getDocBySlug.get(s)).filter(Boolean);
          try {
            // buildTopicClusterPage returns { slug, strippedLinks } or null
            const clusterResult = await buildTopicClusterPage(db, docRows, {
              allowedSlugs: allDocSlugs, providers, existingClusters,
            });
            if (clusterResult) {
              stats.clusters++;
              log(`[wiki-import] cluster: ${clusterResult.slug}`);
            }
          } catch (err) {
            // One failed cluster page does not stop the others.
            log(`[wiki-import] cluster FAILED: ${err.message}`);
          }
        }
      } else {
        log('[wiki-import] not enough embedded docs for clustering yet');
      }
    }
  }

  log(`[wiki-import] done — imported: ${stats.imported}, skipped: ${stats.skipped}, failed: ${stats.failed}, clusters: ${stats.clusters}`);
  return stats;
}
190
+
191
+ module.exports = { runWikiImport, scanFiles, generateUniqueSlug, waitForEmbeddingDrain };
192
+
193
// ── CLI entry point ──────────────────────────────────────────────────────────
// Usage: node wiki-import.js <path> [--no-cluster]
//
//   <path>         File or directory to import (.pdf / .md / .txt)
//   --no-cluster   Skip Tier 2 clustering step
//
// Output: JSON on stdout, progress on stderr
//   {"imported":3,"skipped":1,"failed":0,"clusters":1}
//
// Exit codes: 0 on success/help, 1 on bad args, DB-open failure, or a
// runWikiImport error.
if (require.main === module) {
  (async () => {
    // Requires are deferred into the CLI branch so merely require()-ing this
    // module as a library does not load the DB or provider stack.
    const { DatabaseSync } = require('node:sqlite');
    const { DB_PATH } = require('./memory.js');
    const { applyWikiSchema } = require('./memory-wiki-schema.js');
    const { callHaiku, buildDistillEnv } = require('./providers.js');

    const args = process.argv.slice(2);
    if (args.length === 0 || args[0] === '--help' || args[0] === '-h') {
      process.stderr.write([
        'Usage: node wiki-import.js <path> [--no-cluster]',
        '',
        '  <path>         File or directory (.pdf / .md / .txt)',
        '  --no-cluster   Skip Tier 2 embedding-based clustering',
        '',
        'Examples:',
        '  node ~/.metame/wiki-import.js ~/papers/',
        '  node ~/.metame/wiki-import.js ~/papers/paper.pdf',
      ].join('\n') + '\n');
      process.exit(0);
    }

    // First non-flag argument is the input path; flags may appear anywhere.
    const inputPath = args.find(a => !a.startsWith('--'));
    const noCluster = args.includes('--no-cluster');

    if (!inputPath) {
      process.stderr.write('Error: path argument required\n');
      process.exit(1);
    }

    let db;
    try {
      db = new DatabaseSync(DB_PATH);
      // Idempotent: creates/migrates wiki tables as needed.
      applyWikiSchema(db);
    } catch (err) {
      process.stderr.write(`Error: failed to open DB at ${DB_PATH}: ${err.message}\n`);
      process.exit(1);
    }

    try {
      const stats = await runWikiImport(db, inputPath, {
        providers: { callHaiku, buildDistillEnv },
        noCluster,
        // Progress goes to stderr so stdout stays machine-parseable JSON.
        log: (msg) => process.stderr.write(msg + '\n'),
      });
      process.stdout.write(JSON.stringify(stats) + '\n');
      // NOTE(review): process.exit() terminates immediately, so the finally
      // block below never runs on these paths and db.close() is skipped —
      // harmless for a read/write sqlite handle at process end, but confirm.
      process.exit(0);
    } catch (err) {
      process.stderr.write(`Error: ${err.message}\n`);
      process.exit(1);
    } finally {
      try { db.close(); } catch { /* ignore */ }
    }
  })();
}