metame-cli 1.5.26 → 1.6.1
This diff shows the published contents of these package versions as they appear in their public registries; it is provided for informational purposes only.
- package/index.js +4 -1
- package/package.json +1 -1
- package/scripts/agent-layer.js +36 -0
- package/scripts/core/chunker.js +100 -0
- package/scripts/core/embedding.js +225 -0
- package/scripts/core/hybrid-search.js +296 -0
- package/scripts/core/wiki-db.js +545 -0
- package/scripts/core/wiki-prompt.js +88 -0
- package/scripts/core/wiki-slug.js +66 -0
- package/scripts/core/wiki-staleness.js +18 -0
- package/scripts/daemon-agent-commands.js +10 -4
- package/scripts/daemon-bridges.js +16 -0
- package/scripts/daemon-claude-engine.js +62 -8
- package/scripts/daemon-command-router.js +40 -1
- package/scripts/daemon-default.yaml +33 -3
- package/scripts/daemon-embedding.js +162 -0
- package/scripts/daemon-engine-runtime.js +1 -1
- package/scripts/daemon-health-scan.js +185 -0
- package/scripts/daemon-ops-commands.js +9 -18
- package/scripts/daemon-runtime-lifecycle.js +1 -1
- package/scripts/daemon-session-commands.js +4 -0
- package/scripts/daemon-task-scheduler.js +5 -3
- package/scripts/daemon-warm-pool.js +15 -0
- package/scripts/daemon-wiki.js +420 -0
- package/scripts/daemon.js +10 -5
- package/scripts/distill.js +1 -1
- package/scripts/docs/file-transfer.md +0 -1
- package/scripts/docs/maintenance-manual.md +2 -55
- package/scripts/docs/pointer-map.md +0 -34
- package/scripts/feishu-adapter.js +25 -0
- package/scripts/hooks/intent-file-transfer.js +1 -2
- package/scripts/memory-backfill-chunks.js +92 -0
- package/scripts/memory-search.js +49 -6
- package/scripts/memory-wiki-schema.js +255 -0
- package/scripts/memory.js +103 -3
- package/scripts/signal-capture.js +1 -1
- package/scripts/skill-evolution.js +2 -11
- package/scripts/wiki-cluster.js +121 -0
- package/scripts/wiki-extract.js +171 -0
- package/scripts/wiki-facts.js +351 -0
- package/scripts/wiki-import.js +256 -0
- package/scripts/wiki-reflect-build.js +441 -0
- package/scripts/wiki-reflect-export.js +448 -0
- package/scripts/wiki-reflect-query.js +109 -0
- package/scripts/wiki-reflect.js +338 -0
- package/scripts/wiki-synthesis.js +224 -0
package/scripts/wiki-cluster.js
@@ -0,0 +1,121 @@
+'use strict';
+
+const crypto = require('node:crypto');
+
+function cosineSimilarity(a, b) {
+  if (a.length !== b.length) return 0;
+  let dot = 0, normA = 0, normB = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i];
+    normA += a[i] * a[i];
+    normB += b[i] * b[i];
+  }
+  const denom = Math.sqrt(normA) * Math.sqrt(normB);
+  return denom === 0 ? 0 : dot / denom;
+}
+
+function membershipHash(slugs) {
+  const sorted = [...slugs].sort().join(',');
+  return crypto.createHash('sha256').update(sorted).digest('hex');
+}
+
+function jaccardOverlap(setA, setB) {
+  const a = new Set(setA);
+  const b = new Set(setB);
+  let intersection = 0;
+  for (const x of a) if (b.has(x)) intersection++;
+  const union = a.size + b.size - intersection;
+  return union === 0 ? 0 : intersection / union;
+}
+
+/**
+ * Find existing cluster with Jaccard overlap > 0.5 with newMemberIds.
+ * Tie-break: prefer larger stored cluster.
+ */
+function findMatchingCluster(existingClusters, newMemberIds) {
+  let best = null;
+  let bestScore = 0.5; // strict threshold: must exceed 0.5
+  for (const cluster of existingClusters) {
+    const score = jaccardOverlap(cluster.memberIds, newMemberIds);
+    if (score > bestScore) {
+      best = cluster;
+      bestScore = score;
+    } else if (score === bestScore && best && cluster.memberIds.length > best.memberIds.length) {
+      best = cluster;
+    }
+  }
+  return best;
+}
+
+/**
+ * Build connected components from embeddings using cosine similarity threshold.
+ * Uses union-find.
+ * @param {Array<{ slug: string, vector: Float32Array|number[] }>} embeddings
+ * @param {{ threshold?: number, minSize?: number }} options
+ */
+function buildConnectedComponents(embeddings, { threshold = 0.75, minSize = 3 } = {}) {
+  const n = embeddings.length;
+  const slugs = embeddings.map(e => e.slug);
+  const parent = Object.fromEntries(slugs.map(s => [s, s]));
+
+  function find(x) {
+    if (parent[x] !== x) parent[x] = find(parent[x]);
+    return parent[x];
+  }
+  function union(x, y) { parent[find(x)] = find(y); }
+
+  for (let i = 0; i < n; i++) {
+    for (let j = i + 1; j < n; j++) {
+      if (cosineSimilarity(embeddings[i].vector, embeddings[j].vector) >= threshold) {
+        union(slugs[i], slugs[j]);
+      }
+    }
+  }
+
+  const groups = {};
+  for (const s of slugs) {
+    const root = find(s);
+    if (!groups[root]) groups[root] = [];
+    groups[root].push(s);
+  }
+
+  return Object.values(groups).filter(g => g.length >= minSize);
+}
+
+/**
+ * Fetch doc-level embeddings from content_chunks by averaging all chunk embeddings per page.
+ * @param {object} db - node:sqlite DatabaseSync instance
+ * @param {string[]} slugs
+ * @returns {Array<{ slug: string, vector: Float32Array }>} one entry per slug that has embeddings
+ */
+function getDocEmbeddings(db, slugs) {
+  if (slugs.length === 0) return [];
+  const placeholders = ',?'.repeat(slugs.length).slice(1);
+  const rows = db.prepare(
+    `SELECT page_slug, embedding FROM content_chunks WHERE page_slug IN (${placeholders}) AND embedding IS NOT NULL`
+  ).all(...slugs);
+
+  // Group rows by slug
+  const bySlug = {};
+  for (const row of rows) {
+    if (!bySlug[row.page_slug]) bySlug[row.page_slug] = [];
+    const buf = Buffer.isBuffer(row.embedding) ? row.embedding : Buffer.from(row.embedding);
+    if (buf.byteLength % 4 !== 0) continue; // skip corrupt/truncated embedding row
+    bySlug[row.page_slug].push(new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4));
+  }
+
+  // Average chunk vectors per slug
+  const result = [];
+  for (const [slug, vecs] of Object.entries(bySlug)) {
+    const dim = vecs[0].length;
+    const avg = new Float32Array(dim);
+    for (const v of vecs) {
+      for (let i = 0; i < dim; i++) avg[i] += v[i] / vecs.length;
+    }
+    result.push({ slug, vector: avg });
+  }
+  return result;
+}
+
+module.exports = { cosineSimilarity, buildConnectedComponents, jaccardOverlap,
+  findMatchingCluster, membershipHash, getDocEmbeddings };
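For orientation, a minimal driver sketch for the new wiki-cluster.js module (not part of the package): the database path, slug list, and empty stored-cluster list are placeholders, and it assumes a wiki DB whose content_chunks table is already populated. Signatures follow the hunk above.

// Hypothetical driver for wiki-cluster.js; db path, slugs, and the stored list are illustrative.
const { DatabaseSync } = require('node:sqlite');
const {
  getDocEmbeddings, buildConnectedComponents, findMatchingCluster, membershipHash,
} = require('./wiki-cluster');

const db = new DatabaseSync('/tmp/wiki.db'); // assumed DB location
const slugs = ['page-a', 'page-b', 'page-c', 'page-d'];

// One averaged vector per page, then group pages whose pairwise cosine
// similarity clears the default 0.75 threshold (components of size >= 3).
const embeddings = getDocEmbeddings(db, slugs);
const components = buildConnectedComponents(embeddings);

for (const memberIds of components) {
  // Reuse a stored cluster only when Jaccard overlap strictly exceeds 0.5.
  const stored = []; // in real use: loaded from a clusters table
  const match = findMatchingCluster(stored, memberIds);
  console.log(match ? 'reuse existing cluster' : `new cluster ${membershipHash(memberIds)}`);
}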
package/scripts/wiki-extract.js
@@ -0,0 +1,171 @@
+'use strict';
+
+const fs = require('node:fs');
+const path = require('node:path');
+const crypto = require('node:crypto');
+const { execFile } = require('node:child_process');
+const { promisify } = require('node:util');
+const execFileAsync = promisify(execFile);
+
+function slugFromFilename(filePath) {
+  const base = path.basename(filePath, path.extname(filePath));
+  return base
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+|-+$/g, '');
+}
+
+function extractMarkdownTitle(text) {
+  const m = text.match(/^#\s+(.+)$/m);
+  return m ? m[1].trim() : null;
+}
+
+async function extractText(filePath) {
+  const ext = path.extname(filePath).toLowerCase();
+
+  if (ext === '.md' || ext === '.txt') {
+    try {
+      const text = fs.readFileSync(filePath, 'utf8');
+      const title = ext === '.md' ? extractMarkdownTitle(text) : null;
+      return { text, title, extractor: 'direct', extractStatus: 'ok' };
+    } catch (err) {
+      return { text: '', title: null, extractor: 'direct', extractStatus: 'error', errorMessage: err.message };
+    }
+  }
+
+  if (ext === '.pdf') {
+    return extractPdf(filePath);
+  }
+
+  return {
+    text: '', title: null, extractor: 'unknown', extractStatus: 'error',
+    errorMessage: `Unsupported file type: ${ext}`,
+  };
+}
+
+async function extractPdf(filePath) {
+  const hasPdftotext = await checkCommand('pdftotext');
+
+  if (hasPdftotext) {
+    try {
+      const { stdout } = await execFileAsync('pdftotext', [filePath, '-'], { maxBuffer: 10 * 1024 * 1024 });
+      if (!stdout.trim()) {
+        return {
+          text: '', title: null, extractor: 'pdftotext', extractStatus: 'empty_or_scanned',
+          errorMessage: 'PDF produced no text — may be a scanned image. Install OCR for support.',
+        };
+      }
+      return { text: stdout, title: null, extractor: 'pdftotext', extractStatus: 'ok' };
+    } catch {
+      // fall through to pdf-parse
+    }
+  }
+
+  // Fallback: pdf-parse
+  try {
+    const pdfParse = require('pdf-parse');
+    const buf = fs.readFileSync(filePath);
+    const data = await pdfParse(buf);
+    if (!data.text.trim()) {
+      return {
+        text: '', title: null, extractor: 'pdf-parse', extractStatus: 'empty_or_scanned',
+        errorMessage: 'PDF produced no text — may be a scanned image.',
+      };
+    }
+    return { text: data.text, title: null, extractor: 'pdf-parse', extractStatus: 'ok' };
+  } catch (err) {
+    const hint = hasPdftotext ? '' : ' Install poppler for better PDF support: brew install poppler';
+    return {
+      text: '', title: null, extractor: 'pdf-parse', extractStatus: 'error',
+      errorMessage: err.message + hint,
+    };
+  }
+}
+
+/**
+ * Parse flat paper text into named sections.
+ *
+ * @param {string} text — raw extracted text (e.g. from pdftotext)
+ * @returns {{
+ *   abstract: string, introduction: string, method: string,
+ *   experiments: string, results: string, discussion: string,
+ *   conclusion: string, references: string, _fallback: boolean
+ * }}
+ * _fallback is true when fewer than 2 section headers were found;
+ * in that case the text is split into three equal chunks and returned
+ * under 'introduction', 'method', 'results' to guarantee non-empty input
+ * for downstream fact extraction.
+ */
+function extractSections(text) {
+  // Map from canonical key → regex patterns (case-insensitive, optional number prefix)
+  const PATTERNS = {
+    abstract: /^(?:\d+[\.\s]+)?(?:abstract)\s*$/i,
+    introduction: /^(?:\d+[\.\s]+)?(?:introduction|background|overview)\s*$/i,
+    method: /^(?:\d+[\.\s]+)?(?:method(?:s|ology)?|approach|proposed\s+method|framework|model|architecture)\s*$/i,
+    experiments: /^(?:\d+[\.\s]+)?(?:experiments?|experimental\s+(?:setup|design)|evaluation|setup)\s*$/i,
+    results: /^(?:\d+[\.\s]+)?(?:results?|findings|performance)\s*$/i,
+    discussion: /^(?:\d+[\.\s]+)?(?:discussion|analysis|ablation)\s*$/i,
+    conclusion: /^(?:\d+[\.\s]+)?(?:conclusions?|summary|future\s+work)\s*$/i,
+    references: /^(?:\d+[\.\s]+)?(?:references|bibliography)\s*$/i,
+  };
+
+  const lines = text.split('\n');
+  const hits = []; // { key, lineIdx }
+
+  for (let i = 0; i < lines.length; i++) {
+    const trimmed = lines[i].trim();
+    if (!trimmed || trimmed.length > 80) continue; // section titles are short
+    for (const [key, re] of Object.entries(PATTERNS)) {
+      if (re.test(trimmed)) {
+        hits.push({ key, lineIdx: i });
+        break;
+      }
+    }
+  }
+
+  // Fallback: fewer than 2 distinct headers detected
+  if (hits.length < 2) {
+    const third = Math.floor(lines.length / 3);
+    return {
+      abstract: '',
+      introduction: lines.slice(0, third).join('\n'),
+      method: lines.slice(third, 2 * third).join('\n'),
+      experiments: '',
+      results: lines.slice(2 * third).join('\n'),
+      discussion: '',
+      conclusion: '',
+      references: '',
+      _fallback: true,
+    };
+  }
+
+  // Build sections from hits
+  const out = { abstract: '', introduction: '', method: '', experiments: '',
+    results: '', discussion: '', conclusion: '', references: '', _fallback: false };
+
+  for (let h = 0; h < hits.length; h++) {
+    const { key, lineIdx } = hits[h];
+    const endLine = h + 1 < hits.length ? hits[h + 1].lineIdx : lines.length;
+    // Deduplicate: keep the longest slice if same key appears twice
+    const chunk = lines.slice(lineIdx + 1, endLine).join('\n').trim();
+    if (chunk.length > (out[key] || '').length) out[key] = chunk;
+  }
+
+  return out;
+}
+
+async function checkCommand(cmd) {
+  // Use 'which' without shell:true to avoid shell injection
+  try {
+    await execFileAsync('which', [cmd]);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+function sha256(input) {
+  return crypto.createHash('sha256').update(input).digest('hex');
+}
+
+module.exports = { extractText, extractSections, slugFromFilename, sha256 };
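Likewise, a sketch of how the extraction pipeline chains together. The input path is illustrative; PDFs need pdftotext on PATH or the optional pdf-parse package, per the fallback logic above.

// Hypothetical driver for wiki-extract.js; the input path is illustrative.
const { extractText, extractSections, slugFromFilename, sha256 } = require('./wiki-extract');

async function main() {
  const filePath = '/tmp/papers/example-paper.pdf'; // assumed input file
  const { text, title, extractStatus, errorMessage } = await extractText(filePath);
  if (extractStatus !== 'ok') {
    console.error(`extract failed (${extractStatus}): ${errorMessage}`);
    return;
  }
  // Named sections; sections._fallback === true means the header scan found
  // fewer than 2 headers and the text was split into equal thirds instead.
  const sections = extractSections(text);
  console.log(slugFromFilename(filePath), title, sha256(text).slice(0, 12));
  console.log('method section length:', sections.method.length);
}

main().catch(console.error);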
package/scripts/wiki-facts.js
@@ -0,0 +1,351 @@
+'use strict';
+
+/**
+ * wiki-facts.js — Paper fact extraction, persistence, and entity registration
+ *
+ * Single responsibility: turn structured paper sections into atomic facts
+ * stored in paper_facts + research_entities tables.
+ *
+ * Exports:
+ *   extractPaperFacts(db, docSource, sections, providers, opts)
+ *     → Promise<fact[]>  all facts written to DB for this doc
+ *
+ *   writeFacts(db, docSourceId, facts)
+ *     → void  idempotent batch INSERT (conflict ignore by id)
+ *
+ *   registerEntities(db, facts)
+ *     → void  INSERT OR IGNORE entities inferred from subject/object fields
+ *
+ *   buildTier1Prompt(title, facts)
+ *     → string  LLM prompt for Tier 1 wiki page generation
+ */
+
+const crypto = require('node:crypto');
+
+// ── Concurrency helper ────────────────────────────────────────────────────────
+// Hand-written semaphore — no external dependencies.
+async function withConcurrency(tasks, limit) {
+  const results = new Array(tasks.length);
+  let nextIdx = 0;
+
+  async function worker() {
+    while (nextIdx < tasks.length) {
+      const idx = nextIdx++;
+      try {
+        results[idx] = await tasks[idx]();
+      } catch (err) {
+        results[idx] = { error: err.message };
+      }
+    }
+  }
+
+  await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, worker));
+  return results;
+}
+
+// ── Section → LLM prompt ──────────────────────────────────────────────────────
+const FACT_TYPES = [
+  'problem', 'method', 'claim', 'assumption',
+  'dataset', 'metric', 'result', 'baseline',
+  'limitation', 'future_work', 'contradiction_note',
+];
+
+function buildSectionFactPrompt(sectionName, sectionText, paperTitle) {
+  const truncated = sectionText.length > 4000
+    ? sectionText.slice(0, 4000) + '\n[...truncated]'
+    : sectionText;
+
+  return `You are extracting structured facts from a section of an academic paper.
+
+Paper title: ${paperTitle}
+Section: ${sectionName}
+
+Section text:
+${truncated}
+
+Extract all atomic, verifiable facts from this section. For each fact output a JSON object with these fields:
+- fact_type: one of ${FACT_TYPES.join(', ')}
+- subject: the primary entity (model name, method name, system name, etc.)
+- predicate: a short verb phrase (achieves, outperforms, requires, assumes, proposes, uses, ...)
+- object: what the subject does/has/achieves (metric value, baseline name, dataset name, ...)
+- value: numeric value if any (e.g. "0.87")
+- unit: unit if any (e.g. "%", "ms", "F1")
+- context: conditions under which this holds (e.g. "on FORCE 2020 dataset", "with 5-fold CV")
+- evidence_text: exact quote from the section (≤400 characters) that supports this fact
+- confidence: 0.0–1.0 reflecting how clearly stated this fact is
+
+Return ONLY a valid JSON array of fact objects. No explanation, no markdown. Empty array [] if no facts found.`;
+}
+
+// ── LLM response parser ───────────────────────────────────────────────────────
+function parseFacts(raw, sectionName) {
+  if (!raw || typeof raw !== 'string') return [];
+  const trimmed = raw.trim();
+  // Strip possible markdown code fences
+  const jsonStr = trimmed.startsWith('```')
+    ? trimmed.replace(/^```[^\n]*\n?/, '').replace(/\n?```$/, '')
+    : trimmed;
+  try {
+    const parsed = JSON.parse(jsonStr);
+    if (!Array.isArray(parsed)) return [];
+    return parsed
+      .filter(f => f && typeof f === 'object' && f.fact_type && f.evidence_text)
+      .map(f => ({
+        fact_type: String(f.fact_type || 'claim'),
+        subject: f.subject ? String(f.subject).slice(0, 200) : null,
+        predicate: f.predicate ? String(f.predicate).slice(0, 100) : null,
+        object: f.object ? String(f.object).slice(0, 300) : null,
+        value: f.value ? String(f.value).slice(0, 50) : null,
+        unit: f.unit ? String(f.unit).slice(0, 20) : null,
+        context: f.context ? String(f.context).slice(0, 300) : null,
+        evidence_text: String(f.evidence_text || '').slice(0, 400),
+        section: sectionName,
+        confidence: typeof f.confidence === 'number'
+          ? Math.min(1, Math.max(0, f.confidence))
+          : 0.7,
+      }));
+  } catch {
+    return [];
+  }
+}
+
+// ── DB write helpers ──────────────────────────────────────────────────────────
+
+// Valid fact_type values matching the paper_facts CHECK constraint
+const VALID_FACT_TYPES = new Set([
+  'problem','method','claim','assumption',
+  'dataset','metric','result','baseline',
+  'limitation','future_work','contradiction_note',
+]);
+
+/**
+ * Idempotent batch insert of facts into paper_facts.
+ *
+ * Deduplication: ID is a deterministic sha256 of (doc_source_id, evidence_text, section)
+ * so INSERT OR IGNORE correctly skips duplicates on re-run.
+ *
+ * fact_type is validated against the schema CHECK enum before insert;
+ * invalid values fall back to 'claim' to avoid crashing the batch transaction.
+ */
+function writeFacts(db, docSourceId, facts) {
+  if (facts.length === 0) return;
+  const insert = db.prepare(`
+    INSERT OR IGNORE INTO paper_facts
+      (id, doc_source_id, fact_type, subject, predicate, object,
+       value, unit, context, evidence_text, section,
+       extraction_source, confidence)
+    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'pdf_llm_section', ?)
+  `);
+  const tx = db.prepare('BEGIN');
+  const commit = db.prepare('COMMIT');
+  const rollback = db.prepare('ROLLBACK');
+  tx.run();
+  try {
+    for (const f of facts) {
+      // FLAG-7 fix: validate fact_type against enum, fallback to 'claim'
+      const factType = VALID_FACT_TYPES.has(f.fact_type) ? f.fact_type : 'claim';
+      // FLAG-8 fix: deterministic ID — sha256 of (docSourceId, section, evidence_text)
+      const idSeed = `${docSourceId}:${f.section || ''}:${f.evidence_text || ''}`;
+      const id = 'pf_' + crypto.createHash('sha256').update(idSeed).digest('hex').slice(0, 16);
+      insert.run(
+        id, docSourceId, factType,
+        f.subject, f.predicate, f.object,
+        f.value, f.unit, f.context,
+        f.evidence_text, f.section, f.confidence,
+      );
+    }
+    commit.run();
+  } catch (err) {
+    try { rollback.run(); } catch { /* ignore */ }
+    throw err;
+  }
+}
+
+// Simple entity_type inference from fact fields
+const ENTITY_HINTS = {
+  dataset: /\b(dataset|corpus|benchmark|collection)\b/i,
+  metric: /\b(accuracy|f1|precision|recall|auc|mse|rmse|bleu|rouge|map|ndcg)\b/i,
+  method_family: /\b(transformer|cnn|rnn|lstm|gru|bert|gpt|attention|svm|xgboost|random.?forest)\b/i,
+  problem: /\b(classification|regression|detection|segmentation|prediction|recognition)\b/i,
+};
+
+function inferEntityType(text) {
+  if (!text) return 'concept';
+  for (const [type, re] of Object.entries(ENTITY_HINTS)) {
+    if (re.test(text)) return type;
+  }
+  return 'concept';
+}
+
+/**
+ * Register unique entities inferred from subject/object fields.
+ * INSERT OR IGNORE — safe to call multiple times.
+ */
+function registerEntities(db, facts) {
+  const seen = new Set();
+  const candidates = [];
+  for (const f of facts) {
+    for (const field of [f.subject, f.object]) {
+      if (field && field.length >= 2 && field.length <= 100 && !seen.has(field)) {
+        seen.add(field);
+        candidates.push(field);
+      }
+    }
+  }
+  if (candidates.length === 0) return;
+
+  const insert = db.prepare(`
+    INSERT OR IGNORE INTO research_entities (id, entity_type, name)
+    VALUES (?, ?, ?)
+  `);
+  const tx = db.prepare('BEGIN');
+  const commit = db.prepare('COMMIT');
+  const rollback = db.prepare('ROLLBACK');
+  tx.run();
+  try {
+    for (const name of candidates) {
+      const id = 'ent_' + crypto.randomBytes(6).toString('hex');
+      const entity_type = inferEntityType(name);
+      insert.run(id, entity_type, name);
+    }
+    commit.run();
+  } catch (err) {
+    try { rollback.run(); } catch { /* ignore */ }
+    throw err;
+  }
}
+
+// ── Main extraction entry point ───────────────────────────────────────────────
+
+/**
+ * Extract structured facts from paper sections using per-section LLM calls.
+ * Writes results to paper_facts and research_entities tables.
+ *
+ * @param {object} db - DatabaseSync instance
+ * @param {{ id: number, title: string, slug: string }} docSource
+ * @param {{ abstract, introduction, method, experiments, results,
+ *           discussion, conclusion, _fallback: boolean }} sections
+ * @param {{ callHaiku: Function, buildDistillEnv: Function }} providers
+ * @param {{ concurrency?: number }} opts
+ * @returns {Promise<object[]>} all facts written to DB
+ */
+async function extractPaperFacts(db, docSource, sections, providers, { concurrency = 3 } = {}) {
+  const { callHaiku, buildDistillEnv } = providers;
+  const title = docSource.title || docSource.slug;
+
+  // Build one task per non-empty section (skip references, skip tiny sections)
+  const SKIP_SECTIONS = new Set(['references', '_fallback']);
+  const MIN_SECTION_LEN = 100;
+
+  const tasks = Object.entries(sections)
+    .filter(([key, text]) =>
+      !SKIP_SECTIONS.has(key) &&
+      typeof text === 'string' &&
+      text.trim().length >= MIN_SECTION_LEN
+    )
+    .map(([sectionName, sectionText]) => async () => {
+      const prompt = buildSectionFactPrompt(sectionName, sectionText, title);
+      let raw;
+      try {
+        const env = buildDistillEnv();
+        raw = await callHaiku(prompt, env, 60000, { model: 'sonnet' });
+      } catch {
+        return [];
+      }
+      return parseFacts(raw, sectionName);
+    });
+
+  if (tasks.length === 0) return [];
+
+  // Run with concurrency limit — LLM calls are all OUTSIDE any DB transaction
+  const sectionResults = await withConcurrency(tasks, concurrency);
+  const allFacts = sectionResults.flat().filter(f => f && !f.error);
+
+  if (allFacts.length === 0) return [];
+
+  // Write to DB in one shot (after all LLM calls complete)
+  writeFacts(db, docSource.id, allFacts);
+  registerEntities(db, allFacts);
+
+  return allFacts;
+}
+
+// ── Tier 1 wiki prompt builder ────────────────────────────────────────────────
+
+/**
+ * Build a prompt for generating a Tier 1 wiki page from extracted facts.
+ * The page uses a fixed 7-section structure for downstream synthesis.
+ *
+ * @param {string} title
+ * @param {object[]} facts - from paper_facts table
+ * @returns {string}
+ */
+function buildTier1Prompt(title, facts) {
+  // Group facts by type for structured rendering
+  const byType = {};
+  for (const f of facts) {
+    if (!byType[f.fact_type]) byType[f.fact_type] = [];
+    byType[f.fact_type].push(f);
+  }
+
+  function renderFacts(types) {
+    return types
+      .flatMap(t => byType[t] || [])
+      .slice(0, 12)
+      .map(f => {
+        const parts = [f.subject, f.predicate, f.object].filter(Boolean).join(' ');
+        const ctx = f.context ? ` (${f.context})` : '';
+        const ev = f.evidence_text ? `\n Evidence: "${f.evidence_text}"` : '';
+        return `- ${parts}${ctx}${ev}`;
+      })
+      .join('\n');
+  }
+
+  const problemFacts = renderFacts(['problem', 'assumption']);
+  const methodFacts = renderFacts(['method', 'claim']);
+  const resultFacts = renderFacts(['result', 'metric', 'baseline']);
+  const datasetFacts = renderFacts(['dataset']);
+  const limitFacts = renderFacts(['limitation', 'future_work']);
+  const allFactCount = facts.length;
+
+  return `You are writing a Tier 1 wiki page for an academic paper knowledge base.
+
+Paper: ${title}
+Total extracted facts: ${allFactCount}
+
+Extracted evidence:
+
+## Problems / Assumptions
+${problemFacts || '(none extracted)'}
+
+## Methods / Claims
+${methodFacts || '(none extracted)'}
+
+## Results / Metrics / Baselines
+${resultFacts || '(none extracted)'}
+
+## Datasets
+${datasetFacts || '(none extracted)'}
+
+## Limitations
+${limitFacts || '(none extracted)'}
+
+Write a wiki page with EXACTLY these seven sections in order:
+## Summary
+## Problem Addressed
+## Method
+## Key Results
+## Datasets Used
+## Limitations
+## Relation to This Project
+
+Rules:
+- Ground every claim in the extracted evidence above
+- Include specific numbers, model names, and dataset names when available
+- "Relation to This Project" should note methodological connections and potential challenges — leave placeholder text "[To be filled by paper-reader-lab]" if unknown
+- Use [[wikilink]] syntax for concepts that deserve their own pages
+- 300–600 words total
+- Respond with only the wiki page content`;
+}
+
+module.exports = { extractPaperFacts, writeFacts, registerEntities, buildTier1Prompt };
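And a wiring sketch for the fact extractor. The providers object below is a stub that only mirrors the callHaiku(prompt, env, timeoutMs, { model }) call shape used in the hunk above; the real callHaiku and buildDistillEnv presumably come from the daemon engine, and the DB path, docSource, and section text are all placeholders.

// Hypothetical wiring for wiki-facts.js; providers are stand-in stubs.
const { DatabaseSync } = require('node:sqlite');
const { extractPaperFacts, buildTier1Prompt } = require('./wiki-facts');

const providers = {
  buildDistillEnv: () => ({ ...process.env }), // assumed shape
  // Stub matching callHaiku(prompt, env, timeoutMs, { model }); returns no facts.
  callHaiku: async () => '[]',
};

async function main() {
  const db = new DatabaseSync('/tmp/wiki.db'); // assumed DB location
  const docSource = { id: 1, title: 'Example Paper', slug: 'example-paper' };
  const sections = {
    abstract: '', introduction: 'This paper studies well-log classification. '.repeat(5),
    method: '', experiments: '', results: '', discussion: '',
    conclusion: '', references: '', _fallback: false,
  };

  // Per-section LLM calls run outside any DB transaction; writes happen once at the end.
  const facts = await extractPaperFacts(db, docSource, sections, providers);
  console.log(`extracted ${facts.length} facts`);
  if (facts.length) console.log(buildTier1Prompt(docSource.title, facts));
}

main().catch(console.error);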