metame-cli 1.6.0 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +6 -7
- package/package.json +1 -1
- package/scripts/core/chunker.js +100 -0
- package/scripts/core/embedding.js +225 -0
- package/scripts/core/hybrid-search.js +296 -0
- package/scripts/core/wiki-db.js +144 -3
- package/scripts/daemon-bridges.js +9 -6
- package/scripts/daemon-command-router.js +25 -1
- package/scripts/daemon-default.yaml +31 -0
- package/scripts/daemon-embedding.js +162 -0
- package/scripts/daemon-engine-runtime.js +1 -1
- package/scripts/daemon-health-scan.js +185 -0
- package/scripts/daemon-runtime-lifecycle.js +1 -1
- package/scripts/daemon-task-scheduler.js +5 -3
- package/scripts/daemon-wiki.js +126 -4
- package/scripts/daemon.js +4 -2
- package/scripts/feishu-adapter.js +208 -29
- package/scripts/memory-backfill-chunks.js +92 -0
- package/scripts/memory-search.js +43 -15
- package/scripts/memory-wiki-schema.js +161 -2
- package/scripts/memory.js +15 -0
- package/scripts/providers.js +37 -6
- package/scripts/wiki-cluster.js +121 -0
- package/scripts/wiki-extract.js +171 -0
- package/scripts/wiki-facts.js +351 -0
- package/scripts/wiki-import.js +256 -0
- package/scripts/wiki-reflect-build.js +352 -28
- package/scripts/wiki-reflect-export.js +115 -0
- package/scripts/wiki-reflect.js +34 -1
- package/scripts/wiki-synthesis.js +224 -0
package/scripts/memory-search.js
CHANGED
|
@@ -3,14 +3,15 @@
|
|
|
3
3
|
* memory-search.js — Cross-session memory recall CLI
|
|
4
4
|
*
|
|
5
5
|
* Usage:
|
|
6
|
-
* node memory-search.js "<query>" # hybrid search (
|
|
6
|
+
* node memory-search.js "<query>" # hybrid search (FTS5 + vector + RRF)
|
|
7
7
|
* node memory-search.js "<q1>" "<q2>" "<q3>" # multi-keyword parallel search
|
|
8
8
|
* node memory-search.js --facts "<query>" # search facts only
|
|
9
9
|
* node memory-search.js --sessions "<query>" # search sessions only
|
|
10
|
+
* node memory-search.js --fts-only "<query>" # force pure FTS5 (no vector)
|
|
10
11
|
* node memory-search.js --recent # show recent sessions
|
|
11
12
|
*
|
|
12
13
|
* Multi-keyword: results are deduplicated by fact ID, best rank wins.
|
|
13
|
-
*
|
|
14
|
+
* Hybrid: uses FTS5 + vector embeddings + RRF fusion when available, falls back to FTS5.
|
|
14
15
|
*/
|
|
15
16
|
|
|
16
17
|
'use strict';
|
|
@@ -31,8 +32,20 @@ if (!memoryPath) {
|
|
|
31
32
|
const memory = require(memoryPath);
|
|
32
33
|
|
|
33
34
|
const args = process.argv.slice(2);
|
|
34
|
-
|
|
35
|
-
const
|
|
35
|
+
// Parse flags: allow multiple -- flags before queries
|
|
36
|
+
const flags = new Set();
|
|
37
|
+
let firstQueryIdx = 0;
|
|
38
|
+
for (let i = 0; i < args.length; i++) {
|
|
39
|
+
if (args[i].startsWith('--')) { flags.add(args[i]); firstQueryIdx = i + 1; }
|
|
40
|
+
else break;
|
|
41
|
+
}
|
|
42
|
+
const mode = flags.has('--facts') ? '--facts'
|
|
43
|
+
: flags.has('--sessions') ? '--sessions'
|
|
44
|
+
: flags.has('--recent') ? '--recent'
|
|
45
|
+
: flags.has('--fts-only') ? '--fts-only'
|
|
46
|
+
: null;
|
|
47
|
+
const ftsOnly = flags.has('--fts-only');
|
|
48
|
+
const queries = args.slice(firstQueryIdx);
|
|
36
49
|
|
|
37
50
|
async function main() {
|
|
38
51
|
try {
|
|
@@ -79,20 +92,35 @@ async function main() {
|
|
|
79
92
|
limit: 3,
|
|
80
93
|
});
|
|
81
94
|
|
|
82
|
-
// Wiki pages (
|
|
95
|
+
// Wiki pages — hybrid search (FTS5 + vector + RRF) when available
|
|
83
96
|
let wikiResults = [];
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
97
|
+
const useHybrid = typeof memory.hybridSearchWiki === 'function';
|
|
98
|
+
try {
|
|
99
|
+
const allWiki = [];
|
|
100
|
+
const seen = new Set();
|
|
101
|
+
for (const q of queries) {
|
|
102
|
+
const { wikiPages } = useHybrid
|
|
103
|
+
? await memory.hybridSearchWiki(q, { ftsOnly, trackSearch: true })
|
|
104
|
+
: (typeof memory.searchWikiAndFacts === 'function'
|
|
105
|
+
? memory.searchWikiAndFacts(q, { trackSearch: true })
|
|
106
|
+
: { wikiPages: [] });
|
|
107
|
+
for (const p of (wikiPages || [])) {
|
|
108
|
+
if (!seen.has(p.slug)) {
|
|
109
|
+
seen.add(p.slug);
|
|
110
|
+
allWiki.push({
|
|
111
|
+
type: 'wiki',
|
|
112
|
+
slug: p.slug,
|
|
113
|
+
title: p.title,
|
|
114
|
+
excerpt: p.excerpt,
|
|
115
|
+
score: p.score,
|
|
116
|
+
stale: p.stale,
|
|
117
|
+
source: p.source,
|
|
118
|
+
});
|
|
91
119
|
}
|
|
92
120
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
}
|
|
121
|
+
}
|
|
122
|
+
wikiResults = allWiki.slice(0, 5);
|
|
123
|
+
} catch { /* wiki not available */ }
|
|
96
124
|
|
|
97
125
|
console.log(JSON.stringify([...wikiResults, ...factResults, ...sessionResults], null, 2));
|
|
98
126
|
|
|
@@ -11,6 +11,8 @@
|
|
|
11
11
|
* wiki_pages — topic knowledge pages
|
|
12
12
|
* wiki_topics — controlled topic registry
|
|
13
13
|
* wiki_pages_fts — FTS5 virtual table (content table, trigram tokenizer)
|
|
14
|
+
* content_chunks — chunked page content with optional vector embeddings
|
|
15
|
+
* embedding_queue — durable async queue for embedding generation
|
|
14
16
|
*
|
|
15
17
|
* Triggers:
|
|
16
18
|
* wiki_pages_fts_insert / wiki_pages_fts_update / wiki_pages_fts_delete
|
|
@@ -42,6 +44,9 @@ function applyWikiSchema(db) {
|
|
|
42
44
|
)
|
|
43
45
|
`);
|
|
44
46
|
|
|
47
|
+
// Migration: add timeline column for Compiled Truth + Timeline model (existing DBs)
|
|
48
|
+
try { db.exec("ALTER TABLE wiki_pages ADD COLUMN timeline TEXT DEFAULT ''"); } catch { /* column already exists */ }
|
|
49
|
+
|
|
45
50
|
// ── wiki_topics ─────────────────────────────────────────────────────────────
|
|
46
51
|
db.exec(`
|
|
47
52
|
CREATE TABLE IF NOT EXISTS wiki_topics (
|
|
@@ -74,9 +79,14 @@ function applyWikiSchema(db) {
|
|
|
74
79
|
END
|
|
75
80
|
`);
|
|
76
81
|
|
|
82
|
+
// DROP+CREATE to upgrade existing unguarded trigger on deployed DBs
|
|
83
|
+
db.exec('DROP TRIGGER IF EXISTS wiki_pages_fts_update');
|
|
77
84
|
db.exec(`
|
|
78
|
-
CREATE TRIGGER
|
|
79
|
-
AFTER UPDATE ON wiki_pages
|
|
85
|
+
CREATE TRIGGER wiki_pages_fts_update
|
|
86
|
+
AFTER UPDATE ON wiki_pages
|
|
87
|
+
WHEN old.slug IS NOT new.slug OR old.title IS NOT new.title
|
|
88
|
+
OR old.content IS NOT new.content OR old.topic_tags IS NOT new.topic_tags
|
|
89
|
+
BEGIN
|
|
80
90
|
INSERT INTO wiki_pages_fts(wiki_pages_fts, rowid, slug, title, content, topic_tags)
|
|
81
91
|
VALUES ('delete', old.rowid, old.slug, old.title, old.content, old.topic_tags);
|
|
82
92
|
INSERT INTO wiki_pages_fts(rowid, slug, title, content, topic_tags)
|
|
@@ -91,6 +101,155 @@ function applyWikiSchema(db) {
|
|
|
91
101
|
VALUES ('delete', old.rowid, old.slug, old.title, old.content, old.topic_tags);
|
|
92
102
|
END
|
|
93
103
|
`);
|
|
104
|
+
|
|
105
|
+
// ── content_chunks (vector embedding storage for wiki pages) ────────────────
|
|
106
|
+
db.exec(`
|
|
107
|
+
CREATE TABLE IF NOT EXISTS content_chunks (
|
|
108
|
+
id TEXT PRIMARY KEY,
|
|
109
|
+
page_slug TEXT NOT NULL,
|
|
110
|
+
chunk_text TEXT NOT NULL,
|
|
111
|
+
chunk_idx INTEGER NOT NULL,
|
|
112
|
+
embedding BLOB,
|
|
113
|
+
embedding_model TEXT,
|
|
114
|
+
embedding_dim INTEGER,
|
|
115
|
+
created_at TEXT DEFAULT (datetime('now'))
|
|
116
|
+
)
|
|
117
|
+
`);
|
|
118
|
+
try { db.exec('CREATE INDEX IF NOT EXISTS idx_chunks_slug ON content_chunks(page_slug)'); } catch { }
|
|
119
|
+
|
|
120
|
+
// ── embedding_queue (durable async queue for embedding generation) ──────────
|
|
121
|
+
db.exec(`
|
|
122
|
+
CREATE TABLE IF NOT EXISTS embedding_queue (
|
|
123
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
124
|
+
item_type TEXT NOT NULL,
|
|
125
|
+
item_id TEXT NOT NULL,
|
|
126
|
+
model TEXT DEFAULT 'text-embedding-3-small',
|
|
127
|
+
attempts INTEGER DEFAULT 0,
|
|
128
|
+
last_error TEXT,
|
|
129
|
+
created_at TEXT DEFAULT (datetime('now'))
|
|
130
|
+
)
|
|
131
|
+
`);
|
|
132
|
+
|
|
133
|
+
// ── doc_sources ───────────────────────────────────────────────────────────
|
|
134
|
+
db.exec(`
|
|
135
|
+
CREATE TABLE IF NOT EXISTS doc_sources (
|
|
136
|
+
id INTEGER PRIMARY KEY,
|
|
137
|
+
file_path TEXT UNIQUE NOT NULL,
|
|
138
|
+
file_hash TEXT NOT NULL,
|
|
139
|
+
mtime_ms INTEGER,
|
|
140
|
+
size_bytes INTEGER,
|
|
141
|
+
extracted_text_hash TEXT,
|
|
142
|
+
file_type TEXT NOT NULL CHECK (file_type IN ('md','txt','pdf')),
|
|
143
|
+
extractor TEXT,
|
|
144
|
+
extract_status TEXT DEFAULT 'pending'
|
|
145
|
+
CHECK (extract_status IN ('ok','empty_or_scanned','error','pending')),
|
|
146
|
+
title TEXT,
|
|
147
|
+
slug TEXT UNIQUE NOT NULL,
|
|
148
|
+
status TEXT DEFAULT 'active'
|
|
149
|
+
CHECK (status IN ('active','orphaned','missing')),
|
|
150
|
+
error_message TEXT,
|
|
151
|
+
indexed_at TEXT NOT NULL,
|
|
152
|
+
last_seen_at TEXT,
|
|
153
|
+
built_at TEXT,
|
|
154
|
+
content_stale INTEGER DEFAULT 1
|
|
155
|
+
)
|
|
156
|
+
`);
|
|
157
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_doc_sources_status ON doc_sources(status)`);
|
|
158
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_doc_sources_file_hash ON doc_sources(file_hash)`);
|
|
159
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_doc_sources_slug ON doc_sources(slug)`);
|
|
160
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_doc_sources_content_stale ON doc_sources(content_stale)`);
|
|
161
|
+
|
|
162
|
+
// ── wiki_page_doc_sources ─────────────────────────────────────────────────
|
|
163
|
+
db.exec(`
|
|
164
|
+
CREATE TABLE IF NOT EXISTS wiki_page_doc_sources (
|
|
165
|
+
page_slug TEXT NOT NULL,
|
|
166
|
+
doc_source_id INTEGER NOT NULL,
|
|
167
|
+
role TEXT NOT NULL CHECK (role IN ('primary','cluster_member')),
|
|
168
|
+
PRIMARY KEY (page_slug, doc_source_id, role),
|
|
169
|
+
FOREIGN KEY (page_slug) REFERENCES wiki_pages(slug) ON DELETE CASCADE,
|
|
170
|
+
FOREIGN KEY (doc_source_id) REFERENCES doc_sources(id) ON DELETE CASCADE
|
|
171
|
+
)
|
|
172
|
+
`);
|
|
173
|
+
|
|
174
|
+
// ── wiki_pages additions (idempotent ALTER) ───────────────────────────────
|
|
175
|
+
for (const [col, def] of [
|
|
176
|
+
['source_type', "TEXT DEFAULT 'memory'"],
|
|
177
|
+
['membership_hash','TEXT'],
|
|
178
|
+
['cluster_size', 'INTEGER'],
|
|
179
|
+
]) {
|
|
180
|
+
try { db.exec(`ALTER TABLE wiki_pages ADD COLUMN ${col} ${def}`); } catch { /* already exists */ }
|
|
181
|
+
}
|
|
182
|
+
db.exec("UPDATE wiki_pages SET source_type = 'memory' WHERE source_type IS NULL");
|
|
183
|
+
|
|
184
|
+
// ── doc_sources additions (idempotent ALTER) ──────────────────────────────
|
|
185
|
+
for (const [col, def] of [
|
|
186
|
+
['doi', 'TEXT'],
|
|
187
|
+
['year', 'INTEGER'],
|
|
188
|
+
['venue', 'TEXT'],
|
|
189
|
+
['zotero_key', 'TEXT'],
|
|
190
|
+
['citation_count', 'INTEGER'],
|
|
191
|
+
]) {
|
|
192
|
+
try { db.exec(`ALTER TABLE doc_sources ADD COLUMN ${col} ${def}`); } catch { /* already exists */ }
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// ── paper_facts ───────────────────────────────────────────────────────────
|
|
196
|
+
db.exec(`
|
|
197
|
+
CREATE TABLE IF NOT EXISTS paper_facts (
|
|
198
|
+
id TEXT PRIMARY KEY,
|
|
199
|
+
doc_source_id INTEGER NOT NULL,
|
|
200
|
+
fact_type TEXT NOT NULL CHECK (fact_type IN (
|
|
201
|
+
'problem','method','claim','assumption',
|
|
202
|
+
'dataset','metric','result','baseline',
|
|
203
|
+
'limitation','future_work','contradiction_note'
|
|
204
|
+
)),
|
|
205
|
+
subject TEXT,
|
|
206
|
+
predicate TEXT,
|
|
207
|
+
object TEXT,
|
|
208
|
+
value TEXT,
|
|
209
|
+
unit TEXT,
|
|
210
|
+
context TEXT,
|
|
211
|
+
evidence_text TEXT NOT NULL,
|
|
212
|
+
section TEXT,
|
|
213
|
+
extraction_source TEXT DEFAULT 'pdf_llm_section'
|
|
214
|
+
CHECK (extraction_source IN (
|
|
215
|
+
'pdf_llm_section',
|
|
216
|
+
'zotero_deep_read',
|
|
217
|
+
'manual'
|
|
218
|
+
)),
|
|
219
|
+
confidence REAL DEFAULT 0.7,
|
|
220
|
+
created_at TEXT DEFAULT (datetime('now')),
|
|
221
|
+
FOREIGN KEY (doc_source_id) REFERENCES doc_sources(id) ON DELETE CASCADE
|
|
222
|
+
)
|
|
223
|
+
`);
|
|
224
|
+
db.exec('CREATE INDEX IF NOT EXISTS idx_paper_facts_doc ON paper_facts(doc_source_id)');
|
|
225
|
+
db.exec('CREATE INDEX IF NOT EXISTS idx_paper_facts_type ON paper_facts(fact_type)');
|
|
226
|
+
db.exec('CREATE INDEX IF NOT EXISTS idx_paper_facts_subject ON paper_facts(subject)');
|
|
227
|
+
|
|
228
|
+
// ── research_entities ─────────────────────────────────────────────────────
|
|
229
|
+
db.exec(`
|
|
230
|
+
CREATE TABLE IF NOT EXISTS research_entities (
|
|
231
|
+
id TEXT PRIMARY KEY,
|
|
232
|
+
entity_type TEXT NOT NULL CHECK (entity_type IN (
|
|
233
|
+
'problem','concept','method_family','dataset','metric','application'
|
|
234
|
+
)),
|
|
235
|
+
name TEXT NOT NULL UNIQUE,
|
|
236
|
+
aliases TEXT DEFAULT '[]',
|
|
237
|
+
description TEXT,
|
|
238
|
+
created_at TEXT DEFAULT (datetime('now'))
|
|
239
|
+
)
|
|
240
|
+
`);
|
|
241
|
+
|
|
242
|
+
// ── fact_entity_links ─────────────────────────────────────────────────────
|
|
243
|
+
db.exec(`
|
|
244
|
+
CREATE TABLE IF NOT EXISTS fact_entity_links (
|
|
245
|
+
fact_id TEXT NOT NULL,
|
|
246
|
+
entity_id TEXT NOT NULL,
|
|
247
|
+
role TEXT,
|
|
248
|
+
PRIMARY KEY (fact_id, entity_id),
|
|
249
|
+
FOREIGN KEY (fact_id) REFERENCES paper_facts(id) ON DELETE CASCADE,
|
|
250
|
+
FOREIGN KEY (entity_id) REFERENCES research_entities(id) ON DELETE CASCADE
|
|
251
|
+
)
|
|
252
|
+
`);
|
|
94
253
|
}
|
|
95
254
|
|
|
96
255
|
module.exports = { applyWikiSchema };
|
package/scripts/memory.js
CHANGED
|
@@ -47,6 +47,7 @@ function getDb() {
|
|
|
47
47
|
|
|
48
48
|
_db.exec('PRAGMA journal_mode = WAL');
|
|
49
49
|
_db.exec('PRAGMA busy_timeout = 3000');
|
|
50
|
+
_db.exec('PRAGMA foreign_keys = ON');
|
|
50
51
|
|
|
51
52
|
_db.exec(`
|
|
52
53
|
CREATE TABLE IF NOT EXISTS memory_items (
|
|
@@ -547,6 +548,19 @@ function searchWikiAndFacts(query, { trackSearch = true } = {}) {
|
|
|
547
548
|
}
|
|
548
549
|
}
|
|
549
550
|
|
|
551
|
+
/**
 * Hybrid wiki search (FTS5 + vector + RRF fusion).
 *
 * Lazily loads ./core/hybrid-search and delegates to its `hybridSearchWiki`
 * export. If loading the module or running the search throws for any reason,
 * the call falls back to `searchWikiAndFacts` (pure FTS5).
 *
 * @param {string} query - search text
 * @param {object} [options]
 * @param {boolean} [options.ftsOnly=false] - forwarded to the hybrid module
 * @param {boolean} [options.expand=false] - forwarded to the hybrid module only
 *   when truthy, so calls that do not ask for expansion pass exactly the same
 *   options object as before
 * @param {boolean} [options.trackSearch=true] - forwarded to both search paths
 * @returns {Promise<object>} whatever the selected search implementation returns
 */
async function hybridSearchWiki(query, { ftsOnly = false, expand = false, trackSearch = true } = {}) {
  try {
    const { hybridSearchWiki: fn } = require('./core/hybrid-search');
    const options = { ftsOnly, trackSearch };
    // Previously `expand` was accepted here but never passed on.
    if (expand) options.expand = true;
    return await fn(getDb(), query, options);
  } catch {
    // Deliberate best-effort: any hybrid failure degrades to the FTS5 search.
    return searchWikiAndFacts(query, { trackSearch });
  }
}
|
|
563
|
+
|
|
550
564
|
module.exports = {
|
|
551
565
|
// core
|
|
552
566
|
saveMemoryItem,
|
|
@@ -558,6 +572,7 @@ module.exports = {
|
|
|
558
572
|
assembleContext,
|
|
559
573
|
// wiki
|
|
560
574
|
searchWikiAndFacts,
|
|
575
|
+
hybridSearchWiki,
|
|
561
576
|
// compatibility
|
|
562
577
|
saveSession,
|
|
563
578
|
saveFacts,
|
package/scripts/providers.js
CHANGED
|
@@ -159,23 +159,53 @@ function saveProviders(config) {
|
|
|
159
159
|
// PROVIDER ENV BUILDER (Core mechanism)
|
|
160
160
|
// ---------------------------------------------------------
|
|
161
161
|
|
|
162
|
+
/**
 * Read the env mapping defined in ~/.claude/settings.json.
 *
 * The home directory is taken from process.env.HOME, falling back to
 * os.homedir(). Only entries whose value is a string are kept.
 *
 * @returns {Object<string, string>} the string-valued entries of the `env`
 *   block; {} if the file is missing, unreadable, not valid JSON, or if `env`
 *   is absent, null, or not a plain mapping (arrays are rejected so that
 *   index keys such as "0" never become variable names).
 */
function readClaudeSettingsEnv() {
  const home = process.env.HOME || os.homedir();
  const settingsPath = path.join(home, '.claude', 'settings.json');
  try {
    if (!fs.existsSync(settingsPath)) return {};
    const data = JSON.parse(fs.readFileSync(settingsPath, 'utf8'));
    if (!data || typeof data.env !== 'object' || data.env === null) return {};
    // typeof [] is 'object' too; an array is not a name→value mapping.
    if (Array.isArray(data.env)) return {};
    const out = {};
    for (const [key, value] of Object.entries(data.env)) {
      if (typeof value === 'string') out[key] = value;
    }
    return out;
  } catch {
    // Best-effort by contract: any read/parse failure yields an empty mapping.
    return {};
  }
}
|
|
183
|
+
|
|
162
184
|
/**
 * Build env var overrides for a named provider.
 *
 * The result always starts from the env mapping read from
 * ~/.claude/settings.json, so slot mappings such as
 * ANTHROPIC_DEFAULT_*_MODEL stay in place across providers.
 *
 * - 'anthropic' (official), or a name with no configured provider entry:
 *   the inherited settings env is returned unchanged.
 * - custom provider: ANTHROPIC_BASE_URL is overridden when the provider has a
 *   base_url; ANTHROPIC_API_KEY and ANTHROPIC_AUTH_TOKEN are both overridden
 *   when it has an api_key.
 *
 * @param {string} [providerName] - provider to build for; defaults to the
 *   active provider from the loaded configuration
 * @returns {Object<string, string>} env overrides
 */
function buildEnv(providerName) {
  const config = loadProviders();
  const target = providerName || config.active;
  const inherited = readClaudeSettingsEnv();

  if (target === 'anthropic') return inherited;

  const provider = config.providers[target];
  if (!provider) return inherited;

  const { base_url: baseUrl, api_key: apiKey } = provider;
  if (baseUrl) {
    inherited.ANTHROPIC_BASE_URL = baseUrl;
  }
  if (apiKey) {
    // The same credential is written under both variable names.
    inherited.ANTHROPIC_API_KEY = apiKey;
    inherited.ANTHROPIC_AUTH_TOKEN = apiKey;
  }
  return inherited;
}
|
|
181
211
|
|
|
@@ -390,6 +420,7 @@ function getEngine() { return _currentEngine; }
|
|
|
390
420
|
const api = {
|
|
391
421
|
loadProviders,
|
|
392
422
|
saveProviders,
|
|
423
|
+
readClaudeSettingsEnv,
|
|
393
424
|
buildEnv,
|
|
394
425
|
buildSpawnEnv,
|
|
395
426
|
buildActiveEnv,
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const crypto = require('node:crypto');
|
|
4
|
+
|
|
5
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the lengths differ or
 *   when either vector has zero magnitude
 */
function cosineSimilarity(a, b) {
  const size = a.length;
  if (size !== b.length) return 0;

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  let idx = 0;
  while (idx < size) {
    const x = a[idx];
    const y = b[idx];
    dotProduct += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    idx += 1;
  }

  const magnitude = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  if (magnitude === 0) return 0;
  return dotProduct / magnitude;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Order-independent fingerprint of a set of slugs.
 *
 * @param {Iterable<string>} slugs - not mutated; a sorted copy is hashed
 * @returns {string} hex SHA-256 of the sorted slugs joined with ','
 */
function membershipHash(slugs) {
  const canonical = [...slugs];
  canonical.sort();
  const hasher = crypto.createHash('sha256');
  hasher.update(canonical.join(','));
  return hasher.digest('hex');
}
|
|
21
|
+
|
|
22
|
+
/**
 * Jaccard index |A ∩ B| / |A ∪ B| of two collections, treated as sets
 * (duplicates within either input are ignored).
 *
 * @param {Iterable<*>} setA
 * @param {Iterable<*>} setB
 * @returns {number} overlap in [0, 1]; 0 when both inputs are empty
 */
function jaccardOverlap(setA, setB) {
  const left = new Set(setA);
  const right = new Set(setB);
  const shared = [...left].filter((item) => right.has(item)).length;
  const combined = left.size + right.size - shared;
  if (combined === 0) return 0;
  return shared / combined;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Pick the existing cluster that best matches a new member list.
 *
 * A cluster qualifies only when its Jaccard overlap with newMemberIds is
 * strictly greater than 0.5. The highest overlap wins; on an exact tie the
 * cluster with the longer stored memberIds array is preferred.
 *
 * @param {Array<{ memberIds: Array<*> }>} existingClusters
 * @param {Array<*>} newMemberIds
 * @returns {object|null} the matching cluster object, or null if none qualifies
 */
function findMatchingCluster(existingClusters, newMemberIds) {
  let winner = null;
  let winnerScore = 0.5; // floor: a candidate must strictly exceed this

  for (const candidate of existingClusters) {
    const overlap = jaccardOverlap(candidate.memberIds, newMemberIds);

    if (overlap > winnerScore) {
      winner = candidate;
      winnerScore = overlap;
      continue;
    }

    // Tie with the current winner: the larger stored cluster takes over.
    const isTie = overlap === winnerScore && winner;
    if (isTie && candidate.memberIds.length > winner.memberIds.length) {
      winner = candidate;
    }
  }

  return winner;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Build connected components from embeddings using a cosine similarity
 * threshold: two entries are linked when their similarity is >= threshold,
 * and components are the transitive closure of those links (union-find).
 *
 * Bookkeeping uses Map rather than plain objects so that slugs which collide
 * with Object.prototype members (e.g. "constructor", "__proto__") are handled
 * like any other slug instead of throwing. Components are returned in order
 * of their first member's position in `embeddings`.
 *
 * @param {Array<{ slug: string, vector: Float32Array|number[] }>} embeddings
 * @param {{ threshold?: number, minSize?: number }} [options]
 *   threshold — minimum cosine similarity for a link (default 0.75);
 *   minSize — components with fewer members are dropped (default 3)
 * @returns {string[][]} one array of slugs per retained component
 */
function buildConnectedComponents(embeddings, { threshold = 0.75, minSize = 3 } = {}) {
  const n = embeddings.length;
  const slugs = embeddings.map((e) => e.slug);
  const parent = new Map(slugs.map((s) => [s, s]));

  // Iterative find with path compression (no recursion depth limit).
  const find = (x) => {
    let root = x;
    while (parent.get(root) !== root) root = parent.get(root);
    let cursor = x;
    while (cursor !== root) {
      const next = parent.get(cursor);
      parent.set(cursor, root);
      cursor = next;
    }
    return root;
  };
  const union = (x, y) => {
    parent.set(find(x), find(y));
  };

  // All-pairs comparison: O(n^2) similarity evaluations.
  for (let i = 0; i < n; i++) {
    for (let j = i + 1; j < n; j++) {
      if (cosineSimilarity(embeddings[i].vector, embeddings[j].vector) >= threshold) {
        union(slugs[i], slugs[j]);
      }
    }
  }

  const groups = new Map();
  for (const s of slugs) {
    const root = find(s);
    const members = groups.get(root);
    if (members) members.push(s);
    else groups.set(root, [s]);
  }

  return [...groups.values()].filter((g) => g.length >= minSize);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Fetch doc-level embeddings from content_chunks by averaging all chunk
 * embeddings per page.
 *
 * Embedding blobs are read as packed 32-bit floats. Rows are skipped when the
 * blob is empty, when its byte length is not a multiple of 4 (corrupt or
 * truncated), or when its dimension differs from the first usable chunk of
 * the same page. Pages left with no usable chunk are omitted from the result.
 *
 * @param {object} db - node:sqlite DatabaseSync instance
 * @param {string[]} slugs
 * @returns {Array<{ slug: string, vector: Float32Array }>} one entry per slug
 *   that has at least one usable embedding, in order of first appearance in
 *   the query result
 */
function getDocEmbeddings(db, slugs) {
  if (slugs.length === 0) return [];
  const placeholders = slugs.map(() => '?').join(',');
  const rows = db.prepare(
    `SELECT page_slug, embedding FROM content_chunks WHERE page_slug IN (${placeholders}) AND embedding IS NOT NULL`
  ).all(...slugs);

  // Map (not a plain object) so slugs like "constructor" cannot hit
  // inherited Object.prototype members.
  const bySlug = new Map();
  for (const row of rows) {
    const buf = Buffer.isBuffer(row.embedding) ? row.embedding : Buffer.from(row.embedding);
    if (buf.byteLength === 0 || buf.byteLength % 4 !== 0) continue; // corrupt/truncated row

    // Copy into a fresh ArrayBuffer: a Float32Array view requires a byteOffset
    // that is a multiple of 4, which a Buffer slice does not guarantee.
    const bytes = new Uint8Array(buf.byteLength);
    bytes.set(buf);
    const vec = new Float32Array(bytes.buffer);

    const vecs = bySlug.get(row.page_slug);
    if (!vecs) {
      bySlug.set(row.page_slug, [vec]);
    } else if (vec.length === vecs[0].length) {
      vecs.push(vec);
    }
    // else: dimension mismatch within one page — cannot be averaged, skip.
  }

  // Average chunk vectors per slug.
  const result = [];
  for (const [slug, vecs] of bySlug) {
    const dim = vecs[0].length;
    const avg = new Float32Array(dim);
    for (const v of vecs) {
      for (let i = 0; i < dim; i++) avg[i] += v[i] / vecs.length;
    }
    result.push({ slug, vector: avg });
  }
  return result;
}
|
|
119
|
+
|
|
120
|
+
module.exports = { cosineSimilarity, buildConnectedComponents, jaccardOverlap,
|
|
121
|
+
findMatchingCluster, membershipHash, getDocEmbeddings };
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('node:fs');
|
|
4
|
+
const path = require('node:path');
|
|
5
|
+
const crypto = require('node:crypto');
|
|
6
|
+
const { execFile } = require('node:child_process');
|
|
7
|
+
const { promisify } = require('node:util');
|
|
8
|
+
const execFileAsync = promisify(execFile);
|
|
9
|
+
|
|
10
|
+
/**
 * Derive a URL-style slug from a file path.
 *
 * Takes the base name without its extension, lowercases it, collapses every
 * run of characters outside [a-z0-9] into a single '-', and strips leading
 * and trailing dashes. A name with no ASCII letters or digits yields ''.
 *
 * @param {string} filePath
 * @returns {string}
 */
function slugFromFilename(filePath) {
  const extension = path.extname(filePath);
  const stem = path.basename(filePath, extension).toLowerCase();
  const dashed = stem.replace(/[^a-z0-9]+/g, '-');
  return dashed.replace(/^-+|-+$/g, '');
}
|
|
17
|
+
|
|
18
|
+
/**
 * Return the text of the first line of the form "# <title>" in a Markdown
 * document (a single '#' followed by whitespace), trimmed.
 *
 * @param {string} text
 * @returns {string|null} the heading text, or null when no such line exists
 */
function extractMarkdownTitle(text) {
  const heading = text.match(/^#\s+(.+)$/m);
  if (!heading) return null;
  const [, rawTitle] = heading;
  return rawTitle.trim();
}
|
|
22
|
+
|
|
23
|
+
/**
 * Extract plain text from a file, dispatching on its (case-insensitive)
 * extension.
 *
 * - .md / .txt: read directly as UTF-8; for .md the title is taken from the
 *   first "# " heading. A read failure is reported in the result, not thrown.
 * - .pdf: delegated to extractPdf.
 * - anything else: reported as an error result.
 *
 * @param {string} filePath
 * @returns {Promise<{ text: string, title: string|null, extractor: string,
 *   extractStatus: string, errorMessage?: string }>}
 */
async function extractText(filePath) {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.md':
    case '.txt': {
      try {
        const text = fs.readFileSync(filePath, 'utf8');
        let title = null;
        if (ext === '.md') title = extractMarkdownTitle(text);
        return { text, title, extractor: 'direct', extractStatus: 'ok' };
      } catch (readErr) {
        return {
          text: '',
          title: null,
          extractor: 'direct',
          extractStatus: 'error',
          errorMessage: readErr.message,
        };
      }
    }
    case '.pdf':
      return extractPdf(filePath);
    default:
      return {
        text: '',
        title: null,
        extractor: 'unknown',
        extractStatus: 'error',
        errorMessage: `Unsupported file type: ${ext}`,
      };
  }
}
|
|
45
|
+
|
|
46
|
+
/**
 * Extract text from a PDF.
 *
 * Strategy: if a `pdftotext` executable is found, run it and use its stdout;
 * blank output is reported as 'empty_or_scanned' without trying anything
 * else. If `pdftotext` is absent or its invocation fails, fall back to the
 * `pdf-parse` package (loaded lazily). Failures are returned as an error
 * result rather than thrown.
 *
 * @param {string} filePath
 * @returns {Promise<{ text: string, title: null, extractor: string,
 *   extractStatus: string, errorMessage?: string }>}
 */
async function extractPdf(filePath) {
  const popplerAvailable = await checkCommand('pdftotext');

  if (popplerAvailable) {
    try {
      const run = await execFileAsync('pdftotext', [filePath, '-'], { maxBuffer: 10 * 1024 * 1024 });
      const rawText = run.stdout;
      if (rawText.trim()) {
        return { text: rawText, title: null, extractor: 'pdftotext', extractStatus: 'ok' };
      }
      return {
        text: '', title: null, extractor: 'pdftotext', extractStatus: 'empty_or_scanned',
        errorMessage: 'PDF produced no text — may be a scanned image. Install OCR for support.',
      };
    } catch {
      // pdftotext failed — continue with the pdf-parse fallback below
    }
  }

  // Fallback: pdf-parse
  try {
    const parsePdf = require('pdf-parse');
    const parsed = await parsePdf(fs.readFileSync(filePath));
    if (parsed.text.trim()) {
      return { text: parsed.text, title: null, extractor: 'pdf-parse', extractStatus: 'ok' };
    }
    return {
      text: '', title: null, extractor: 'pdf-parse', extractStatus: 'empty_or_scanned',
      errorMessage: 'PDF produced no text — may be a scanned image.',
    };
  } catch (err) {
    // Only suggest installing poppler when pdftotext was not found.
    let hint = '';
    if (!popplerAvailable) hint = ' Install poppler for better PDF support: brew install poppler';
    return {
      text: '', title: null, extractor: 'pdf-parse', extractStatus: 'error',
      errorMessage: err.message + hint,
    };
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Split flat paper text into named sections.
 *
 * A line counts as a section header when, after trimming, it is non-empty,
 * at most 80 characters long, and consists solely of a known section name
 * (case-insensitive) with an optional leading number. Each section's body is
 * the trimmed text between its header and the next recognised header (or the
 * end of the text). When the same section key occurs more than once, the
 * longest body is kept.
 *
 * @param {string} text — raw extracted text (e.g. from pdftotext)
 * @returns {{
 *   abstract: string, introduction: string, method: string,
 *   experiments: string, results: string, discussion: string,
 *   conclusion: string, references: string, _fallback: boolean
 * }}
 *   _fallback is true when fewer than 2 header lines were found; in that case
 *   the lines are cut into thirds (by line count, remainder in the last part)
 *   and returned under 'introduction', 'method' and 'results', with every
 *   other section empty.
 */
function extractSections(text) {
  // Canonical key → header pattern, checked in this order (first match wins).
  const headerPatterns = [
    ['abstract', /^(?:\d+[\.\s]+)?(?:abstract)\s*$/i],
    ['introduction', /^(?:\d+[\.\s]+)?(?:introduction|background|overview)\s*$/i],
    ['method', /^(?:\d+[\.\s]+)?(?:method(?:s|ology)?|approach|proposed\s+method|framework|model|architecture)\s*$/i],
    ['experiments', /^(?:\d+[\.\s]+)?(?:experiments?|experimental\s+(?:setup|design)|evaluation|setup)\s*$/i],
    ['results', /^(?:\d+[\.\s]+)?(?:results?|findings|performance)\s*$/i],
    ['discussion', /^(?:\d+[\.\s]+)?(?:discussion|analysis|ablation)\s*$/i],
    ['conclusion', /^(?:\d+[\.\s]+)?(?:conclusions?|summary|future\s+work)\s*$/i],
    ['references', /^(?:\d+[\.\s]+)?(?:references|bibliography)\s*$/i],
  ];

  const lines = text.split('\n');

  // Locate header lines: [{ key, lineIdx }]
  const hits = [];
  lines.forEach((rawLine, lineIdx) => {
    const candidate = rawLine.trim();
    if (candidate.length === 0 || candidate.length > 80) return; // headers are short
    const matched = headerPatterns.find(([, pattern]) => pattern.test(candidate));
    if (matched) hits.push({ key: matched[0], lineIdx });
  });

  const sections = {
    abstract: '',
    introduction: '',
    method: '',
    experiments: '',
    results: '',
    discussion: '',
    conclusion: '',
    references: '',
    _fallback: false,
  };

  // Too few headers to trust: cut the text into thirds instead.
  if (hits.length < 2) {
    const third = Math.floor(lines.length / 3);
    sections.introduction = lines.slice(0, third).join('\n');
    sections.method = lines.slice(third, 2 * third).join('\n');
    sections.results = lines.slice(2 * third).join('\n');
    sections._fallback = true;
    return sections;
  }

  hits.forEach(({ key, lineIdx }, position) => {
    const following = hits[position + 1];
    const stopLine = following ? following.lineIdx : lines.length;
    const body = lines.slice(lineIdx + 1, stopLine).join('\n').trim();
    // Repeated key: keep whichever body is longest.
    if (body.length > (sections[key] || '').length) sections[key] = body;
  });

  return sections;
}
|
|
156
|
+
|
|
157
|
+
/**
 * Report whether an executable can be located by running `which <cmd>`.
 *
 * The lookup goes through execFile with an argument array (no shell), so the
 * command name is never interpreted by a shell.
 *
 * @param {string} cmd - executable name to look up
 * @returns {Promise<boolean>} true if `which` succeeded, false on any failure
 */
async function checkCommand(cmd) {
  let found = true;
  try {
    await execFileAsync('which', [cmd]);
  } catch {
    found = false;
  }
  return found;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Hex-encoded SHA-256 digest of the input.
 *
 * @param {string|Buffer|TypedArray|DataView} input - data passed to Hash.update
 * @returns {string} 64-character lowercase hex digest
 */
function sha256(input) {
  const hasher = crypto.createHash('sha256');
  hasher.update(input);
  return hasher.digest('hex');
}
|
|
170
|
+
|
|
171
|
+
module.exports = { extractText, extractSections, slugFromFilename, sha256 };
|