memory-crystal 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +20 -0
- package/CHANGELOG.md +6 -0
- package/LETTERS.md +22 -0
- package/LICENSE +21 -0
- package/README-ENTERPRISE.md +162 -0
- package/README-old.md +275 -0
- package/README.md +91 -0
- package/RELAY.md +88 -0
- package/TECHNICAL.md +379 -0
- package/ai/dev-updates/2026-02-25--cc-air--phase2-architecture-pivot.md +70 -0
- package/ai/dev-updates/2026-02-25--cc-air--phase2-worker-build.md +72 -0
- package/ai/dev-updates/2026-02-26--10-25-16--cc-mini--phase2-implementation.md +49 -0
- package/ai/dev-updates/2026-02-27--20-30-00--cc-mini--readme-overhaul-and-public-deploy.md +69 -0
- package/ai/notes/2026-02-26--cc-air--notes.md +412 -0
- package/ai/notes/2026-02-27--cc-mini--grok-feedback.md +44 -0
- package/ai/notes/2026-02-27--cc-mini--lesa-feedback.md +45 -0
- package/ai/notes/RESEARCH.md +1185 -0
- package/ai/notes/salience-research/README.md +29 -0
- package/ai/notes/salience-research/eurosla-salience-review.md +64 -0
- package/ai/notes/salience-research/full-research-summary.md +269 -0
- package/ai/notes/salience-research/salience-levels-diagram.png +0 -0
- package/ai/plan/2026-02-27--cc-mini--qr-pairing-spec.md +203 -0
- package/ai/plan/_archive/PLAN.md +194 -0
- package/ai/plan/_archive/PRD.md +1014 -0
- package/ai/plan/cc-plans-duplicates-from-dot-claude/2026-02-26--cc-mini--phase2-implementation-plan.md +245 -0
- package/ai/plan/dev-conventions-note.md +70 -0
- package/ai/plan/ldm-os-install-and-boot-architecture.md +285 -0
- package/ai/plan/memory-crystal-phase2-plan.md +192 -0
- package/ai/plan/memory-system-lay-of-the-land.md +214 -0
- package/ai/plan/phase2-ephemeral-relay.md +238 -0
- package/ai/plan/readme-first.md +68 -0
- package/ai/plan/roadmap.md +159 -0
- package/ai/todos/PUNCHLIST.md +44 -0
- package/ai/todos/README.md +31 -0
- package/ai/todos/inboxes/cc-air/2026-02-26--cc-air--post-relay-todos.md +85 -0
- package/ai/todos/inboxes/cc-mini/2026-02-26--cc-mini--phase2-status.md +100 -0
- package/ai/todos/inboxes/cc-mini/_archive/TODO.md +25 -0
- package/ai/todos/inboxes/parker/2026-02-25--cc-air--setup-checklist.md +139 -0
- package/ai/todos/inboxes/parker/2026-02-26--cc-mini--phase2-your-moves.md +72 -0
- package/dist/cc-hook.d.ts +1 -0
- package/dist/cc-hook.js +349 -0
- package/dist/chunk-3VFIJYS4.js +818 -0
- package/dist/chunk-52QE3YI3.js +1169 -0
- package/dist/chunk-AA3OPP4Z.js +432 -0
- package/dist/chunk-D3I3ZSE2.js +411 -0
- package/dist/chunk-EKSACBTJ.js +1070 -0
- package/dist/chunk-F3Y7EL7K.js +83 -0
- package/dist/chunk-JWZXYVET.js +1068 -0
- package/dist/chunk-KYVWO6ZM.js +1069 -0
- package/dist/chunk-L3VHARQH.js +413 -0
- package/dist/chunk-LOVAHSQV.js +411 -0
- package/dist/chunk-LQOYCAGG.js +446 -0
- package/dist/chunk-MK42FMEG.js +147 -0
- package/dist/chunk-NIJCVN3O.js +147 -0
- package/dist/chunk-O2UITJGH.js +465 -0
- package/dist/chunk-PEK6JH65.js +432 -0
- package/dist/chunk-PJ6FFKEX.js +77 -0
- package/dist/chunk-PLUBBZYR.js +800 -0
- package/dist/chunk-SGL6ISBJ.js +1061 -0
- package/dist/chunk-UNHVZB5G.js +411 -0
- package/dist/chunk-VAFTWSTE.js +1061 -0
- package/dist/chunk-XZ3S56RQ.js +1061 -0
- package/dist/chunk-Y72C7F6O.js +148 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +325 -0
- package/dist/core.d.ts +188 -0
- package/dist/core.js +12 -0
- package/dist/crypto.d.ts +16 -0
- package/dist/crypto.js +18 -0
- package/dist/dev-update-SZ2Z4WCQ.js +6 -0
- package/dist/ldm.d.ts +17 -0
- package/dist/ldm.js +12 -0
- package/dist/mcp-server.d.ts +1 -0
- package/dist/mcp-server.js +250 -0
- package/dist/migrate.d.ts +1 -0
- package/dist/migrate.js +89 -0
- package/dist/mirror-sync.d.ts +1 -0
- package/dist/mirror-sync.js +130 -0
- package/dist/openclaw.d.ts +5 -0
- package/dist/openclaw.js +349 -0
- package/dist/poller.d.ts +1 -0
- package/dist/poller.js +272 -0
- package/dist/summarize.d.ts +19 -0
- package/dist/summarize.js +10 -0
- package/dist/worker.js +137 -0
- package/openclaw.plugin.json +11 -0
- package/package.json +40 -0
- package/scripts/migrate-lance-to-sqlite.mjs +217 -0
- package/skills/memory/SKILL.md +61 -0
- package/src/cc-hook.ts +447 -0
- package/src/cli.ts +356 -0
- package/src/core.ts +1472 -0
- package/src/crypto.ts +113 -0
- package/src/dev-update.ts +178 -0
- package/src/ldm.ts +117 -0
- package/src/mcp-server.ts +274 -0
- package/src/migrate.ts +104 -0
- package/src/mirror-sync.ts +175 -0
- package/src/openclaw.ts +250 -0
- package/src/poller.ts +345 -0
- package/src/summarize.ts +210 -0
- package/src/worker.ts +208 -0
- package/tsconfig.json +18 -0
- package/wrangler.toml +20 -0
package/src/core.ts
ADDED
|
@@ -0,0 +1,1472 @@
|
|
|
1
|
+
// memory-crystal/core.ts — Pure logic layer. Zero framework dependencies.
|
|
2
|
+
// Hybrid search: sqlite-vec (vectors) + FTS5 (BM25) + RRF fusion + recency.
|
|
3
|
+
// Dual-writes to LanceDB (safety net) and sqlite-vec (source of truth).
|
|
4
|
+
// Search algorithms ported from QMD (MIT, Tobi Lutke, 2024-2026).
|
|
5
|
+
// Config via function params, not globals. Errors: throw, callers catch.
|
|
6
|
+
|
|
7
|
+
import * as lancedb from '@lancedb/lancedb';
|
|
8
|
+
import Database from 'better-sqlite3';
|
|
9
|
+
import * as sqliteVec from 'sqlite-vec';
|
|
10
|
+
import { readFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'node:fs';
|
|
11
|
+
import { execSync } from 'node:child_process';
|
|
12
|
+
import { join, relative, extname, basename } from 'node:path';
|
|
13
|
+
import { createHash } from 'node:crypto';
|
|
14
|
+
import http from 'node:http';
|
|
15
|
+
import https from 'node:https';
|
|
16
|
+
|
|
17
|
+
// ─── Types ─────────────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
/**
 * Configuration for a Crystal instance. Passed explicitly to the constructor
 * (no globals); only `dataDir` and `embeddingProvider` are required — the
 * provider-specific fields are validated at embed time.
 */
export interface CrystalConfig {
  /** Root directory for all crystal data */
  dataDir: string;
  /** Embedding provider: 'openai' | 'ollama' | 'google' */
  embeddingProvider: 'openai' | 'ollama' | 'google';
  /** OpenAI API key (required if provider is 'openai') */
  openaiApiKey?: string;
  /** OpenAI embedding model (default: text-embedding-3-small) */
  openaiModel?: string;
  /** Ollama host (default: http://localhost:11434) */
  ollamaHost?: string;
  /** Ollama model (default: nomic-embed-text) */
  ollamaModel?: string;
  /** Google API key (required if provider is 'google') */
  googleApiKey?: string;
  /** Google embedding model (default: text-embedding-004) */
  googleModel?: string;
  /** Remote Worker URL for cloud mirror mode */
  remoteUrl?: string;
  /** Remote auth token */
  remoteToken?: string;
}
|
|
41
|
+
|
|
42
|
+
/**
 * One unit of ingested text plus provenance metadata. Stored in sqlite
 * (source of truth) and dual-written to LanceDB; `id` is assigned by sqlite
 * on insert and `embedding` is produced by the configured provider.
 */
export interface Chunk {
  /** sqlite rowid; absent before insert */
  id?: number;
  text: string;
  /** Embedding vector; absent until the chunk is embedded */
  embedding?: number[];
  role: 'user' | 'assistant' | 'system';
  source_type: string; // 'conversation' | 'file' | 'imessage' | 'manual'
  source_id: string; // session key, file path, etc.
  agent_id: string; // 'main' (Lēsa), 'claude-code', etc.
  token_count: number;
  created_at: string; // ISO timestamp
}
|
|
53
|
+
|
|
54
|
+
/**
 * A distilled long-term memory. Rows live in the sqlite `memories` table;
 * `source_ids` links back to the chunks it was derived from.
 */
export interface Memory {
  /** sqlite rowid; absent before insert */
  id?: number;
  text: string;
  /** Embedding vector; absent until embedded */
  embedding?: number[];
  category: 'fact' | 'preference' | 'event' | 'opinion' | 'skill';
  confidence: number; // 0-1, decays over time
  source_ids: string; // JSON array of chunk IDs
  status: 'active' | 'deprecated' | 'deleted';
  created_at: string;
  updated_at: string;
}
|
|
65
|
+
|
|
66
|
+
/**
 * One hit from Crystal.search(): chunk text plus a fused, recency-weighted
 * relevance score and an optional human-readable age bucket.
 */
export interface SearchResult {
  text: string;
  role: string;
  /** Fused RRF score, recency-weighted and rescaled into roughly 0..1 */
  score: number;
  source_type: string;
  source_id: string;
  agent_id: string;
  created_at: string;
  /** Age bucket: <3d fresh, <7d recent, <14d aging, else stale */
  freshness?: "fresh" | "recent" | "aging" | "stale";
}
|
|
76
|
+
|
|
77
|
+
/** Snapshot of store counts and capture progress for status reporting. */
export interface CrystalStatus {
  chunks: number;
  memories: number;
  sources: number;
  agents: string[];
  /** ISO timestamp of the oldest chunk, or null when the store is empty */
  oldestChunk: string | null;
  /** ISO timestamp of the newest chunk, or null when the store is empty */
  newestChunk: string | null;
  embeddingProvider: string;
  dataDir: string;
  capturedSessions: number;
  latestCapture: string | null;
}
|
|
89
|
+
|
|
90
|
+
// ─── Source Indexing Types (optional feature) ─────────────────────────────
|
|
91
|
+
|
|
92
|
+
/**
 * A named set of files on disk indexed into the crystal (optional feature).
 * Mirrors the `source_collections` table.
 */
export interface SourceCollection {
  /** sqlite rowid; absent before insert */
  id?: number;
  name: string;
  root_path: string;
  glob_patterns: string; // JSON array of include globs
  ignore_patterns: string; // JSON array of ignore globs
  file_count: number;
  chunk_count: number;
  /** ISO timestamp of the last sync, or null if never synced */
  last_sync_at: string | null;
  created_at: string;
}
|
|
103
|
+
|
|
104
|
+
/**
 * One indexed file within a SourceCollection; the content hash drives
 * change detection on re-sync. Mirrors the `source_files` table.
 */
export interface SourceFile {
  /** sqlite rowid; absent before insert */
  id?: number;
  collection_id: number;
  file_path: string; // relative to collection root
  file_hash: string; // SHA-256 of content
  file_size: number;
  chunk_count: number;
  last_indexed_at: string;
}
|
|
113
|
+
|
|
114
|
+
/** Aggregate status across all source collections. */
export interface SourcesStatus {
  collections: Array<{
    name: string;
    root_path: string;
    file_count: number;
    chunk_count: number;
    /** ISO timestamp of the last sync, or null if never synced */
    last_sync_at: string | null;
  }>;
  total_files: number;
  total_chunks: number;
}
|
|
125
|
+
|
|
126
|
+
/** Outcome of syncing one source collection. */
export interface SyncResult {
  /** Collection name */
  collection: string;
  /** Files newly indexed */
  added: number;
  /** Files re-indexed because their hash changed */
  updated: number;
  /** Files removed from the index */
  removed: number;
  chunks_added: number;
  duration_ms: number;
}
|
|
134
|
+
|
|
135
|
+
// ─── Embedding Providers ───────────────────────────────────────────────────
|
|
136
|
+
|
|
137
|
+
async function embedOpenAI(texts: string[], apiKey: string, model: string): Promise<number[][]> {
|
|
138
|
+
return new Promise((resolve, reject) => {
|
|
139
|
+
const body = JSON.stringify({ input: texts, model });
|
|
140
|
+
const req = https.request({
|
|
141
|
+
hostname: 'api.openai.com',
|
|
142
|
+
path: '/v1/embeddings',
|
|
143
|
+
method: 'POST',
|
|
144
|
+
headers: {
|
|
145
|
+
'Content-Type': 'application/json',
|
|
146
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
147
|
+
'Content-Length': Buffer.byteLength(body),
|
|
148
|
+
},
|
|
149
|
+
timeout: 30000,
|
|
150
|
+
}, (res) => {
|
|
151
|
+
let data = '';
|
|
152
|
+
res.on('data', (chunk) => data += chunk);
|
|
153
|
+
res.on('end', () => {
|
|
154
|
+
if (res.statusCode !== 200) {
|
|
155
|
+
reject(new Error(`OpenAI API error ${res.statusCode}: ${data.slice(0, 200)}`));
|
|
156
|
+
return;
|
|
157
|
+
}
|
|
158
|
+
const parsed = JSON.parse(data);
|
|
159
|
+
resolve(parsed.data.map((d: any) => d.embedding));
|
|
160
|
+
});
|
|
161
|
+
});
|
|
162
|
+
req.on('error', reject);
|
|
163
|
+
req.on('timeout', () => { req.destroy(); reject(new Error('OpenAI timeout')); });
|
|
164
|
+
req.write(body);
|
|
165
|
+
req.end();
|
|
166
|
+
});
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
async function embedOllama(texts: string[], host: string, model: string): Promise<number[][]> {
|
|
170
|
+
const results: number[][] = [];
|
|
171
|
+
for (const text of texts) {
|
|
172
|
+
const result = await new Promise<number[]>((resolve, reject) => {
|
|
173
|
+
const url = new URL('/api/embeddings', host);
|
|
174
|
+
const body = JSON.stringify({ model, prompt: text });
|
|
175
|
+
const req = http.request({
|
|
176
|
+
hostname: url.hostname,
|
|
177
|
+
port: url.port,
|
|
178
|
+
path: url.pathname,
|
|
179
|
+
method: 'POST',
|
|
180
|
+
headers: {
|
|
181
|
+
'Content-Type': 'application/json',
|
|
182
|
+
'Content-Length': Buffer.byteLength(body),
|
|
183
|
+
},
|
|
184
|
+
timeout: 15000,
|
|
185
|
+
}, (res) => {
|
|
186
|
+
let data = '';
|
|
187
|
+
res.on('data', (chunk) => data += chunk);
|
|
188
|
+
res.on('end', () => {
|
|
189
|
+
if (res.statusCode !== 200) {
|
|
190
|
+
reject(new Error(`Ollama error ${res.statusCode}: ${data.slice(0, 200)}`));
|
|
191
|
+
return;
|
|
192
|
+
}
|
|
193
|
+
resolve(JSON.parse(data).embedding);
|
|
194
|
+
});
|
|
195
|
+
});
|
|
196
|
+
req.on('error', reject);
|
|
197
|
+
req.on('timeout', () => { req.destroy(); reject(new Error('Ollama timeout')); });
|
|
198
|
+
req.write(body);
|
|
199
|
+
req.end();
|
|
200
|
+
});
|
|
201
|
+
results.push(result);
|
|
202
|
+
}
|
|
203
|
+
return results;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
async function embedGoogle(texts: string[], apiKey: string, model: string): Promise<number[][]> {
|
|
207
|
+
return new Promise((resolve, reject) => {
|
|
208
|
+
const body = JSON.stringify({
|
|
209
|
+
requests: texts.map(text => ({ model: `models/${model}`, content: { parts: [{ text }] } })),
|
|
210
|
+
});
|
|
211
|
+
const req = https.request({
|
|
212
|
+
hostname: 'generativelanguage.googleapis.com',
|
|
213
|
+
path: `/v1beta/models/${model}:batchEmbedContents?key=${apiKey}`,
|
|
214
|
+
method: 'POST',
|
|
215
|
+
headers: {
|
|
216
|
+
'Content-Type': 'application/json',
|
|
217
|
+
'Content-Length': Buffer.byteLength(body),
|
|
218
|
+
},
|
|
219
|
+
timeout: 30000,
|
|
220
|
+
}, (res) => {
|
|
221
|
+
let data = '';
|
|
222
|
+
res.on('data', (chunk) => data += chunk);
|
|
223
|
+
res.on('end', () => {
|
|
224
|
+
if (res.statusCode !== 200) {
|
|
225
|
+
reject(new Error(`Google API error ${res.statusCode}: ${data.slice(0, 200)}`));
|
|
226
|
+
return;
|
|
227
|
+
}
|
|
228
|
+
const parsed = JSON.parse(data);
|
|
229
|
+
resolve(parsed.embeddings.map((e: any) => e.values));
|
|
230
|
+
});
|
|
231
|
+
});
|
|
232
|
+
req.on('error', reject);
|
|
233
|
+
req.on('timeout', () => { req.destroy(); reject(new Error('Google timeout')); });
|
|
234
|
+
req.write(body);
|
|
235
|
+
req.end();
|
|
236
|
+
});
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
// ─── Crystal Core ──────────────────────────────────────────────────────────
|
|
240
|
+
|
|
241
|
+
/**
 * Crystal: the memory store. Dual-writes chunks to LanceDB (safety net) and
 * sqlite (source of truth: chunks + FTS5 + sqlite-vec); serves hybrid search.
 */
export class Crystal {
  private config: CrystalConfig;
  // LanceDB connection (dual-write safety net); opened in init().
  private lanceDb: lancedb.Connection | null = null;
  // sqlite handle (chunks, FTS5, sqlite-vec, metadata); opened in init().
  private sqliteDb: Database.Database | null = null;
  // LanceDB 'chunks' table; opened in init() if present, else created on first ingest.
  private chunksTable: lancedb.Table | null = null;
  // Dimensionality of chunks_vec embeddings; null until detected or first ingest.
  private vecDimensions: number | null = null;
|
|
247
|
+
|
|
248
|
+
/** Store config and make sure the data root exists; DB handles open in init(). */
constructor(config: CrystalConfig) {
  this.config = config;
  const root = config.dataDir;
  if (!existsSync(root)) mkdirSync(root, { recursive: true });
}
|
|
254
|
+
|
|
255
|
+
// ── Initialization ──
|
|
256
|
+
|
|
257
|
+
async init(): Promise<void> {
|
|
258
|
+
const lanceDir = join(this.config.dataDir, 'lance');
|
|
259
|
+
const sqlitePath = join(this.config.dataDir, 'crystal.db');
|
|
260
|
+
|
|
261
|
+
if (!existsSync(lanceDir)) mkdirSync(lanceDir, { recursive: true });
|
|
262
|
+
|
|
263
|
+
this.lanceDb = await lancedb.connect(lanceDir);
|
|
264
|
+
this.sqliteDb = new Database(sqlitePath);
|
|
265
|
+
this.sqliteDb.pragma('journal_mode = WAL');
|
|
266
|
+
|
|
267
|
+
// Load sqlite-vec extension for vector search
|
|
268
|
+
sqliteVec.load(this.sqliteDb);
|
|
269
|
+
|
|
270
|
+
this.initSqliteTables();
|
|
271
|
+
this.initChunksTables();
|
|
272
|
+
await this.initLanceTables();
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
/**
 * Create the metadata schema: ingest sources, per-agent capture state,
 * distilled memories, a small knowledge graph (entities + relationships),
 * and the optional source-file indexing tables. Idempotent — every statement
 * is CREATE ... IF NOT EXISTS, so this is safe to run on every init().
 */
private initSqliteTables(): void {
  const db = this.sqliteDb!;

  db.exec(`
    CREATE TABLE IF NOT EXISTS sources (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      type TEXT NOT NULL,
      uri TEXT NOT NULL,
      title TEXT,
      agent_id TEXT NOT NULL,
      metadata TEXT DEFAULT '{}',
      ingested_at TEXT NOT NULL,
      chunk_count INTEGER DEFAULT 0
    );

    CREATE TABLE IF NOT EXISTS capture_state (
      agent_id TEXT NOT NULL,
      source_id TEXT NOT NULL,
      last_message_count INTEGER DEFAULT 0,
      capture_count INTEGER DEFAULT 0,
      last_capture_at TEXT,
      PRIMARY KEY (agent_id, source_id)
    );

    CREATE TABLE IF NOT EXISTS memories (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      text TEXT NOT NULL,
      category TEXT NOT NULL DEFAULT 'fact',
      confidence REAL NOT NULL DEFAULT 1.0,
      source_ids TEXT DEFAULT '[]',
      status TEXT NOT NULL DEFAULT 'active',
      created_at TEXT NOT NULL,
      updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS entities (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      name TEXT NOT NULL UNIQUE,
      type TEXT NOT NULL DEFAULT 'concept',
      description TEXT,
      properties TEXT DEFAULT '{}',
      created_at TEXT NOT NULL,
      updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS relationships (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      source_id INTEGER NOT NULL REFERENCES entities(id),
      target_id INTEGER NOT NULL REFERENCES entities(id),
      type TEXT NOT NULL,
      description TEXT,
      weight REAL DEFAULT 1.0,
      valid_from TEXT NOT NULL,
      valid_until TEXT,
      created_at TEXT NOT NULL
    );

    CREATE INDEX IF NOT EXISTS idx_sources_agent ON sources(agent_id);
    CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
    CREATE INDEX IF NOT EXISTS idx_entities_name ON entities(name);
    CREATE INDEX IF NOT EXISTS idx_relationships_source ON relationships(source_id);
    CREATE INDEX IF NOT EXISTS idx_relationships_target ON relationships(target_id);

    -- Source file indexing (optional feature)
    CREATE TABLE IF NOT EXISTS source_collections (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      name TEXT NOT NULL UNIQUE,
      root_path TEXT NOT NULL,
      glob_patterns TEXT NOT NULL DEFAULT '["**/*"]',
      ignore_patterns TEXT NOT NULL DEFAULT '[]',
      file_count INTEGER DEFAULT 0,
      chunk_count INTEGER DEFAULT 0,
      last_sync_at TEXT,
      created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS source_files (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      collection_id INTEGER NOT NULL REFERENCES source_collections(id) ON DELETE CASCADE,
      file_path TEXT NOT NULL,
      file_hash TEXT NOT NULL,
      file_size INTEGER NOT NULL,
      chunk_count INTEGER DEFAULT 0,
      last_indexed_at TEXT NOT NULL
    );

    CREATE UNIQUE INDEX IF NOT EXISTS idx_source_files_path ON source_files(collection_id, file_path);
    CREATE INDEX IF NOT EXISTS idx_source_files_collection ON source_files(collection_id);
  `);
}
|
|
365
|
+
|
|
366
|
+
/**
 * Create the search-side schema: the chunks table, its FTS5 shadow table
 * (kept in sync by an insert trigger), and — if a previous run already
 * created chunks_vec — recover its embedding dimensionality. The vec table
 * itself is NOT created here: vec0 tables are fixed-dimension, so creation
 * waits for the first embedding (see ensureVecTable).
 */
private initChunksTables(): void {
  const db = this.sqliteDb!;

  // Chunks table: text + metadata (replaces LanceDB for search reads)
  db.exec(`
    CREATE TABLE IF NOT EXISTS chunks (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      text TEXT NOT NULL,
      text_hash TEXT NOT NULL,
      role TEXT,
      source_type TEXT,
      source_id TEXT,
      agent_id TEXT,
      token_count INTEGER,
      created_at TEXT NOT NULL
    );

    CREATE INDEX IF NOT EXISTS idx_chunks_agent ON chunks(agent_id);
    CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_type);
    CREATE INDEX IF NOT EXISTS idx_chunks_hash ON chunks(text_hash);
    CREATE INDEX IF NOT EXISTS idx_chunks_created ON chunks(created_at);

    -- FTS5 full-text search table
    CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
      text,
      tokenize='porter unicode61'
    );

    -- Sync trigger: populate FTS on chunk insert
    CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks
    BEGIN
      INSERT INTO chunks_fts(rowid, text) VALUES (NEW.id, NEW.text);
    END;
  `);

  // Check if chunks_vec exists and get its dimensions
  const vecTable = db.prepare(
    `SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'`
  ).get() as any;

  if (vecTable) {
    // Vec table exists, figure out its dimensions from existing data
    try {
      const row = db.prepare('SELECT embedding FROM chunks_vec LIMIT 1').get() as any;
      if (row?.embedding) {
        // Float32Array: 4 bytes per dimension
        this.vecDimensions = (row.embedding as Buffer).length / 4;
      }
    } catch {
      // Empty table or error, dimensions will be set on first ingest
    }
  }
}
|
|
419
|
+
|
|
420
|
+
private ensureVecTable(dimensions: number): void {
|
|
421
|
+
const db = this.sqliteDb!;
|
|
422
|
+
const existing = db.prepare(
|
|
423
|
+
`SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'`
|
|
424
|
+
).get();
|
|
425
|
+
|
|
426
|
+
if (!existing) {
|
|
427
|
+
db.exec(`
|
|
428
|
+
CREATE VIRTUAL TABLE chunks_vec USING vec0(
|
|
429
|
+
chunk_id INTEGER PRIMARY KEY,
|
|
430
|
+
embedding float[${dimensions}] distance_metric=cosine
|
|
431
|
+
);
|
|
432
|
+
`);
|
|
433
|
+
}
|
|
434
|
+
this.vecDimensions = dimensions;
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
private async initLanceTables(): Promise<void> {
|
|
438
|
+
const db = this.lanceDb!;
|
|
439
|
+
const tableNames = await db.tableNames();
|
|
440
|
+
|
|
441
|
+
if (tableNames.includes('chunks')) {
|
|
442
|
+
this.chunksTable = await db.openTable('chunks');
|
|
443
|
+
}
|
|
444
|
+
// Table created on first ingest (needs embedding dimensions)
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
// ── Embedding ──
|
|
448
|
+
|
|
449
|
+
async embed(texts: string[]): Promise<number[][]> {
|
|
450
|
+
if (texts.length === 0) return [];
|
|
451
|
+
const cfg = this.config;
|
|
452
|
+
|
|
453
|
+
switch (cfg.embeddingProvider) {
|
|
454
|
+
case 'openai': {
|
|
455
|
+
if (!cfg.openaiApiKey) throw new Error('OpenAI API key required');
|
|
456
|
+
const model = cfg.openaiModel || 'text-embedding-3-small';
|
|
457
|
+
// OpenAI has a 300K token limit per request. Sub-batch to stay safe.
|
|
458
|
+
// ~4 chars per token, cap at ~200K tokens (~800K chars) per batch.
|
|
459
|
+
const maxCharsPerBatch = 800000;
|
|
460
|
+
const results: number[][] = [];
|
|
461
|
+
let batch: string[] = [];
|
|
462
|
+
let batchChars = 0;
|
|
463
|
+
|
|
464
|
+
for (const text of texts) {
|
|
465
|
+
if (batchChars + text.length > maxCharsPerBatch && batch.length > 0) {
|
|
466
|
+
results.push(...await embedOpenAI(batch, cfg.openaiApiKey!, model));
|
|
467
|
+
batch = [];
|
|
468
|
+
batchChars = 0;
|
|
469
|
+
}
|
|
470
|
+
batch.push(text);
|
|
471
|
+
batchChars += text.length;
|
|
472
|
+
}
|
|
473
|
+
if (batch.length > 0) {
|
|
474
|
+
results.push(...await embedOpenAI(batch, cfg.openaiApiKey!, model));
|
|
475
|
+
}
|
|
476
|
+
return results;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
case 'ollama':
|
|
480
|
+
return embedOllama(texts, cfg.ollamaHost || 'http://localhost:11434', cfg.ollamaModel || 'nomic-embed-text');
|
|
481
|
+
|
|
482
|
+
case 'google':
|
|
483
|
+
if (!cfg.googleApiKey) throw new Error('Google API key required');
|
|
484
|
+
return embedGoogle(texts, cfg.googleApiKey, cfg.googleModel || 'text-embedding-004');
|
|
485
|
+
|
|
486
|
+
default:
|
|
487
|
+
throw new Error(`Unknown embedding provider: ${cfg.embeddingProvider}`);
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
// ── Chunking ──
|
|
492
|
+
|
|
493
|
+
/**
 * Split text into ~targetTokens-sized chunks with overlapTokens of overlap,
 * preferring paragraph boundaries, then sentence boundaries. Sizes are
 * estimated at ~4 chars per token; no real tokenizer is used. Returns []
 * for empty input.
 */
chunkText(text: string, targetTokens = 400, overlapTokens = 80): string[] {
  const targetChars = targetTokens * 4;
  const overlapChars = overlapTokens * 4;
  const chunks: string[] = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + targetChars, text.length);

    if (end < text.length) {
      // Try paragraph boundary first.
      // Only accept breaks in the back half of the window, so a chunk never
      // shrinks below ~50% of the target size.
      const minBreak = start + Math.floor(targetChars * 0.5);
      const paraBreak = text.lastIndexOf('\n\n', end);
      if (paraBreak > minBreak) {
        end = paraBreak;
      } else {
        // Try sentence boundary
        const sentBreak = text.lastIndexOf('. ', end);
        if (sentBreak > minBreak) {
          end = sentBreak + 1; // +1 keeps the period inside this chunk
        }
      }
    }

    const chunk = text.slice(start, end).trim();
    if (chunk.length > 0) chunks.push(chunk);

    if (end >= text.length) break;
    start = end - overlapChars;
    // Non-progress guard: if overlapping would rewind to (or before) the
    // start of the window just emitted, drop the overlap for this step.
    if (start <= (chunks.length > 0 ? end - targetChars : 0)) {
      start = end;
    }
  }

  return chunks;
}
|
|
529
|
+
|
|
530
|
+
// ── Ingest ──
|
|
531
|
+
|
|
532
|
+
async ingest(chunks: Chunk[]): Promise<number> {
|
|
533
|
+
if (chunks.length === 0) return 0;
|
|
534
|
+
const db = this.sqliteDb!;
|
|
535
|
+
|
|
536
|
+
// 1. Dedup: skip chunks whose text already exists (by SHA-256 hash)
|
|
537
|
+
const newChunks = chunks.filter(c => {
|
|
538
|
+
const hash = createHash('sha256').update(c.text).digest('hex');
|
|
539
|
+
return !db.prepare('SELECT 1 FROM chunks WHERE text_hash = ?').get(hash);
|
|
540
|
+
});
|
|
541
|
+
|
|
542
|
+
if (newChunks.length === 0) return 0;
|
|
543
|
+
|
|
544
|
+
// 2. Embed
|
|
545
|
+
const texts = newChunks.map(c => c.text);
|
|
546
|
+
const embeddings = await this.embed(texts);
|
|
547
|
+
|
|
548
|
+
// 3. Ensure vec table exists (lazy... needs dimensions from first embedding)
|
|
549
|
+
if (!this.vecDimensions && embeddings.length > 0) {
|
|
550
|
+
this.ensureVecTable(embeddings[0].length);
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
// 4. Write to sqlite-vec (chunks table trigger populates FTS automatically)
|
|
554
|
+
const insertChunk = db.prepare(`
|
|
555
|
+
INSERT INTO chunks (text, text_hash, role, source_type, source_id, agent_id, token_count, created_at)
|
|
556
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
557
|
+
`);
|
|
558
|
+
const insertVec = db.prepare(`
|
|
559
|
+
INSERT INTO chunks_vec (chunk_id, embedding) VALUES (?, ?)
|
|
560
|
+
`);
|
|
561
|
+
|
|
562
|
+
const transaction = db.transaction(() => {
|
|
563
|
+
for (let i = 0; i < newChunks.length; i++) {
|
|
564
|
+
const c = newChunks[i];
|
|
565
|
+
const hash = createHash('sha256').update(c.text).digest('hex');
|
|
566
|
+
const result = insertChunk.run(
|
|
567
|
+
c.text, hash, c.role, c.source_type, c.source_id,
|
|
568
|
+
c.agent_id, c.token_count, c.created_at || new Date().toISOString()
|
|
569
|
+
);
|
|
570
|
+
// sqlite-vec requires BigInt for INTEGER PRIMARY KEY
|
|
571
|
+
const chunkId = typeof result.lastInsertRowid === 'bigint'
|
|
572
|
+
? result.lastInsertRowid
|
|
573
|
+
: BigInt(result.lastInsertRowid);
|
|
574
|
+
insertVec.run(chunkId, new Float32Array(embeddings[i]));
|
|
575
|
+
}
|
|
576
|
+
});
|
|
577
|
+
transaction();
|
|
578
|
+
|
|
579
|
+
// 5. Dual-write: also write to LanceDB (safety net during transition)
|
|
580
|
+
const records = newChunks.map((chunk, i) => ({
|
|
581
|
+
text: chunk.text,
|
|
582
|
+
vector: embeddings[i],
|
|
583
|
+
role: chunk.role,
|
|
584
|
+
source_type: chunk.source_type,
|
|
585
|
+
source_id: chunk.source_id,
|
|
586
|
+
agent_id: chunk.agent_id,
|
|
587
|
+
token_count: chunk.token_count,
|
|
588
|
+
created_at: chunk.created_at || new Date().toISOString(),
|
|
589
|
+
}));
|
|
590
|
+
|
|
591
|
+
try {
|
|
592
|
+
if (!this.chunksTable) {
|
|
593
|
+
this.chunksTable = await this.lanceDb!.createTable('chunks', records);
|
|
594
|
+
} else {
|
|
595
|
+
await this.chunksTable.add(records);
|
|
596
|
+
}
|
|
597
|
+
} catch (err) {
|
|
598
|
+
// LanceDB write failure is non-fatal during transition
|
|
599
|
+
console.warn('LanceDB dual-write failed (non-fatal):', (err as Error).message);
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
return newChunks.length;
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
// ── Recency helpers ──
|
|
606
|
+
|
|
607
|
+
/**
 * Linear recency decay with a floor at 0.5: ~50 days to reach the floor,
 * so old chunks lose at most half their score but never fully disappear,
 * while fresh context wins ties.
 */
private recencyWeight(ageDays: number): number {
  const decayed = 1.0 - ageDays * 0.01;
  return decayed < 0.5 ? 0.5 : decayed;
}
|
|
612
|
+
|
|
613
|
+
/** Bucket an age in days into the human-readable freshness label used in results. */
private freshnessLabel(ageDays: number): "fresh" | "recent" | "aging" | "stale" {
  if (ageDays >= 14) return "stale";
  if (ageDays >= 7) return "aging";
  if (ageDays >= 3) return "recent";
  return "fresh";
}
|
|
619
|
+
|
|
620
|
+
// ── Search (Hybrid: BM25 + Vector + RRF fusion + Recency) ──
|
|
621
|
+
|
|
622
|
+
async search(query: string, limit = 5, filter?: { agent_id?: string; source_type?: string }): Promise<SearchResult[]> {
|
|
623
|
+
const db = this.sqliteDb!;
|
|
624
|
+
|
|
625
|
+
// Check if sqlite-vec has been populated (migration complete)
|
|
626
|
+
const sqliteChunks = (db.prepare('SELECT COUNT(*) as count FROM chunks').get() as any)?.count || 0;
|
|
627
|
+
let lanceChunks = 0;
|
|
628
|
+
if (this.chunksTable) {
|
|
629
|
+
try { lanceChunks = await this.chunksTable.countRows(); } catch {}
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
// Use LanceDB fallback if sqlite-vec is empty OR has far fewer chunks than LanceDB
|
|
633
|
+
// (migration not yet done). Once migration runs, sqlite-vec count will match.
|
|
634
|
+
if (sqliteChunks === 0 || (lanceChunks > 0 && sqliteChunks < lanceChunks * 0.5)) {
|
|
635
|
+
return this.searchLanceFallback(query, limit, filter);
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
const [embedding] = await this.embed([query]);
|
|
639
|
+
const fetchLimit = Math.max(limit * 3, 30);
|
|
640
|
+
|
|
641
|
+
// Run FTS and vector search, then fuse with RRF
|
|
642
|
+
const vecResults = this.searchVec(embedding, fetchLimit, filter);
|
|
643
|
+
const ftsResults = this.searchFTS(query, fetchLimit, filter);
|
|
644
|
+
const fused = this.reciprocalRankFusion([ftsResults, vecResults], [1.0, 1.0]);
|
|
645
|
+
|
|
646
|
+
// Apply recency weighting on top of fused scores
|
|
647
|
+
const now = Date.now();
|
|
648
|
+
const scored = fused.map(r => {
|
|
649
|
+
const ageDays = r.created_at ? (now - new Date(r.created_at).getTime()) / 86400000 : 0;
|
|
650
|
+
const recency = r.created_at ? this.recencyWeight(ageDays) : 1;
|
|
651
|
+
// RRF scores max at ~0.08. Rescale to match old cosine range (0.3-0.6)
|
|
652
|
+
// so models treat the results as meaningful. Ranking is unchanged.
|
|
653
|
+
const rescaled = Math.min(r.score * recency * 8, 1.0);
|
|
654
|
+
return {
|
|
655
|
+
...r,
|
|
656
|
+
score: rescaled,
|
|
657
|
+
freshness: r.created_at ? this.freshnessLabel(ageDays) : undefined,
|
|
658
|
+
};
|
|
659
|
+
});
|
|
660
|
+
|
|
661
|
+
return scored.sort((a, b) => b.score - a.score).slice(0, limit);
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
/** Vector search via sqlite-vec. Two-step pattern: MATCH first, then JOIN. */
|
|
665
|
+
private searchVec(embedding: number[], limit: number, filter?: { agent_id?: string; source_type?: string }): SearchResult[] {
|
|
666
|
+
const db = this.sqliteDb!;
|
|
667
|
+
|
|
668
|
+
if (!this.vecDimensions) return [];
|
|
669
|
+
|
|
670
|
+
// Step 1: sqlite-vec MATCH (no JOINs! Virtual tables hang with JOINs.)
|
|
671
|
+
// See: https://github.com/tobi/qmd/pull/23
|
|
672
|
+
const vecRows = db.prepare(`
|
|
673
|
+
SELECT chunk_id, distance
|
|
674
|
+
FROM chunks_vec
|
|
675
|
+
WHERE embedding MATCH ? AND k = ?
|
|
676
|
+
`).all(new Float32Array(embedding), limit) as Array<{ chunk_id: number; distance: number }>;
|
|
677
|
+
|
|
678
|
+
if (vecRows.length === 0) return [];
|
|
679
|
+
|
|
680
|
+
// Step 2: Look up chunk metadata with a separate query
|
|
681
|
+
const ids = vecRows.map(r => r.chunk_id);
|
|
682
|
+
const distMap = new Map(vecRows.map(r => [r.chunk_id, r.distance]));
|
|
683
|
+
|
|
684
|
+
const placeholders = ids.map(() => '?').join(',');
|
|
685
|
+
let sql = `SELECT id, text, role, source_type, source_id, agent_id, created_at FROM chunks WHERE id IN (${placeholders})`;
|
|
686
|
+
const params: any[] = [...ids];
|
|
687
|
+
|
|
688
|
+
if (filter?.agent_id) { sql += ' AND agent_id = ?'; params.push(filter.agent_id); }
|
|
689
|
+
if (filter?.source_type) { sql += ' AND source_type = ?'; params.push(filter.source_type); }
|
|
690
|
+
|
|
691
|
+
const rows = db.prepare(sql).all(...params) as Array<{
|
|
692
|
+
id: number; text: string; role: string; source_type: string;
|
|
693
|
+
source_id: string; agent_id: string; created_at: string;
|
|
694
|
+
}>;
|
|
695
|
+
|
|
696
|
+
return rows.map(row => ({
|
|
697
|
+
text: row.text,
|
|
698
|
+
role: row.role,
|
|
699
|
+
score: 1 - (distMap.get(row.id) || 1), // cosine similarity from distance
|
|
700
|
+
source_type: row.source_type,
|
|
701
|
+
source_id: row.source_id,
|
|
702
|
+
agent_id: row.agent_id,
|
|
703
|
+
created_at: row.created_at,
|
|
704
|
+
}));
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
/** Full-text search via FTS5 with BM25 scoring. */
|
|
708
|
+
private searchFTS(query: string, limit: number, filter?: { agent_id?: string; source_type?: string }): SearchResult[] {
|
|
709
|
+
const db = this.sqliteDb!;
|
|
710
|
+
const ftsQuery = this.buildFTS5Query(query);
|
|
711
|
+
if (!ftsQuery) return [];
|
|
712
|
+
|
|
713
|
+
let sql = `
|
|
714
|
+
SELECT c.id, c.text, c.role, c.source_type, c.source_id, c.agent_id, c.created_at,
|
|
715
|
+
bm25(chunks_fts) as bm25_score
|
|
716
|
+
FROM chunks_fts f
|
|
717
|
+
JOIN chunks c ON c.id = f.rowid
|
|
718
|
+
WHERE chunks_fts MATCH ?
|
|
719
|
+
`;
|
|
720
|
+
const params: any[] = [ftsQuery];
|
|
721
|
+
|
|
722
|
+
if (filter?.agent_id) { sql += ' AND c.agent_id = ?'; params.push(filter.agent_id); }
|
|
723
|
+
if (filter?.source_type) { sql += ' AND c.source_type = ?'; params.push(filter.source_type); }
|
|
724
|
+
|
|
725
|
+
sql += ' ORDER BY bm25_score LIMIT ?';
|
|
726
|
+
params.push(limit);
|
|
727
|
+
|
|
728
|
+
const rows = db.prepare(sql).all(...params) as Array<{
|
|
729
|
+
id: number; text: string; role: string; source_type: string;
|
|
730
|
+
source_id: string; agent_id: string; created_at: string; bm25_score: number;
|
|
731
|
+
}>;
|
|
732
|
+
|
|
733
|
+
return rows.map(row => ({
|
|
734
|
+
text: row.text,
|
|
735
|
+
role: row.role,
|
|
736
|
+
// BM25 scores are negative (lower = better). Normalize to [0..1).
|
|
737
|
+
// |x| / (1 + |x|) maps: strong(-10)->0.91, medium(-2)->0.67, weak(-0.5)->0.33
|
|
738
|
+
score: Math.abs(row.bm25_score) / (1 + Math.abs(row.bm25_score)),
|
|
739
|
+
source_type: row.source_type,
|
|
740
|
+
source_id: row.source_id,
|
|
741
|
+
agent_id: row.agent_id,
|
|
742
|
+
created_at: row.created_at,
|
|
743
|
+
}));
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
/** Build a safe FTS5 query from user input. */
|
|
747
|
+
private buildFTS5Query(query: string): string | null {
|
|
748
|
+
const terms = query.split(/\s+/)
|
|
749
|
+
.map(t => t.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase())
|
|
750
|
+
.filter(t => t.length > 0);
|
|
751
|
+
if (terms.length === 0) return null;
|
|
752
|
+
if (terms.length === 1) return `"${terms[0]}"*`;
|
|
753
|
+
return terms.map(t => `"${t}"*`).join(' AND ');
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
/**
|
|
757
|
+
* Reciprocal Rank Fusion. Ported from QMD (MIT License, Tobi Lutke, 2024-2026).
|
|
758
|
+
* Fuses multiple ranked result lists into one using RRF scoring.
|
|
759
|
+
* Uses text content as dedup key (instead of QMD's file path).
|
|
760
|
+
*/
|
|
761
|
+
private reciprocalRankFusion(
|
|
762
|
+
resultLists: SearchResult[][],
|
|
763
|
+
weights: number[] = [],
|
|
764
|
+
k: number = 60
|
|
765
|
+
): SearchResult[] {
|
|
766
|
+
const scores = new Map<string, { result: SearchResult; rrfScore: number; topRank: number }>();
|
|
767
|
+
|
|
768
|
+
for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
|
|
769
|
+
const list = resultLists[listIdx];
|
|
770
|
+
if (!list) continue;
|
|
771
|
+
const weight = weights[listIdx] ?? 1.0;
|
|
772
|
+
|
|
773
|
+
for (let rank = 0; rank < list.length; rank++) {
|
|
774
|
+
const result = list[rank];
|
|
775
|
+
if (!result) continue;
|
|
776
|
+
const rrfContribution = weight / (k + rank + 1);
|
|
777
|
+
// Dedup by text content (truncated for perf)
|
|
778
|
+
const dedup = result.text.slice(0, 200);
|
|
779
|
+
const existing = scores.get(dedup);
|
|
780
|
+
|
|
781
|
+
if (existing) {
|
|
782
|
+
existing.rrfScore += rrfContribution;
|
|
783
|
+
existing.topRank = Math.min(existing.topRank, rank);
|
|
784
|
+
} else {
|
|
785
|
+
scores.set(dedup, {
|
|
786
|
+
result,
|
|
787
|
+
rrfScore: rrfContribution,
|
|
788
|
+
topRank: rank,
|
|
789
|
+
});
|
|
790
|
+
}
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
// Top-rank bonus: reward results that appear at or near the top of any list
|
|
795
|
+
for (const entry of scores.values()) {
|
|
796
|
+
if (entry.topRank === 0) {
|
|
797
|
+
entry.rrfScore += 0.05;
|
|
798
|
+
} else if (entry.topRank <= 2) {
|
|
799
|
+
entry.rrfScore += 0.02;
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
return Array.from(scores.values())
|
|
804
|
+
.sort((a, b) => b.rrfScore - a.rrfScore)
|
|
805
|
+
.map(e => ({ ...e.result, score: e.rrfScore }));
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
/** LanceDB fallback for search (used when sqlite-vec tables are empty, pre-migration). */
|
|
809
|
+
private async searchLanceFallback(query: string, limit: number, filter?: { agent_id?: string; source_type?: string }): Promise<SearchResult[]> {
|
|
810
|
+
if (!this.chunksTable) return [];
|
|
811
|
+
|
|
812
|
+
const [embedding] = await this.embed([query]);
|
|
813
|
+
const fetchLimit = Math.max(limit * 3, 30);
|
|
814
|
+
let queryBuilder = this.chunksTable.vectorSearch(embedding).distanceType('cosine').limit(fetchLimit);
|
|
815
|
+
|
|
816
|
+
if (filter?.agent_id) {
|
|
817
|
+
queryBuilder = queryBuilder.where(`agent_id = '${filter.agent_id}'`);
|
|
818
|
+
}
|
|
819
|
+
if (filter?.source_type) {
|
|
820
|
+
queryBuilder = queryBuilder.where(`source_type = '${filter.source_type}'`);
|
|
821
|
+
}
|
|
822
|
+
|
|
823
|
+
const results = await queryBuilder.toArray();
|
|
824
|
+
const now = Date.now();
|
|
825
|
+
|
|
826
|
+
return results.map((row: any) => {
|
|
827
|
+
const cosine = row._distance != null ? 1 - row._distance : 0;
|
|
828
|
+
const createdAt = row.created_at || '';
|
|
829
|
+
const ageDays = createdAt ? (now - new Date(createdAt).getTime()) / 86400000 : 0;
|
|
830
|
+
const weight = createdAt ? this.recencyWeight(ageDays) : 1;
|
|
831
|
+
|
|
832
|
+
return {
|
|
833
|
+
text: row.text,
|
|
834
|
+
role: row.role,
|
|
835
|
+
score: cosine * weight,
|
|
836
|
+
source_type: row.source_type,
|
|
837
|
+
source_id: row.source_id,
|
|
838
|
+
agent_id: row.agent_id,
|
|
839
|
+
created_at: createdAt,
|
|
840
|
+
freshness: createdAt ? this.freshnessLabel(ageDays) : undefined,
|
|
841
|
+
};
|
|
842
|
+
})
|
|
843
|
+
.sort((a, b) => b.score - a.score)
|
|
844
|
+
.slice(0, limit);
|
|
845
|
+
}
|
|
846
|
+
|
|
847
|
+
// ── Remember (explicit fact storage) ──
|
|
848
|
+
|
|
849
|
+
async remember(text: string, category: Memory['category'] = 'fact'): Promise<number> {
|
|
850
|
+
const db = this.sqliteDb!;
|
|
851
|
+
const now = new Date().toISOString();
|
|
852
|
+
|
|
853
|
+
const stmt = db.prepare(`
|
|
854
|
+
INSERT INTO memories (text, category, confidence, source_ids, status, created_at, updated_at)
|
|
855
|
+
VALUES (?, ?, 1.0, '[]', 'active', ?, ?)
|
|
856
|
+
`);
|
|
857
|
+
const result = stmt.run(text, category, now, now);
|
|
858
|
+
|
|
859
|
+
// Also ingest as a chunk for vector search
|
|
860
|
+
await this.ingest([{
|
|
861
|
+
text,
|
|
862
|
+
role: 'system',
|
|
863
|
+
source_type: 'manual',
|
|
864
|
+
source_id: `memory:${result.lastInsertRowid}`,
|
|
865
|
+
agent_id: 'system',
|
|
866
|
+
token_count: Math.ceil(text.length / 4),
|
|
867
|
+
created_at: now,
|
|
868
|
+
}]);
|
|
869
|
+
|
|
870
|
+
return result.lastInsertRowid as number;
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
// ── Forget (deprecate a memory) ──
|
|
874
|
+
|
|
875
|
+
forget(memoryId: number): boolean {
|
|
876
|
+
const db = this.sqliteDb!;
|
|
877
|
+
const now = new Date().toISOString();
|
|
878
|
+
const result = db.prepare(`
|
|
879
|
+
UPDATE memories SET status = 'deprecated', updated_at = ? WHERE id = ? AND status = 'active'
|
|
880
|
+
`).run(now, memoryId);
|
|
881
|
+
return result.changes > 0;
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
// ── Status ──
|
|
885
|
+
|
|
886
|
+
async status(): Promise<CrystalStatus> {
|
|
887
|
+
const db = this.sqliteDb!;
|
|
888
|
+
|
|
889
|
+
// Show the higher of sqlite-vec or LanceDB count during transition
|
|
890
|
+
const sqliteChunks = (db.prepare('SELECT COUNT(*) as count FROM chunks').get() as any)?.count || 0;
|
|
891
|
+
let lanceChunks = 0;
|
|
892
|
+
if (this.chunksTable) {
|
|
893
|
+
try { lanceChunks = await this.chunksTable.countRows(); } catch {}
|
|
894
|
+
}
|
|
895
|
+
const chunks = Math.max(sqliteChunks, lanceChunks);
|
|
896
|
+
|
|
897
|
+
// Time range from sqlite chunks table
|
|
898
|
+
const oldest = (db.prepare('SELECT MIN(created_at) as ts FROM chunks').get() as any)?.ts || null;
|
|
899
|
+
const newest = (db.prepare('SELECT MAX(created_at) as ts FROM chunks').get() as any)?.ts || null;
|
|
900
|
+
|
|
901
|
+
const memories = (db.prepare('SELECT COUNT(*) as count FROM memories WHERE status = ?').get('active') as any)?.count || 0;
|
|
902
|
+
const sources = (db.prepare('SELECT COUNT(*) as count FROM sources').get() as any)?.count || 0;
|
|
903
|
+
|
|
904
|
+
// Get agents from chunks, sources, and capture_state tables
|
|
905
|
+
const chunkAgentRows = db.prepare('SELECT DISTINCT agent_id FROM chunks WHERE agent_id IS NOT NULL').all() as any[];
|
|
906
|
+
const sourceAgentRows = db.prepare('SELECT DISTINCT agent_id FROM sources').all() as any[];
|
|
907
|
+
const captureAgentRows = db.prepare('SELECT DISTINCT agent_id FROM capture_state').all() as any[];
|
|
908
|
+
const agents = [...new Set([
|
|
909
|
+
...chunkAgentRows.map((r: any) => r.agent_id),
|
|
910
|
+
...sourceAgentRows.map((r: any) => r.agent_id),
|
|
911
|
+
...captureAgentRows.map((r: any) => r.agent_id),
|
|
912
|
+
])];
|
|
913
|
+
|
|
914
|
+
// Capture state summary
|
|
915
|
+
const captureInfo = db.prepare(
|
|
916
|
+
'SELECT COUNT(*) as count, MAX(last_capture_at) as latest FROM capture_state'
|
|
917
|
+
).get() as any;
|
|
918
|
+
|
|
919
|
+
return {
|
|
920
|
+
chunks,
|
|
921
|
+
memories,
|
|
922
|
+
sources,
|
|
923
|
+
agents,
|
|
924
|
+
oldestChunk: oldest,
|
|
925
|
+
newestChunk: newest,
|
|
926
|
+
embeddingProvider: this.config.embeddingProvider,
|
|
927
|
+
dataDir: this.config.dataDir,
|
|
928
|
+
capturedSessions: captureInfo?.count || 0,
|
|
929
|
+
latestCapture: captureInfo?.latest || null,
|
|
930
|
+
};
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
// ── Capture State (for incremental ingestion) ──
|
|
934
|
+
|
|
935
|
+
getCaptureState(agentId: string, sourceId: string): { lastMessageCount: number; captureCount: number } {
|
|
936
|
+
const db = this.sqliteDb!;
|
|
937
|
+
const row = db.prepare('SELECT last_message_count, capture_count FROM capture_state WHERE agent_id = ? AND source_id = ?')
|
|
938
|
+
.get(agentId, sourceId) as any;
|
|
939
|
+
if (!row) return { lastMessageCount: 0, captureCount: 0 };
|
|
940
|
+
return {
|
|
941
|
+
lastMessageCount: row.last_message_count,
|
|
942
|
+
captureCount: row.capture_count,
|
|
943
|
+
};
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
setCaptureState(agentId: string, sourceId: string, messageCount: number, captureCount: number): void {
|
|
947
|
+
const db = this.sqliteDb!;
|
|
948
|
+
db.prepare(`
|
|
949
|
+
INSERT OR REPLACE INTO capture_state (agent_id, source_id, last_message_count, capture_count, last_capture_at)
|
|
950
|
+
VALUES (?, ?, ?, ?, ?)
|
|
951
|
+
`).run(agentId, sourceId, messageCount, captureCount, new Date().toISOString());
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
// ── Source File Indexing (optional feature) ──
|
|
955
|
+
//
|
|
956
|
+
// Add directories as "collections", sync to index/re-index changed files.
|
|
957
|
+
// All source chunks get source_type='file' so they're searchable alongside
|
|
958
|
+
// conversations and memories. Nothing here is required... you can use MC
|
|
959
|
+
// without ever touching sources.
|
|
960
|
+
|
|
961
|
+
  // Default patterns for files worth indexing.
  // scanDirectory() reduces these to fast lookups: "**/*.<ext>" patterns become
  // an extension set, "**/<name>" patterns become an exact-filename set.
  private static readonly DEFAULT_INCLUDE = [
    // application source code
    '**/*.ts', '**/*.js', '**/*.tsx', '**/*.jsx',
    '**/*.py', '**/*.rs', '**/*.go', '**/*.java',
    // docs, data, and config formats
    '**/*.md', '**/*.txt', '**/*.json', '**/*.yaml', '**/*.yml',
    '**/*.toml', '**/*.sh', '**/*.bash', '**/*.zsh',
    // web assets and query languages
    '**/*.css', '**/*.html', '**/*.svg',
    '**/*.sql', '**/*.graphql',
    // systems and other languages
    '**/*.c', '**/*.cpp', '**/*.h', '**/*.hpp',
    '**/*.swift', '**/*.kt', '**/*.rb',
    // NOTE(review): these two have a second dot, so scanDirectory's
    // extension regex (/\*\*\/\*(\.\w+)$/) will not match them — confirm intent
    '**/*.env.example', '**/*.gitignore',
    // well-known exact filenames (matched by name, not extension)
    '**/Makefile', '**/Dockerfile', '**/Cargo.toml',
    '**/package.json', '**/tsconfig.json',
  ];
|
|
975
|
+
|
|
976
|
+
  // Default patterns to skip during indexing.
  // "**/<dir>/**" entries prune whole directories; extension/name entries skip
  // individual files. Binary, generated, and secret-bearing paths stay out.
  private static readonly DEFAULT_IGNORE = [
    // dependency and build output directories
    '**/node_modules/**', '**/.git/**', '**/dist/**', '**/build/**',
    '**/.next/**', '**/.cache/**', '**/coverage/**', '**/__pycache__/**',
    '**/target/**', '**/vendor/**', '**/.venv/**',
    // lockfiles and minified/generated artifacts
    '**/*.lock', '**/package-lock.json', '**/yarn.lock', '**/bun.lockb',
    '**/*.min.js', '**/*.min.css', '**/*.map',
    // images and fonts
    '**/*.png', '**/*.jpg', '**/*.jpeg', '**/*.gif', '**/*.ico', '**/*.webp',
    '**/*.woff', '**/*.woff2', '**/*.ttf', '**/*.eot',
    // audio/video and archives
    '**/*.mp3', '**/*.mp4', '**/*.wav', '**/*.ogg', '**/*.webm',
    '**/*.zip', '**/*.tar', '**/*.gz', '**/*.br',
    // databases and bulk data
    '**/*.sqlite', '**/*.db', '**/*.lance/**',
    '**/*.jsonl',
    // never index secrets
    '**/secrets/**', '**/.env',
  ];
|
|
990
|
+
|
|
991
|
+
/** Add a directory as a source collection for indexing. */
|
|
992
|
+
async sourcesAdd(rootPath: string, name: string, options?: {
|
|
993
|
+
include?: string[];
|
|
994
|
+
ignore?: string[];
|
|
995
|
+
}): Promise<SourceCollection> {
|
|
996
|
+
const db = this.sqliteDb!;
|
|
997
|
+
const now = new Date().toISOString();
|
|
998
|
+
const includePatterns = JSON.stringify(options?.include || Crystal.DEFAULT_INCLUDE);
|
|
999
|
+
const ignorePatterns = JSON.stringify(options?.ignore || Crystal.DEFAULT_IGNORE);
|
|
1000
|
+
|
|
1001
|
+
// Check if collection already exists
|
|
1002
|
+
const existing = db.prepare('SELECT * FROM source_collections WHERE name = ?').get(name) as any;
|
|
1003
|
+
if (existing) {
|
|
1004
|
+
throw new Error(`Collection "${name}" already exists. Use sourcesSync() to update it.`);
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
db.prepare(`
|
|
1008
|
+
INSERT INTO source_collections (name, root_path, glob_patterns, ignore_patterns, created_at)
|
|
1009
|
+
VALUES (?, ?, ?, ?, ?)
|
|
1010
|
+
`).run(name, rootPath, includePatterns, ignorePatterns, now);
|
|
1011
|
+
|
|
1012
|
+
const row = db.prepare('SELECT * FROM source_collections WHERE name = ?').get(name) as any;
|
|
1013
|
+
return row as SourceCollection;
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
/** Remove a source collection and its file records. Chunks remain in LanceDB. */
|
|
1017
|
+
sourcesRemove(name: string): boolean {
|
|
1018
|
+
const db = this.sqliteDb!;
|
|
1019
|
+
const col = db.prepare('SELECT id FROM source_collections WHERE name = ?').get(name) as any;
|
|
1020
|
+
if (!col) return false;
|
|
1021
|
+
db.prepare('DELETE FROM source_files WHERE collection_id = ?').run(col.id);
|
|
1022
|
+
db.prepare('DELETE FROM source_collections WHERE id = ?').run(col.id);
|
|
1023
|
+
return true;
|
|
1024
|
+
}
|
|
1025
|
+
|
|
1026
|
+
/** Sync a collection: scan files, detect changes, re-index what changed. */
|
|
1027
|
+
async sourcesSync(name: string, options?: { dryRun?: boolean; batchSize?: number }): Promise<SyncResult> {
|
|
1028
|
+
const db = this.sqliteDb!;
|
|
1029
|
+
const startTime = Date.now();
|
|
1030
|
+
const batchSize = options?.batchSize || 20;
|
|
1031
|
+
|
|
1032
|
+
const col = db.prepare('SELECT * FROM source_collections WHERE name = ?').get(name) as any;
|
|
1033
|
+
if (!col) throw new Error(`Collection "${name}" not found. Add it first with sourcesAdd().`);
|
|
1034
|
+
|
|
1035
|
+
const includePatterns: string[] = JSON.parse(col.glob_patterns);
|
|
1036
|
+
const ignorePatterns: string[] = JSON.parse(col.ignore_patterns);
|
|
1037
|
+
|
|
1038
|
+
// Scan the directory for matching files
|
|
1039
|
+
const files = this.scanDirectory(col.root_path, includePatterns, ignorePatterns);
|
|
1040
|
+
|
|
1041
|
+
// Get existing file records
|
|
1042
|
+
const existingFiles = new Map<string, { id: number; file_hash: string }>();
|
|
1043
|
+
const rows = db.prepare('SELECT id, file_path, file_hash FROM source_files WHERE collection_id = ?').all(col.id) as any[];
|
|
1044
|
+
for (const row of rows) {
|
|
1045
|
+
existingFiles.set(row.file_path, { id: row.id, file_hash: row.file_hash });
|
|
1046
|
+
}
|
|
1047
|
+
|
|
1048
|
+
let added = 0;
|
|
1049
|
+
let updated = 0;
|
|
1050
|
+
let removed = 0;
|
|
1051
|
+
let chunksAdded = 0;
|
|
1052
|
+
const now = new Date().toISOString();
|
|
1053
|
+
|
|
1054
|
+
// Collect files that need indexing
|
|
1055
|
+
const toIndex: Array<{ relPath: string; absPath: string; hash: string; size: number; isUpdate: boolean }> = [];
|
|
1056
|
+
|
|
1057
|
+
for (const absPath of files) {
|
|
1058
|
+
const relPath = relative(col.root_path, absPath);
|
|
1059
|
+
let content: string;
|
|
1060
|
+
try {
|
|
1061
|
+
content = readFileSync(absPath, 'utf-8');
|
|
1062
|
+
} catch {
|
|
1063
|
+
continue; // skip binary or unreadable files
|
|
1064
|
+
}
|
|
1065
|
+
|
|
1066
|
+
// Skip files > 500KB (likely generated or data)
|
|
1067
|
+
const stat = statSync(absPath);
|
|
1068
|
+
if (stat.size > 500 * 1024) continue;
|
|
1069
|
+
|
|
1070
|
+
const hash = createHash('sha256').update(content).digest('hex');
|
|
1071
|
+
const existing = existingFiles.get(relPath);
|
|
1072
|
+
|
|
1073
|
+
if (existing) {
|
|
1074
|
+
existingFiles.delete(relPath); // mark as seen
|
|
1075
|
+
if (existing.file_hash === hash) continue; // unchanged
|
|
1076
|
+
toIndex.push({ relPath, absPath, hash, size: stat.size, isUpdate: true });
|
|
1077
|
+
} else {
|
|
1078
|
+
toIndex.push({ relPath, absPath, hash, size: stat.size, isUpdate: false });
|
|
1079
|
+
}
|
|
1080
|
+
}
|
|
1081
|
+
|
|
1082
|
+
if (options?.dryRun) {
|
|
1083
|
+
const newFiles = toIndex.filter(f => !f.isUpdate).length;
|
|
1084
|
+
const updatedFiles = toIndex.filter(f => f.isUpdate).length;
|
|
1085
|
+
return {
|
|
1086
|
+
collection: name,
|
|
1087
|
+
added: newFiles,
|
|
1088
|
+
updated: updatedFiles,
|
|
1089
|
+
removed: existingFiles.size,
|
|
1090
|
+
chunks_added: 0,
|
|
1091
|
+
duration_ms: Date.now() - startTime,
|
|
1092
|
+
};
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
// Process files in batches
|
|
1096
|
+
for (let i = 0; i < toIndex.length; i += batchSize) {
|
|
1097
|
+
const batch = toIndex.slice(i, i + batchSize);
|
|
1098
|
+
const allChunks: Chunk[] = [];
|
|
1099
|
+
|
|
1100
|
+
for (const file of batch) {
|
|
1101
|
+
const content = readFileSync(file.absPath, 'utf-8');
|
|
1102
|
+
const ext = extname(file.absPath);
|
|
1103
|
+
const fileName = basename(file.absPath);
|
|
1104
|
+
|
|
1105
|
+
// Prepend file path context to help search
|
|
1106
|
+
const header = `File: ${file.relPath}\n\n`;
|
|
1107
|
+
const textChunks = this.chunkText(header + content, 400, 80);
|
|
1108
|
+
const fileChunks: Chunk[] = textChunks.map(text => ({
|
|
1109
|
+
text,
|
|
1110
|
+
role: 'system' as const,
|
|
1111
|
+
source_type: 'file',
|
|
1112
|
+
source_id: `file:${name}:${file.relPath}`,
|
|
1113
|
+
agent_id: 'system',
|
|
1114
|
+
token_count: Math.ceil(text.length / 4),
|
|
1115
|
+
created_at: now,
|
|
1116
|
+
}));
|
|
1117
|
+
|
|
1118
|
+
allChunks.push(...fileChunks);
|
|
1119
|
+
|
|
1120
|
+
// Update or insert file record
|
|
1121
|
+
if (file.isUpdate) {
|
|
1122
|
+
db.prepare(`
|
|
1123
|
+
UPDATE source_files SET file_hash = ?, file_size = ?, chunk_count = ?, last_indexed_at = ?
|
|
1124
|
+
WHERE collection_id = ? AND file_path = ?
|
|
1125
|
+
`).run(file.hash, file.size, fileChunks.length, now, col.id, file.relPath);
|
|
1126
|
+
updated++;
|
|
1127
|
+
} else {
|
|
1128
|
+
db.prepare(`
|
|
1129
|
+
INSERT INTO source_files (collection_id, file_path, file_hash, file_size, chunk_count, last_indexed_at)
|
|
1130
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
1131
|
+
`).run(col.id, file.relPath, file.hash, file.size, fileChunks.length, now);
|
|
1132
|
+
added++;
|
|
1133
|
+
}
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
// Embed and ingest the batch
|
|
1137
|
+
if (allChunks.length > 0) {
|
|
1138
|
+
const ingested = await this.ingest(allChunks);
|
|
1139
|
+
chunksAdded += ingested;
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
1142
|
+
|
|
1143
|
+
// Remove files that no longer exist on disk
|
|
1144
|
+
for (const [relPath, { id }] of existingFiles) {
|
|
1145
|
+
db.prepare('DELETE FROM source_files WHERE id = ?').run(id);
|
|
1146
|
+
removed++;
|
|
1147
|
+
}
|
|
1148
|
+
|
|
1149
|
+
// Update collection stats
|
|
1150
|
+
const fileCount = (db.prepare('SELECT COUNT(*) as count FROM source_files WHERE collection_id = ?').get(col.id) as any).count;
|
|
1151
|
+
const chunkCount = (db.prepare('SELECT SUM(chunk_count) as total FROM source_files WHERE collection_id = ?').get(col.id) as any).total || 0;
|
|
1152
|
+
db.prepare('UPDATE source_collections SET file_count = ?, chunk_count = ?, last_sync_at = ? WHERE id = ?')
|
|
1153
|
+
.run(fileCount, chunkCount, now, col.id);
|
|
1154
|
+
|
|
1155
|
+
return {
|
|
1156
|
+
collection: name,
|
|
1157
|
+
added,
|
|
1158
|
+
updated,
|
|
1159
|
+
removed,
|
|
1160
|
+
chunks_added: chunksAdded,
|
|
1161
|
+
duration_ms: Date.now() - startTime,
|
|
1162
|
+
};
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
/** Get status of all source collections. */
|
|
1166
|
+
sourcesStatus(): SourcesStatus {
|
|
1167
|
+
const db = this.sqliteDb!;
|
|
1168
|
+
const collections = db.prepare('SELECT name, root_path, file_count, chunk_count, last_sync_at FROM source_collections').all() as any[];
|
|
1169
|
+
const totalFiles = collections.reduce((sum, c) => sum + c.file_count, 0);
|
|
1170
|
+
const totalChunks = collections.reduce((sum, c) => sum + c.chunk_count, 0);
|
|
1171
|
+
|
|
1172
|
+
return {
|
|
1173
|
+
collections: collections.map(c => ({
|
|
1174
|
+
name: c.name,
|
|
1175
|
+
root_path: c.root_path,
|
|
1176
|
+
file_count: c.file_count,
|
|
1177
|
+
chunk_count: c.chunk_count,
|
|
1178
|
+
last_sync_at: c.last_sync_at,
|
|
1179
|
+
})),
|
|
1180
|
+
total_files: totalFiles,
|
|
1181
|
+
total_chunks: totalChunks,
|
|
1182
|
+
};
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
/** Scan a directory recursively, matching include/ignore patterns. */
|
|
1186
|
+
private scanDirectory(rootPath: string, includePatterns: string[], ignorePatterns: string[]): string[] {
|
|
1187
|
+
const results: string[] = [];
|
|
1188
|
+
|
|
1189
|
+
// Build sets of allowed extensions and ignored directory names for fast filtering
|
|
1190
|
+
const allowedExtensions = new Set<string>();
|
|
1191
|
+
const allowedExactNames = new Set<string>();
|
|
1192
|
+
for (const pattern of includePatterns) {
|
|
1193
|
+
// Extract extension from patterns like "**/*.ts"
|
|
1194
|
+
const extMatch = pattern.match(/\*\*\/\*(\.\w+)$/);
|
|
1195
|
+
if (extMatch) {
|
|
1196
|
+
allowedExtensions.add(extMatch[1]);
|
|
1197
|
+
}
|
|
1198
|
+
// Exact filenames like "**/Makefile"
|
|
1199
|
+
const nameMatch = pattern.match(/\*\*\/([^*]+)$/);
|
|
1200
|
+
if (nameMatch && !nameMatch[1].startsWith('*.')) {
|
|
1201
|
+
allowedExactNames.add(nameMatch[1]);
|
|
1202
|
+
}
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
const ignoreDirs = new Set<string>();
|
|
1206
|
+
for (const pattern of ignorePatterns) {
|
|
1207
|
+
// Extract directory names from patterns like "**/node_modules/**"
|
|
1208
|
+
const dirMatch = pattern.match(/\*\*\/([^/*]+)\/\*\*$/);
|
|
1209
|
+
if (dirMatch) {
|
|
1210
|
+
ignoreDirs.add(dirMatch[1]);
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1214
|
+
const ignoreFiles = new Set<string>();
|
|
1215
|
+
for (const pattern of ignorePatterns) {
|
|
1216
|
+
// Extract filenames/extensions to ignore
|
|
1217
|
+
const fileMatch = pattern.match(/\*\*\/\*(\.\w+)$/);
|
|
1218
|
+
if (fileMatch) {
|
|
1219
|
+
ignoreFiles.add(fileMatch[1]);
|
|
1220
|
+
}
|
|
1221
|
+
const exactMatch = pattern.match(/\*\*\/([^*]+)$/);
|
|
1222
|
+
if (exactMatch && !exactMatch[1].includes('/')) {
|
|
1223
|
+
ignoreFiles.add(exactMatch[1]);
|
|
1224
|
+
}
|
|
1225
|
+
}
|
|
1226
|
+
|
|
1227
|
+
const walk = (dir: string) => {
|
|
1228
|
+
let entries: string[];
|
|
1229
|
+
try {
|
|
1230
|
+
entries = readdirSync(dir);
|
|
1231
|
+
} catch {
|
|
1232
|
+
return;
|
|
1233
|
+
}
|
|
1234
|
+
|
|
1235
|
+
for (const entry of entries) {
|
|
1236
|
+
const fullPath = join(dir, entry);
|
|
1237
|
+
let stat;
|
|
1238
|
+
try {
|
|
1239
|
+
stat = statSync(fullPath);
|
|
1240
|
+
} catch {
|
|
1241
|
+
continue;
|
|
1242
|
+
}
|
|
1243
|
+
|
|
1244
|
+
if (stat.isDirectory()) {
|
|
1245
|
+
if (ignoreDirs.has(entry)) continue;
|
|
1246
|
+
if (entry.startsWith('.')) continue; // skip hidden dirs
|
|
1247
|
+
walk(fullPath);
|
|
1248
|
+
} else if (stat.isFile()) {
|
|
1249
|
+
const ext = extname(entry);
|
|
1250
|
+
if (ignoreFiles.has(ext)) continue;
|
|
1251
|
+
if (ignoreFiles.has(entry)) continue;
|
|
1252
|
+
|
|
1253
|
+
if (allowedExtensions.has(ext) || allowedExactNames.has(entry)) {
|
|
1254
|
+
results.push(fullPath);
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
}
|
|
1258
|
+
};
|
|
1259
|
+
|
|
1260
|
+
walk(rootPath);
|
|
1261
|
+
return results;
|
|
1262
|
+
}
|
|
1263
|
+
|
|
1264
|
+
// ── Cleanup ──
|
|
1265
|
+
|
|
1266
|
+
close(): void {
|
|
1267
|
+
this.sqliteDb?.close();
|
|
1268
|
+
// LanceDB connection doesn't need explicit close
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
// ─── Config Resolution ─────────────────────────────────────────────────────
|
|
1273
|
+
//
|
|
1274
|
+
// Key resolution order:
|
|
1275
|
+
// 1. Explicit overrides (programmatic)
|
|
1276
|
+
// 2. process.env (set by op-secrets plugin inside OpenClaw, or by user)
|
|
1277
|
+
// 3. .env file in data dir (~/.openclaw/memory-crystal/.env)
|
|
1278
|
+
// 4. 1Password via op CLI (if SA token exists at ~/.openclaw/secrets/op-sa-token)
|
|
1279
|
+
//
|
|
1280
|
+
// Two setup paths:
|
|
1281
|
+
// • .env file: cp .env.example ~/.openclaw/memory-crystal/.env && edit
|
|
1282
|
+
// • 1Password: keys auto-resolved from "Agent Secrets" vault
|
|
1283
|
+
|
|
1284
|
+
export function resolveConfig(overrides?: Partial<CrystalConfig>): CrystalConfig {
|
|
1285
|
+
const openclawHome = process.env.OPENCLAW_HOME || join(process.env.HOME || '', '.openclaw');
|
|
1286
|
+
|
|
1287
|
+
// dataDir resolution order:
|
|
1288
|
+
// 1. Explicit override (always wins)
|
|
1289
|
+
// 2. CRYSTAL_DATA_DIR env var (for testing)
|
|
1290
|
+
// 3. ~/.ldm/memory/ if crystal.db exists there (post-migration)
|
|
1291
|
+
// 4. Legacy ~/.openclaw/memory-crystal/ (pre-migration fallback)
|
|
1292
|
+
let dataDir = overrides?.dataDir || process.env.CRYSTAL_DATA_DIR;
|
|
1293
|
+
if (!dataDir) {
|
|
1294
|
+
const ldmMemory = join(process.env.HOME || '', '.ldm', 'memory');
|
|
1295
|
+
if (existsSync(join(ldmMemory, 'crystal.db'))) {
|
|
1296
|
+
dataDir = ldmMemory;
|
|
1297
|
+
} else {
|
|
1298
|
+
dataDir = join(openclawHome, 'memory-crystal');
|
|
1299
|
+
}
|
|
1300
|
+
}
|
|
1301
|
+
|
|
1302
|
+
// Load .env file if it exists (doesn't override existing env vars)
|
|
1303
|
+
loadEnvFile(join(dataDir, '.env'));
|
|
1304
|
+
|
|
1305
|
+
// Resolve API keys: env/.env first, then 1Password fallback
|
|
1306
|
+
const openaiApiKey = overrides?.openaiApiKey || process.env.OPENAI_API_KEY || opRead(openclawHome, 'OpenAI API', 'api key');
|
|
1307
|
+
const googleApiKey = overrides?.googleApiKey || process.env.GOOGLE_API_KEY || opRead(openclawHome, 'Google AI', 'api key');
|
|
1308
|
+
const remoteToken = overrides?.remoteToken || process.env.CRYSTAL_REMOTE_TOKEN || opRead(openclawHome, 'Memory Crystal Remote', 'token');
|
|
1309
|
+
|
|
1310
|
+
return {
|
|
1311
|
+
dataDir,
|
|
1312
|
+
embeddingProvider: (overrides?.embeddingProvider || process.env.CRYSTAL_EMBEDDING_PROVIDER || 'openai') as CrystalConfig['embeddingProvider'],
|
|
1313
|
+
openaiApiKey,
|
|
1314
|
+
openaiModel: overrides?.openaiModel || process.env.CRYSTAL_OPENAI_MODEL || 'text-embedding-3-small',
|
|
1315
|
+
ollamaHost: overrides?.ollamaHost || process.env.CRYSTAL_OLLAMA_HOST || 'http://localhost:11434',
|
|
1316
|
+
ollamaModel: overrides?.ollamaModel || process.env.CRYSTAL_OLLAMA_MODEL || 'nomic-embed-text',
|
|
1317
|
+
googleApiKey,
|
|
1318
|
+
googleModel: overrides?.googleModel || process.env.CRYSTAL_GOOGLE_MODEL || 'text-embedding-004',
|
|
1319
|
+
remoteUrl: overrides?.remoteUrl || process.env.CRYSTAL_REMOTE_URL,
|
|
1320
|
+
remoteToken,
|
|
1321
|
+
};
|
|
1322
|
+
}
|
|
1323
|
+
|
|
1324
|
+
/** Load a .env file into process.env. Does NOT override existing vars. */
|
|
1325
|
+
function loadEnvFile(path: string): void {
|
|
1326
|
+
if (!existsSync(path)) return;
|
|
1327
|
+
const content = readFileSync(path, 'utf8');
|
|
1328
|
+
for (const line of content.split('\n')) {
|
|
1329
|
+
const trimmed = line.trim();
|
|
1330
|
+
if (!trimmed || trimmed.startsWith('#')) continue;
|
|
1331
|
+
const eqIdx = trimmed.indexOf('=');
|
|
1332
|
+
if (eqIdx === -1) continue;
|
|
1333
|
+
const key = trimmed.slice(0, eqIdx).trim();
|
|
1334
|
+
let value = trimmed.slice(eqIdx + 1).trim();
|
|
1335
|
+
// Strip surrounding quotes
|
|
1336
|
+
if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
|
|
1337
|
+
value = value.slice(1, -1);
|
|
1338
|
+
}
|
|
1339
|
+
if (key && !process.env[key]) {
|
|
1340
|
+
process.env[key] = value;
|
|
1341
|
+
}
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
|
|
1345
|
+
/** Read a secret from 1Password via op CLI. Falls back silently on failure. */
|
|
1346
|
+
function opRead(openclawHome: string, item: string, field: string): string | undefined {
|
|
1347
|
+
try {
|
|
1348
|
+
const saTokenPath = join(openclawHome, 'secrets', 'op-sa-token');
|
|
1349
|
+
if (!existsSync(saTokenPath)) return undefined;
|
|
1350
|
+
const saToken = readFileSync(saTokenPath, 'utf8').trim();
|
|
1351
|
+
return execSync(`op read "op://Agent Secrets/${item}/${field}" 2>/dev/null`, {
|
|
1352
|
+
encoding: 'utf8',
|
|
1353
|
+
env: { ...process.env, OP_SERVICE_ACCOUNT_TOKEN: saToken },
|
|
1354
|
+
timeout: 10000,
|
|
1355
|
+
}).trim() || undefined;
|
|
1356
|
+
} catch {
|
|
1357
|
+
return undefined;
|
|
1358
|
+
}
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1361
|
+
// ─── Remote Crystal (Cloud Mirror Mode) ────────────────────────────────────
|
|
1362
|
+
// When remoteUrl is set, this class talks to the Cloudflare Worker instead
|
|
1363
|
+
// of local SQLite. Same interface as Crystal for search/remember/forget/status/ingest.
|
|
1364
|
+
|
|
1365
|
+
export class RemoteCrystal {
|
|
1366
|
+
private url: string;
|
|
1367
|
+
private token: string;
|
|
1368
|
+
|
|
1369
|
+
constructor(url: string, token: string) {
|
|
1370
|
+
this.url = url.replace(/\/$/, '');
|
|
1371
|
+
this.token = token;
|
|
1372
|
+
}
|
|
1373
|
+
|
|
1374
|
+
async init(): Promise<void> {
|
|
1375
|
+
// No local DB to initialize — just verify the Worker is reachable
|
|
1376
|
+
const resp = await fetch(`${this.url}/health`);
|
|
1377
|
+
if (!resp.ok) {
|
|
1378
|
+
throw new Error(`Remote crystal unreachable: ${resp.status}`);
|
|
1379
|
+
}
|
|
1380
|
+
}
|
|
1381
|
+
|
|
1382
|
+
private async request(path: string, body?: any): Promise<any> {
|
|
1383
|
+
const resp = await fetch(`${this.url}${path}`, {
|
|
1384
|
+
method: body ? 'POST' : 'GET',
|
|
1385
|
+
headers: {
|
|
1386
|
+
'Authorization': `Bearer ${this.token}`,
|
|
1387
|
+
'Content-Type': 'application/json',
|
|
1388
|
+
},
|
|
1389
|
+
...(body ? { body: JSON.stringify(body) } : {}),
|
|
1390
|
+
});
|
|
1391
|
+
|
|
1392
|
+
if (!resp.ok) {
|
|
1393
|
+
const err = await resp.text();
|
|
1394
|
+
throw new Error(`Remote crystal error ${resp.status}: ${err}`);
|
|
1395
|
+
}
|
|
1396
|
+
|
|
1397
|
+
return resp.json();
|
|
1398
|
+
}
|
|
1399
|
+
|
|
1400
|
+
async search(query: string, limit = 5, filter?: { agent_id?: string }): Promise<SearchResult[]> {
|
|
1401
|
+
const data = await this.request('/search', { query, limit, agent_id: filter?.agent_id });
|
|
1402
|
+
return data.results || [];
|
|
1403
|
+
}
|
|
1404
|
+
|
|
1405
|
+
async ingest(chunks: Chunk[]): Promise<number> {
|
|
1406
|
+
const data = await this.request('/ingest', { chunks });
|
|
1407
|
+
return data.ingested || 0;
|
|
1408
|
+
}
|
|
1409
|
+
|
|
1410
|
+
async remember(text: string, category: Memory['category'] = 'fact'): Promise<number> {
|
|
1411
|
+
const data = await this.request('/remember', { text, category });
|
|
1412
|
+
return data.id;
|
|
1413
|
+
}
|
|
1414
|
+
|
|
1415
|
+
forget(memoryId: number): Promise<boolean> {
|
|
1416
|
+
return this.request('/forget', { id: memoryId }).then(d => d.ok);
|
|
1417
|
+
}
|
|
1418
|
+
|
|
1419
|
+
async status(): Promise<CrystalStatus> {
|
|
1420
|
+
const data = await this.request('/status');
|
|
1421
|
+
return {
|
|
1422
|
+
chunks: data.chunks || 0,
|
|
1423
|
+
memories: data.memories || 0,
|
|
1424
|
+
sources: 0,
|
|
1425
|
+
agents: data.agents || [],
|
|
1426
|
+
oldestChunk: data.oldestChunk,
|
|
1427
|
+
newestChunk: data.newestChunk,
|
|
1428
|
+
embeddingProvider: 'remote',
|
|
1429
|
+
dataDir: this.url,
|
|
1430
|
+
capturedSessions: data.capturedSessions || 0,
|
|
1431
|
+
latestCapture: data.newestChunk,
|
|
1432
|
+
};
|
|
1433
|
+
}
|
|
1434
|
+
|
|
1435
|
+
// Expose chunkText from a local Crystal instance for cc-hook to use
|
|
1436
|
+
chunkText(text: string): string[] {
|
|
1437
|
+
// Simple chunking for remote mode — matches Crystal.chunkText() logic
|
|
1438
|
+
const targetChars = 400 * 4; // 400 tokens * ~4 chars
|
|
1439
|
+
const overlapChars = 80 * 4;
|
|
1440
|
+
|
|
1441
|
+
if (text.length <= targetChars) return [text];
|
|
1442
|
+
|
|
1443
|
+
const chunks: string[] = [];
|
|
1444
|
+
let start = 0;
|
|
1445
|
+
while (start < text.length) {
|
|
1446
|
+
let end = start + targetChars;
|
|
1447
|
+
if (end >= text.length) {
|
|
1448
|
+
chunks.push(text.slice(start));
|
|
1449
|
+
break;
|
|
1450
|
+
}
|
|
1451
|
+
// Try to break at paragraph
|
|
1452
|
+
const paraBreak = text.lastIndexOf('\n\n', end);
|
|
1453
|
+
if (paraBreak > start + targetChars * 0.5) end = paraBreak;
|
|
1454
|
+
else {
|
|
1455
|
+
// Try sentence break
|
|
1456
|
+
const sentBreak = text.lastIndexOf('. ', end);
|
|
1457
|
+
if (sentBreak > start + targetChars * 0.5) end = sentBreak + 1;
|
|
1458
|
+
}
|
|
1459
|
+
chunks.push(text.slice(start, end));
|
|
1460
|
+
start = end - overlapChars;
|
|
1461
|
+
}
|
|
1462
|
+
return chunks;
|
|
1463
|
+
}
|
|
1464
|
+
}
|
|
1465
|
+
|
|
1466
|
+
/** Create the appropriate Crystal instance based on config. */
|
|
1467
|
+
export function createCrystal(config: CrystalConfig): Crystal | RemoteCrystal {
|
|
1468
|
+
if (config.remoteUrl && config.remoteToken) {
|
|
1469
|
+
return new RemoteCrystal(config.remoteUrl, config.remoteToken);
|
|
1470
|
+
}
|
|
1471
|
+
return new Crystal(config);
|
|
1472
|
+
}
|