memory-crystal 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.env.example +20 -0
  2. package/CHANGELOG.md +6 -0
  3. package/LETTERS.md +22 -0
  4. package/LICENSE +21 -0
  5. package/README-ENTERPRISE.md +162 -0
  6. package/README-old.md +275 -0
  7. package/README.md +91 -0
  8. package/RELAY.md +88 -0
  9. package/TECHNICAL.md +379 -0
  10. package/ai/dev-updates/2026-02-25--cc-air--phase2-architecture-pivot.md +70 -0
  11. package/ai/dev-updates/2026-02-25--cc-air--phase2-worker-build.md +72 -0
  12. package/ai/dev-updates/2026-02-26--10-25-16--cc-mini--phase2-implementation.md +49 -0
  13. package/ai/dev-updates/2026-02-27--20-30-00--cc-mini--readme-overhaul-and-public-deploy.md +69 -0
  14. package/ai/notes/2026-02-26--cc-air--notes.md +412 -0
  15. package/ai/notes/2026-02-27--cc-mini--grok-feedback.md +44 -0
  16. package/ai/notes/2026-02-27--cc-mini--lesa-feedback.md +45 -0
  17. package/ai/notes/RESEARCH.md +1185 -0
  18. package/ai/notes/salience-research/README.md +29 -0
  19. package/ai/notes/salience-research/eurosla-salience-review.md +64 -0
  20. package/ai/notes/salience-research/full-research-summary.md +269 -0
  21. package/ai/notes/salience-research/salience-levels-diagram.png +0 -0
  22. package/ai/plan/2026-02-27--cc-mini--qr-pairing-spec.md +203 -0
  23. package/ai/plan/_archive/PLAN.md +194 -0
  24. package/ai/plan/_archive/PRD.md +1014 -0
  25. package/ai/plan/cc-plans-duplicates-from-dot-claude/2026-02-26--cc-mini--phase2-implementation-plan.md +245 -0
  26. package/ai/plan/dev-conventions-note.md +70 -0
  27. package/ai/plan/ldm-os-install-and-boot-architecture.md +285 -0
  28. package/ai/plan/memory-crystal-phase2-plan.md +192 -0
  29. package/ai/plan/memory-system-lay-of-the-land.md +214 -0
  30. package/ai/plan/phase2-ephemeral-relay.md +238 -0
  31. package/ai/plan/readme-first.md +68 -0
  32. package/ai/plan/roadmap.md +159 -0
  33. package/ai/todos/PUNCHLIST.md +44 -0
  34. package/ai/todos/README.md +31 -0
  35. package/ai/todos/inboxes/cc-air/2026-02-26--cc-air--post-relay-todos.md +85 -0
  36. package/ai/todos/inboxes/cc-mini/2026-02-26--cc-mini--phase2-status.md +100 -0
  37. package/ai/todos/inboxes/cc-mini/_archive/TODO.md +25 -0
  38. package/ai/todos/inboxes/parker/2026-02-25--cc-air--setup-checklist.md +139 -0
  39. package/ai/todos/inboxes/parker/2026-02-26--cc-mini--phase2-your-moves.md +72 -0
  40. package/dist/cc-hook.d.ts +1 -0
  41. package/dist/cc-hook.js +349 -0
  42. package/dist/chunk-3VFIJYS4.js +818 -0
  43. package/dist/chunk-52QE3YI3.js +1169 -0
  44. package/dist/chunk-AA3OPP4Z.js +432 -0
  45. package/dist/chunk-D3I3ZSE2.js +411 -0
  46. package/dist/chunk-EKSACBTJ.js +1070 -0
  47. package/dist/chunk-F3Y7EL7K.js +83 -0
  48. package/dist/chunk-JWZXYVET.js +1068 -0
  49. package/dist/chunk-KYVWO6ZM.js +1069 -0
  50. package/dist/chunk-L3VHARQH.js +413 -0
  51. package/dist/chunk-LOVAHSQV.js +411 -0
  52. package/dist/chunk-LQOYCAGG.js +446 -0
  53. package/dist/chunk-MK42FMEG.js +147 -0
  54. package/dist/chunk-NIJCVN3O.js +147 -0
  55. package/dist/chunk-O2UITJGH.js +465 -0
  56. package/dist/chunk-PEK6JH65.js +432 -0
  57. package/dist/chunk-PJ6FFKEX.js +77 -0
  58. package/dist/chunk-PLUBBZYR.js +800 -0
  59. package/dist/chunk-SGL6ISBJ.js +1061 -0
  60. package/dist/chunk-UNHVZB5G.js +411 -0
  61. package/dist/chunk-VAFTWSTE.js +1061 -0
  62. package/dist/chunk-XZ3S56RQ.js +1061 -0
  63. package/dist/chunk-Y72C7F6O.js +148 -0
  64. package/dist/cli.d.ts +1 -0
  65. package/dist/cli.js +325 -0
  66. package/dist/core.d.ts +188 -0
  67. package/dist/core.js +12 -0
  68. package/dist/crypto.d.ts +16 -0
  69. package/dist/crypto.js +18 -0
  70. package/dist/dev-update-SZ2Z4WCQ.js +6 -0
  71. package/dist/ldm.d.ts +17 -0
  72. package/dist/ldm.js +12 -0
  73. package/dist/mcp-server.d.ts +1 -0
  74. package/dist/mcp-server.js +250 -0
  75. package/dist/migrate.d.ts +1 -0
  76. package/dist/migrate.js +89 -0
  77. package/dist/mirror-sync.d.ts +1 -0
  78. package/dist/mirror-sync.js +130 -0
  79. package/dist/openclaw.d.ts +5 -0
  80. package/dist/openclaw.js +349 -0
  81. package/dist/poller.d.ts +1 -0
  82. package/dist/poller.js +272 -0
  83. package/dist/summarize.d.ts +19 -0
  84. package/dist/summarize.js +10 -0
  85. package/dist/worker.js +137 -0
  86. package/openclaw.plugin.json +11 -0
  87. package/package.json +40 -0
  88. package/scripts/migrate-lance-to-sqlite.mjs +217 -0
  89. package/skills/memory/SKILL.md +61 -0
  90. package/src/cc-hook.ts +447 -0
  91. package/src/cli.ts +356 -0
  92. package/src/core.ts +1472 -0
  93. package/src/crypto.ts +113 -0
  94. package/src/dev-update.ts +178 -0
  95. package/src/ldm.ts +117 -0
  96. package/src/mcp-server.ts +274 -0
  97. package/src/migrate.ts +104 -0
  98. package/src/mirror-sync.ts +175 -0
  99. package/src/openclaw.ts +250 -0
  100. package/src/poller.ts +345 -0
  101. package/src/summarize.ts +210 -0
  102. package/src/worker.ts +208 -0
  103. package/tsconfig.json +18 -0
  104. package/wrangler.toml +20 -0
package/src/core.ts ADDED
@@ -0,0 +1,1472 @@
1
+ // memory-crystal/core.ts — Pure logic layer. Zero framework dependencies.
2
+ // Hybrid search: sqlite-vec (vectors) + FTS5 (BM25) + RRF fusion + recency.
3
+ // Dual-writes to LanceDB (safety net) and sqlite-vec (source of truth).
4
+ // Search algorithms ported from QMD (MIT, Tobi Lutke, 2024-2026).
5
+ // Config via function params, not globals. Errors: throw, callers catch.
6
+
7
+ import * as lancedb from '@lancedb/lancedb';
8
+ import Database from 'better-sqlite3';
9
+ import * as sqliteVec from 'sqlite-vec';
10
+ import { readFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'node:fs';
11
+ import { execSync } from 'node:child_process';
12
+ import { join, relative, extname, basename } from 'node:path';
13
+ import { createHash } from 'node:crypto';
14
+ import http from 'node:http';
15
+ import https from 'node:https';
16
+
17
// ─── Types ─────────────────────────────────────────────────────────────────

/**
 * Runtime configuration for a Crystal instance. Supplied by the caller;
 * nothing in core reads globals or process.env (see file header).
 */
export interface CrystalConfig {
  /** Root directory for all crystal data */
  dataDir: string;
  /** Embedding provider: 'openai' | 'ollama' | 'google' */
  embeddingProvider: 'openai' | 'ollama' | 'google';
  /** OpenAI API key (required if provider is 'openai') */
  openaiApiKey?: string;
  /** OpenAI embedding model (default: text-embedding-3-small) */
  openaiModel?: string;
  /** Ollama host (default: http://localhost:11434) */
  ollamaHost?: string;
  /** Ollama model (default: nomic-embed-text) */
  ollamaModel?: string;
  /** Google API key (required if provider is 'google') */
  googleApiKey?: string;
  /** Google embedding model (default: text-embedding-004) */
  googleModel?: string;
  /** Remote Worker URL for cloud mirror mode */
  remoteUrl?: string;
  /** Remote auth token */
  remoteToken?: string;
}

/** One unit of ingested text plus its provenance metadata. */
export interface Chunk {
  id?: number;
  text: string;
  embedding?: number[];
  role: 'user' | 'assistant' | 'system';
  source_type: string; // 'conversation' | 'file' | 'imessage' | 'manual'
  source_id: string; // session key, file path, etc.
  agent_id: string; // 'main' (Lēsa), 'claude-code', etc.
  token_count: number;
  created_at: string; // ISO timestamp
}

/** A distilled memory derived from one or more chunks. */
export interface Memory {
  id?: number;
  text: string;
  embedding?: number[];
  category: 'fact' | 'preference' | 'event' | 'opinion' | 'skill';
  confidence: number; // 0-1, decays over time
  source_ids: string; // JSON array of chunk IDs
  status: 'active' | 'deprecated' | 'deleted';
  created_at: string;
  updated_at: string;
}

/** A single hit returned by search(). */
export interface SearchResult {
  text: string;
  role: string;
  score: number; // fused relevance score, rescaled into roughly [0, 1]
  source_type: string;
  source_id: string;
  agent_id: string;
  created_at: string;
  /** Age bucket derived from created_at; absent when the timestamp is missing. */
  freshness?: "fresh" | "recent" | "aging" | "stale";
}

/** Aggregate counters for status reporting. */
export interface CrystalStatus {
  chunks: number;
  memories: number;
  sources: number;
  agents: string[];
  oldestChunk: string | null;
  newestChunk: string | null;
  embeddingProvider: string;
  dataDir: string;
  capturedSessions: number;
  latestCapture: string | null;
}
89
+
90
// ─── Source Indexing Types (optional feature) ─────────────────────────────

/** A configured directory of files to index (root + include/ignore globs). */
export interface SourceCollection {
  id?: number;
  name: string;
  root_path: string;
  glob_patterns: string; // JSON array of include globs
  ignore_patterns: string; // JSON array of ignore globs
  file_count: number;
  chunk_count: number;
  last_sync_at: string | null;
  created_at: string;
}

/** Per-file index bookkeeping; the hash drives change detection on sync. */
export interface SourceFile {
  id?: number;
  collection_id: number;
  file_path: string; // relative to collection root
  file_hash: string; // SHA-256 of content
  file_size: number;
  chunk_count: number;
  last_indexed_at: string;
}

/** Roll-up view over all collections for status commands. */
export interface SourcesStatus {
  collections: Array<{
    name: string;
    root_path: string;
    file_count: number;
    chunk_count: number;
    last_sync_at: string | null;
  }>;
  total_files: number;
  total_chunks: number;
}

/** Outcome of syncing one collection (counts of files touched + timing). */
export interface SyncResult {
  collection: string;
  added: number;
  updated: number;
  removed: number;
  chunks_added: number;
  duration_ms: number;
}
134
+
135
+ // ─── Embedding Providers ───────────────────────────────────────────────────
136
+
137
+ async function embedOpenAI(texts: string[], apiKey: string, model: string): Promise<number[][]> {
138
+ return new Promise((resolve, reject) => {
139
+ const body = JSON.stringify({ input: texts, model });
140
+ const req = https.request({
141
+ hostname: 'api.openai.com',
142
+ path: '/v1/embeddings',
143
+ method: 'POST',
144
+ headers: {
145
+ 'Content-Type': 'application/json',
146
+ 'Authorization': `Bearer ${apiKey}`,
147
+ 'Content-Length': Buffer.byteLength(body),
148
+ },
149
+ timeout: 30000,
150
+ }, (res) => {
151
+ let data = '';
152
+ res.on('data', (chunk) => data += chunk);
153
+ res.on('end', () => {
154
+ if (res.statusCode !== 200) {
155
+ reject(new Error(`OpenAI API error ${res.statusCode}: ${data.slice(0, 200)}`));
156
+ return;
157
+ }
158
+ const parsed = JSON.parse(data);
159
+ resolve(parsed.data.map((d: any) => d.embedding));
160
+ });
161
+ });
162
+ req.on('error', reject);
163
+ req.on('timeout', () => { req.destroy(); reject(new Error('OpenAI timeout')); });
164
+ req.write(body);
165
+ req.end();
166
+ });
167
+ }
168
+
169
+ async function embedOllama(texts: string[], host: string, model: string): Promise<number[][]> {
170
+ const results: number[][] = [];
171
+ for (const text of texts) {
172
+ const result = await new Promise<number[]>((resolve, reject) => {
173
+ const url = new URL('/api/embeddings', host);
174
+ const body = JSON.stringify({ model, prompt: text });
175
+ const req = http.request({
176
+ hostname: url.hostname,
177
+ port: url.port,
178
+ path: url.pathname,
179
+ method: 'POST',
180
+ headers: {
181
+ 'Content-Type': 'application/json',
182
+ 'Content-Length': Buffer.byteLength(body),
183
+ },
184
+ timeout: 15000,
185
+ }, (res) => {
186
+ let data = '';
187
+ res.on('data', (chunk) => data += chunk);
188
+ res.on('end', () => {
189
+ if (res.statusCode !== 200) {
190
+ reject(new Error(`Ollama error ${res.statusCode}: ${data.slice(0, 200)}`));
191
+ return;
192
+ }
193
+ resolve(JSON.parse(data).embedding);
194
+ });
195
+ });
196
+ req.on('error', reject);
197
+ req.on('timeout', () => { req.destroy(); reject(new Error('Ollama timeout')); });
198
+ req.write(body);
199
+ req.end();
200
+ });
201
+ results.push(result);
202
+ }
203
+ return results;
204
+ }
205
+
206
+ async function embedGoogle(texts: string[], apiKey: string, model: string): Promise<number[][]> {
207
+ return new Promise((resolve, reject) => {
208
+ const body = JSON.stringify({
209
+ requests: texts.map(text => ({ model: `models/${model}`, content: { parts: [{ text }] } })),
210
+ });
211
+ const req = https.request({
212
+ hostname: 'generativelanguage.googleapis.com',
213
+ path: `/v1beta/models/${model}:batchEmbedContents?key=${apiKey}`,
214
+ method: 'POST',
215
+ headers: {
216
+ 'Content-Type': 'application/json',
217
+ 'Content-Length': Buffer.byteLength(body),
218
+ },
219
+ timeout: 30000,
220
+ }, (res) => {
221
+ let data = '';
222
+ res.on('data', (chunk) => data += chunk);
223
+ res.on('end', () => {
224
+ if (res.statusCode !== 200) {
225
+ reject(new Error(`Google API error ${res.statusCode}: ${data.slice(0, 200)}`));
226
+ return;
227
+ }
228
+ const parsed = JSON.parse(data);
229
+ resolve(parsed.embeddings.map((e: any) => e.values));
230
+ });
231
+ });
232
+ req.on('error', reject);
233
+ req.on('timeout', () => { req.destroy(); reject(new Error('Google timeout')); });
234
+ req.write(body);
235
+ req.end();
236
+ });
237
+ }
238
+
239
+ // ─── Crystal Core ──────────────────────────────────────────────────────────
240
+
241
+ export class Crystal {
242
+ private config: CrystalConfig;
243
+ private lanceDb: lancedb.Connection | null = null;
244
+ private sqliteDb: Database.Database | null = null;
245
+ private chunksTable: lancedb.Table | null = null;
246
+ private vecDimensions: number | null = null;
247
+
248
+ constructor(config: CrystalConfig) {
249
+ this.config = config;
250
+ if (!existsSync(config.dataDir)) {
251
+ mkdirSync(config.dataDir, { recursive: true });
252
+ }
253
+ }
254
+
255
+ // ── Initialization ──
256
+
257
  /**
   * Open both storage engines and create any missing schema. Must complete
   * before any ingest/search call; chunk/FTS tables are created here, while
   * the vec table waits for the first embedding (needs its dimension count).
   */
  async init(): Promise<void> {
    const lanceDir = join(this.config.dataDir, 'lance');
    const sqlitePath = join(this.config.dataDir, 'crystal.db');

    if (!existsSync(lanceDir)) mkdirSync(lanceDir, { recursive: true });

    this.lanceDb = await lancedb.connect(lanceDir);
    this.sqliteDb = new Database(sqlitePath);
    // WAL lets readers proceed while a writer holds the database.
    this.sqliteDb.pragma('journal_mode = WAL');

    // Load sqlite-vec extension for vector search
    sqliteVec.load(this.sqliteDb);

    this.initSqliteTables();
    this.initChunksTables();
    await this.initLanceTables();
  }
274
+
275
  /**
   * Create the metadata schema. Idempotent (every statement uses
   * IF NOT EXISTS): ingest sources, capture bookkeeping, distilled memories,
   * the entity/relationship graph, and the optional source-file index.
   */
  private initSqliteTables(): void {
    const db = this.sqliteDb!;

    db.exec(`
      CREATE TABLE IF NOT EXISTS sources (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        type TEXT NOT NULL,
        uri TEXT NOT NULL,
        title TEXT,
        agent_id TEXT NOT NULL,
        metadata TEXT DEFAULT '{}',
        ingested_at TEXT NOT NULL,
        chunk_count INTEGER DEFAULT 0
      );

      CREATE TABLE IF NOT EXISTS capture_state (
        agent_id TEXT NOT NULL,
        source_id TEXT NOT NULL,
        last_message_count INTEGER DEFAULT 0,
        capture_count INTEGER DEFAULT 0,
        last_capture_at TEXT,
        PRIMARY KEY (agent_id, source_id)
      );

      CREATE TABLE IF NOT EXISTS memories (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text TEXT NOT NULL,
        category TEXT NOT NULL DEFAULT 'fact',
        confidence REAL NOT NULL DEFAULT 1.0,
        source_ids TEXT DEFAULT '[]',
        status TEXT NOT NULL DEFAULT 'active',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS entities (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL UNIQUE,
        type TEXT NOT NULL DEFAULT 'concept',
        description TEXT,
        properties TEXT DEFAULT '{}',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS relationships (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id INTEGER NOT NULL REFERENCES entities(id),
        target_id INTEGER NOT NULL REFERENCES entities(id),
        type TEXT NOT NULL,
        description TEXT,
        weight REAL DEFAULT 1.0,
        valid_from TEXT NOT NULL,
        valid_until TEXT,
        created_at TEXT NOT NULL
      );

      CREATE INDEX IF NOT EXISTS idx_sources_agent ON sources(agent_id);
      CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
      CREATE INDEX IF NOT EXISTS idx_entities_name ON entities(name);
      CREATE INDEX IF NOT EXISTS idx_relationships_source ON relationships(source_id);
      CREATE INDEX IF NOT EXISTS idx_relationships_target ON relationships(target_id);

      -- Source file indexing (optional feature)
      CREATE TABLE IF NOT EXISTS source_collections (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL UNIQUE,
        root_path TEXT NOT NULL,
        glob_patterns TEXT NOT NULL DEFAULT '["**/*"]',
        ignore_patterns TEXT NOT NULL DEFAULT '[]',
        file_count INTEGER DEFAULT 0,
        chunk_count INTEGER DEFAULT 0,
        last_sync_at TEXT,
        created_at TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS source_files (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        collection_id INTEGER NOT NULL REFERENCES source_collections(id) ON DELETE CASCADE,
        file_path TEXT NOT NULL,
        file_hash TEXT NOT NULL,
        file_size INTEGER NOT NULL,
        chunk_count INTEGER DEFAULT 0,
        last_indexed_at TEXT NOT NULL
      );

      CREATE UNIQUE INDEX IF NOT EXISTS idx_source_files_path ON source_files(collection_id, file_path);
      CREATE INDEX IF NOT EXISTS idx_source_files_collection ON source_files(collection_id);
    `);
  }
365
+
366
  /**
   * Create the chunk store: the chunks table, its FTS5 shadow (kept in sync
   * by an AFTER INSERT trigger), and — when a vec table already exists —
   * recover the embedding dimension count from stored data so later ingests
   * and searches agree on vector width.
   */
  private initChunksTables(): void {
    const db = this.sqliteDb!;

    // Chunks table: text + metadata (replaces LanceDB for search reads)
    db.exec(`
      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text TEXT NOT NULL,
        text_hash TEXT NOT NULL,
        role TEXT,
        source_type TEXT,
        source_id TEXT,
        agent_id TEXT,
        token_count INTEGER,
        created_at TEXT NOT NULL
      );

      CREATE INDEX IF NOT EXISTS idx_chunks_agent ON chunks(agent_id);
      CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_type);
      CREATE INDEX IF NOT EXISTS idx_chunks_hash ON chunks(text_hash);
      CREATE INDEX IF NOT EXISTS idx_chunks_created ON chunks(created_at);

      -- FTS5 full-text search table
      CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
        text,
        tokenize='porter unicode61'
      );

      -- Sync trigger: populate FTS on chunk insert
      CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks
      BEGIN
        INSERT INTO chunks_fts(rowid, text) VALUES (NEW.id, NEW.text);
      END;
    `);

    // Check if chunks_vec exists and get its dimensions
    const vecTable = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'`
    ).get() as any;

    if (vecTable) {
      // Vec table exists, figure out its dimensions from existing data
      try {
        const row = db.prepare('SELECT embedding FROM chunks_vec LIMIT 1').get() as any;
        if (row?.embedding) {
          // Float32Array: 4 bytes per dimension
          this.vecDimensions = (row.embedding as Buffer).length / 4;
        }
      } catch {
        // Empty table or error, dimensions will be set on first ingest
      }
    }
  }
419
+
420
+ private ensureVecTable(dimensions: number): void {
421
+ const db = this.sqliteDb!;
422
+ const existing = db.prepare(
423
+ `SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'`
424
+ ).get();
425
+
426
+ if (!existing) {
427
+ db.exec(`
428
+ CREATE VIRTUAL TABLE chunks_vec USING vec0(
429
+ chunk_id INTEGER PRIMARY KEY,
430
+ embedding float[${dimensions}] distance_metric=cosine
431
+ );
432
+ `);
433
+ }
434
+ this.vecDimensions = dimensions;
435
+ }
436
+
437
+ private async initLanceTables(): Promise<void> {
438
+ const db = this.lanceDb!;
439
+ const tableNames = await db.tableNames();
440
+
441
+ if (tableNames.includes('chunks')) {
442
+ this.chunksTable = await db.openTable('chunks');
443
+ }
444
+ // Table created on first ingest (needs embedding dimensions)
445
+ }
446
+
447
+ // ── Embedding ──
448
+
449
+ async embed(texts: string[]): Promise<number[][]> {
450
+ if (texts.length === 0) return [];
451
+ const cfg = this.config;
452
+
453
+ switch (cfg.embeddingProvider) {
454
+ case 'openai': {
455
+ if (!cfg.openaiApiKey) throw new Error('OpenAI API key required');
456
+ const model = cfg.openaiModel || 'text-embedding-3-small';
457
+ // OpenAI has a 300K token limit per request. Sub-batch to stay safe.
458
+ // ~4 chars per token, cap at ~200K tokens (~800K chars) per batch.
459
+ const maxCharsPerBatch = 800000;
460
+ const results: number[][] = [];
461
+ let batch: string[] = [];
462
+ let batchChars = 0;
463
+
464
+ for (const text of texts) {
465
+ if (batchChars + text.length > maxCharsPerBatch && batch.length > 0) {
466
+ results.push(...await embedOpenAI(batch, cfg.openaiApiKey!, model));
467
+ batch = [];
468
+ batchChars = 0;
469
+ }
470
+ batch.push(text);
471
+ batchChars += text.length;
472
+ }
473
+ if (batch.length > 0) {
474
+ results.push(...await embedOpenAI(batch, cfg.openaiApiKey!, model));
475
+ }
476
+ return results;
477
+ }
478
+
479
+ case 'ollama':
480
+ return embedOllama(texts, cfg.ollamaHost || 'http://localhost:11434', cfg.ollamaModel || 'nomic-embed-text');
481
+
482
+ case 'google':
483
+ if (!cfg.googleApiKey) throw new Error('Google API key required');
484
+ return embedGoogle(texts, cfg.googleApiKey, cfg.googleModel || 'text-embedding-004');
485
+
486
+ default:
487
+ throw new Error(`Unknown embedding provider: ${cfg.embeddingProvider}`);
488
+ }
489
+ }
490
+
491
+ // ── Chunking ──
492
+
493
  /**
   * Split text into overlapping chunks using a rough 4-chars-per-token
   * heuristic. Prefers a paragraph break, then a sentence break, and only
   * cuts mid-text when neither lands past the halfway point of the window.
   *
   * @param text          Text to split.
   * @param targetTokens  Desired chunk size (~400 tokens ≈ 1600 chars).
   * @param overlapTokens Overlap carried into the next chunk.
   * @returns Non-empty trimmed chunks in document order.
   */
  chunkText(text: string, targetTokens = 400, overlapTokens = 80): string[] {
    const targetChars = targetTokens * 4;
    const overlapChars = overlapTokens * 4;
    const chunks: string[] = [];
    let start = 0;

    while (start < text.length) {
      let end = Math.min(start + targetChars, text.length);

      if (end < text.length) {
        // Try paragraph boundary first
        // A break is only accepted past the window midpoint, so chunks never
        // shrink below half the target size.
        const minBreak = start + Math.floor(targetChars * 0.5);
        const paraBreak = text.lastIndexOf('\n\n', end);
        if (paraBreak > minBreak) {
          end = paraBreak;
        } else {
          // Try sentence boundary
          const sentBreak = text.lastIndexOf('. ', end);
          if (sentBreak > minBreak) {
            end = sentBreak + 1; // keep the period with its sentence
          }
        }
      }

      const chunk = text.slice(start, end).trim();
      if (chunk.length > 0) chunks.push(chunk);

      if (end >= text.length) break;
      start = end - overlapChars;
      // Guard against non-progress: if backing up by the overlap would rewind
      // past the previous window start, advance without overlap instead.
      if (start <= (chunks.length > 0 ? end - targetChars : 0)) {
        start = end;
      }
    }

    return chunks;
  }
529
+
530
+ // ── Ingest ──
531
+
532
  /**
   * Ingest chunks: dedup by text hash, embed the survivors, write them to
   * SQLite (one transaction; the FTS trigger indexes them automatically),
   * then best-effort dual-write to LanceDB.
   *
   * @param chunks Chunks to ingest; `created_at` defaults to now when unset.
   * @returns Number of chunks actually written after dedup.
   * @throws Propagates embedding/SQLite errors; LanceDB failures only warn.
   */
  async ingest(chunks: Chunk[]): Promise<number> {
    if (chunks.length === 0) return 0;
    const db = this.sqliteDb!;

    // 1. Dedup: skip chunks whose text already exists (by SHA-256 hash)
    const newChunks = chunks.filter(c => {
      const hash = createHash('sha256').update(c.text).digest('hex');
      return !db.prepare('SELECT 1 FROM chunks WHERE text_hash = ?').get(hash);
    });

    if (newChunks.length === 0) return 0;

    // 2. Embed
    const texts = newChunks.map(c => c.text);
    const embeddings = await this.embed(texts);

    // 3. Ensure vec table exists (lazy... needs dimensions from first embedding)
    if (!this.vecDimensions && embeddings.length > 0) {
      this.ensureVecTable(embeddings[0].length);
    }

    // 4. Write to sqlite-vec (chunks table trigger populates FTS automatically)
    const insertChunk = db.prepare(`
      INSERT INTO chunks (text, text_hash, role, source_type, source_id, agent_id, token_count, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    `);
    const insertVec = db.prepare(`
      INSERT INTO chunks_vec (chunk_id, embedding) VALUES (?, ?)
    `);

    // Row + vector inserts are paired inside one transaction so the vec table
    // can never reference a chunk id that was rolled back.
    const transaction = db.transaction(() => {
      for (let i = 0; i < newChunks.length; i++) {
        const c = newChunks[i];
        const hash = createHash('sha256').update(c.text).digest('hex');
        const result = insertChunk.run(
          c.text, hash, c.role, c.source_type, c.source_id,
          c.agent_id, c.token_count, c.created_at || new Date().toISOString()
        );
        // sqlite-vec requires BigInt for INTEGER PRIMARY KEY
        const chunkId = typeof result.lastInsertRowid === 'bigint'
          ? result.lastInsertRowid
          : BigInt(result.lastInsertRowid);
        insertVec.run(chunkId, new Float32Array(embeddings[i]));
      }
    });
    transaction();

    // 5. Dual-write: also write to LanceDB (safety net during transition)
    const records = newChunks.map((chunk, i) => ({
      text: chunk.text,
      vector: embeddings[i],
      role: chunk.role,
      source_type: chunk.source_type,
      source_id: chunk.source_id,
      agent_id: chunk.agent_id,
      token_count: chunk.token_count,
      created_at: chunk.created_at || new Date().toISOString(),
    }));

    try {
      if (!this.chunksTable) {
        this.chunksTable = await this.lanceDb!.createTable('chunks', records);
      } else {
        await this.chunksTable.add(records);
      }
    } catch (err) {
      // LanceDB write failure is non-fatal during transition
      console.warn('LanceDB dual-write failed (non-fatal):', (err as Error).message);
    }

    return newChunks.length;
  }
604
+
605
+ // ── Recency helpers ──
606
+
607
+ private recencyWeight(ageDays: number): number {
608
+ // Linear decay with floor at 0.5. Old stuff never fully disappears
609
+ // but fresh context wins ties. ~50 days to hit the floor.
610
+ return Math.max(0.5, 1.0 - ageDays * 0.01);
611
+ }
612
+
613
+ private freshnessLabel(ageDays: number): "fresh" | "recent" | "aging" | "stale" {
614
+ if (ageDays < 3) return "fresh";
615
+ if (ageDays < 7) return "recent";
616
+ if (ageDays < 14) return "aging";
617
+ return "stale";
618
+ }
619
+
620
+ // ── Search (Hybrid: BM25 + Vector + RRF fusion + Recency) ──
621
+
622
  /**
   * Hybrid search: FTS5 (BM25) + sqlite-vec, fused with RRF, then weighted
   * by recency. Falls back to LanceDB while the sqlite-vec store is empty or
   * clearly behind (pre-migration).
   *
   * @param query  Free-text query.
   * @param limit  Max results to return.
   * @param filter Optional exact-match filters on agent_id / source_type.
   * @returns Results sorted by final score, descending.
   */
  async search(query: string, limit = 5, filter?: { agent_id?: string; source_type?: string }): Promise<SearchResult[]> {
    const db = this.sqliteDb!;

    // Check if sqlite-vec has been populated (migration complete)
    const sqliteChunks = (db.prepare('SELECT COUNT(*) as count FROM chunks').get() as any)?.count || 0;
    let lanceChunks = 0;
    if (this.chunksTable) {
      try { lanceChunks = await this.chunksTable.countRows(); } catch {}
    }

    // Use LanceDB fallback if sqlite-vec is empty OR has far fewer chunks than LanceDB
    // (migration not yet done). Once migration runs, sqlite-vec count will match.
    if (sqliteChunks === 0 || (lanceChunks > 0 && sqliteChunks < lanceChunks * 0.5)) {
      return this.searchLanceFallback(query, limit, filter);
    }

    const [embedding] = await this.embed([query]);
    // Over-fetch 3x so post-fusion filtering/dedup still fills `limit`.
    const fetchLimit = Math.max(limit * 3, 30);

    // Run FTS and vector search, then fuse with RRF
    const vecResults = this.searchVec(embedding, fetchLimit, filter);
    const ftsResults = this.searchFTS(query, fetchLimit, filter);
    const fused = this.reciprocalRankFusion([ftsResults, vecResults], [1.0, 1.0]);

    // Apply recency weighting on top of fused scores
    const now = Date.now();
    const scored = fused.map(r => {
      const ageDays = r.created_at ? (now - new Date(r.created_at).getTime()) / 86400000 : 0;
      const recency = r.created_at ? this.recencyWeight(ageDays) : 1;
      // RRF scores max at ~0.08. Rescale to match old cosine range (0.3-0.6)
      // so models treat the results as meaningful. Ranking is unchanged.
      const rescaled = Math.min(r.score * recency * 8, 1.0);
      return {
        ...r,
        score: rescaled,
        freshness: r.created_at ? this.freshnessLabel(ageDays) : undefined,
      };
    });

    return scored.sort((a, b) => b.score - a.score).slice(0, limit);
  }
663
+
664
+ /** Vector search via sqlite-vec. Two-step pattern: MATCH first, then JOIN. */
665
+ private searchVec(embedding: number[], limit: number, filter?: { agent_id?: string; source_type?: string }): SearchResult[] {
666
+ const db = this.sqliteDb!;
667
+
668
+ if (!this.vecDimensions) return [];
669
+
670
+ // Step 1: sqlite-vec MATCH (no JOINs! Virtual tables hang with JOINs.)
671
+ // See: https://github.com/tobi/qmd/pull/23
672
+ const vecRows = db.prepare(`
673
+ SELECT chunk_id, distance
674
+ FROM chunks_vec
675
+ WHERE embedding MATCH ? AND k = ?
676
+ `).all(new Float32Array(embedding), limit) as Array<{ chunk_id: number; distance: number }>;
677
+
678
+ if (vecRows.length === 0) return [];
679
+
680
+ // Step 2: Look up chunk metadata with a separate query
681
+ const ids = vecRows.map(r => r.chunk_id);
682
+ const distMap = new Map(vecRows.map(r => [r.chunk_id, r.distance]));
683
+
684
+ const placeholders = ids.map(() => '?').join(',');
685
+ let sql = `SELECT id, text, role, source_type, source_id, agent_id, created_at FROM chunks WHERE id IN (${placeholders})`;
686
+ const params: any[] = [...ids];
687
+
688
+ if (filter?.agent_id) { sql += ' AND agent_id = ?'; params.push(filter.agent_id); }
689
+ if (filter?.source_type) { sql += ' AND source_type = ?'; params.push(filter.source_type); }
690
+
691
+ const rows = db.prepare(sql).all(...params) as Array<{
692
+ id: number; text: string; role: string; source_type: string;
693
+ source_id: string; agent_id: string; created_at: string;
694
+ }>;
695
+
696
+ return rows.map(row => ({
697
+ text: row.text,
698
+ role: row.role,
699
+ score: 1 - (distMap.get(row.id) || 1), // cosine similarity from distance
700
+ source_type: row.source_type,
701
+ source_id: row.source_id,
702
+ agent_id: row.agent_id,
703
+ created_at: row.created_at,
704
+ }));
705
+ }
706
+
707
+ /** Full-text search via FTS5 with BM25 scoring. */
708
+ private searchFTS(query: string, limit: number, filter?: { agent_id?: string; source_type?: string }): SearchResult[] {
709
+ const db = this.sqliteDb!;
710
+ const ftsQuery = this.buildFTS5Query(query);
711
+ if (!ftsQuery) return [];
712
+
713
+ let sql = `
714
+ SELECT c.id, c.text, c.role, c.source_type, c.source_id, c.agent_id, c.created_at,
715
+ bm25(chunks_fts) as bm25_score
716
+ FROM chunks_fts f
717
+ JOIN chunks c ON c.id = f.rowid
718
+ WHERE chunks_fts MATCH ?
719
+ `;
720
+ const params: any[] = [ftsQuery];
721
+
722
+ if (filter?.agent_id) { sql += ' AND c.agent_id = ?'; params.push(filter.agent_id); }
723
+ if (filter?.source_type) { sql += ' AND c.source_type = ?'; params.push(filter.source_type); }
724
+
725
+ sql += ' ORDER BY bm25_score LIMIT ?';
726
+ params.push(limit);
727
+
728
+ const rows = db.prepare(sql).all(...params) as Array<{
729
+ id: number; text: string; role: string; source_type: string;
730
+ source_id: string; agent_id: string; created_at: string; bm25_score: number;
731
+ }>;
732
+
733
+ return rows.map(row => ({
734
+ text: row.text,
735
+ role: row.role,
736
+ // BM25 scores are negative (lower = better). Normalize to [0..1).
737
+ // |x| / (1 + |x|) maps: strong(-10)->0.91, medium(-2)->0.67, weak(-0.5)->0.33
738
+ score: Math.abs(row.bm25_score) / (1 + Math.abs(row.bm25_score)),
739
+ source_type: row.source_type,
740
+ source_id: row.source_id,
741
+ agent_id: row.agent_id,
742
+ created_at: row.created_at,
743
+ }));
744
+ }
745
+
746
+ /** Build a safe FTS5 query from user input. */
747
+ private buildFTS5Query(query: string): string | null {
748
+ const terms = query.split(/\s+/)
749
+ .map(t => t.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase())
750
+ .filter(t => t.length > 0);
751
+ if (terms.length === 0) return null;
752
+ if (terms.length === 1) return `"${terms[0]}"*`;
753
+ return terms.map(t => `"${t}"*`).join(' AND ');
754
+ }
755
+
756
+ /**
757
+ * Reciprocal Rank Fusion. Ported from QMD (MIT License, Tobi Lutke, 2024-2026).
758
+ * Fuses multiple ranked result lists into one using RRF scoring.
759
+ * Uses text content as dedup key (instead of QMD's file path).
760
+ */
761
+ private reciprocalRankFusion(
762
+ resultLists: SearchResult[][],
763
+ weights: number[] = [],
764
+ k: number = 60
765
+ ): SearchResult[] {
766
+ const scores = new Map<string, { result: SearchResult; rrfScore: number; topRank: number }>();
767
+
768
+ for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
769
+ const list = resultLists[listIdx];
770
+ if (!list) continue;
771
+ const weight = weights[listIdx] ?? 1.0;
772
+
773
+ for (let rank = 0; rank < list.length; rank++) {
774
+ const result = list[rank];
775
+ if (!result) continue;
776
+ const rrfContribution = weight / (k + rank + 1);
777
+ // Dedup by text content (truncated for perf)
778
+ const dedup = result.text.slice(0, 200);
779
+ const existing = scores.get(dedup);
780
+
781
+ if (existing) {
782
+ existing.rrfScore += rrfContribution;
783
+ existing.topRank = Math.min(existing.topRank, rank);
784
+ } else {
785
+ scores.set(dedup, {
786
+ result,
787
+ rrfScore: rrfContribution,
788
+ topRank: rank,
789
+ });
790
+ }
791
+ }
792
+ }
793
+
794
+ // Top-rank bonus: reward results that appear at or near the top of any list
795
+ for (const entry of scores.values()) {
796
+ if (entry.topRank === 0) {
797
+ entry.rrfScore += 0.05;
798
+ } else if (entry.topRank <= 2) {
799
+ entry.rrfScore += 0.02;
800
+ }
801
+ }
802
+
803
+ return Array.from(scores.values())
804
+ .sort((a, b) => b.rrfScore - a.rrfScore)
805
+ .map(e => ({ ...e.result, score: e.rrfScore }));
806
+ }
807
+
808
+ /** LanceDB fallback for search (used when sqlite-vec tables are empty, pre-migration). */
809
+ private async searchLanceFallback(query: string, limit: number, filter?: { agent_id?: string; source_type?: string }): Promise<SearchResult[]> {
810
+ if (!this.chunksTable) return [];
811
+
812
+ const [embedding] = await this.embed([query]);
813
+ const fetchLimit = Math.max(limit * 3, 30);
814
+ let queryBuilder = this.chunksTable.vectorSearch(embedding).distanceType('cosine').limit(fetchLimit);
815
+
816
+ if (filter?.agent_id) {
817
+ queryBuilder = queryBuilder.where(`agent_id = '${filter.agent_id}'`);
818
+ }
819
+ if (filter?.source_type) {
820
+ queryBuilder = queryBuilder.where(`source_type = '${filter.source_type}'`);
821
+ }
822
+
823
+ const results = await queryBuilder.toArray();
824
+ const now = Date.now();
825
+
826
+ return results.map((row: any) => {
827
+ const cosine = row._distance != null ? 1 - row._distance : 0;
828
+ const createdAt = row.created_at || '';
829
+ const ageDays = createdAt ? (now - new Date(createdAt).getTime()) / 86400000 : 0;
830
+ const weight = createdAt ? this.recencyWeight(ageDays) : 1;
831
+
832
+ return {
833
+ text: row.text,
834
+ role: row.role,
835
+ score: cosine * weight,
836
+ source_type: row.source_type,
837
+ source_id: row.source_id,
838
+ agent_id: row.agent_id,
839
+ created_at: createdAt,
840
+ freshness: createdAt ? this.freshnessLabel(ageDays) : undefined,
841
+ };
842
+ })
843
+ .sort((a, b) => b.score - a.score)
844
+ .slice(0, limit);
845
+ }
846
+
847
+ // ── Remember (explicit fact storage) ──
848
+
849
+ async remember(text: string, category: Memory['category'] = 'fact'): Promise<number> {
850
+ const db = this.sqliteDb!;
851
+ const now = new Date().toISOString();
852
+
853
+ const stmt = db.prepare(`
854
+ INSERT INTO memories (text, category, confidence, source_ids, status, created_at, updated_at)
855
+ VALUES (?, ?, 1.0, '[]', 'active', ?, ?)
856
+ `);
857
+ const result = stmt.run(text, category, now, now);
858
+
859
+ // Also ingest as a chunk for vector search
860
+ await this.ingest([{
861
+ text,
862
+ role: 'system',
863
+ source_type: 'manual',
864
+ source_id: `memory:${result.lastInsertRowid}`,
865
+ agent_id: 'system',
866
+ token_count: Math.ceil(text.length / 4),
867
+ created_at: now,
868
+ }]);
869
+
870
+ return result.lastInsertRowid as number;
871
+ }
872
+
873
+ // ── Forget (deprecate a memory) ──
874
+
875
+ forget(memoryId: number): boolean {
876
+ const db = this.sqliteDb!;
877
+ const now = new Date().toISOString();
878
+ const result = db.prepare(`
879
+ UPDATE memories SET status = 'deprecated', updated_at = ? WHERE id = ? AND status = 'active'
880
+ `).run(now, memoryId);
881
+ return result.changes > 0;
882
+ }
883
+
884
+ // ── Status ──
885
+
886
+ async status(): Promise<CrystalStatus> {
887
+ const db = this.sqliteDb!;
888
+
889
+ // Show the higher of sqlite-vec or LanceDB count during transition
890
+ const sqliteChunks = (db.prepare('SELECT COUNT(*) as count FROM chunks').get() as any)?.count || 0;
891
+ let lanceChunks = 0;
892
+ if (this.chunksTable) {
893
+ try { lanceChunks = await this.chunksTable.countRows(); } catch {}
894
+ }
895
+ const chunks = Math.max(sqliteChunks, lanceChunks);
896
+
897
+ // Time range from sqlite chunks table
898
+ const oldest = (db.prepare('SELECT MIN(created_at) as ts FROM chunks').get() as any)?.ts || null;
899
+ const newest = (db.prepare('SELECT MAX(created_at) as ts FROM chunks').get() as any)?.ts || null;
900
+
901
+ const memories = (db.prepare('SELECT COUNT(*) as count FROM memories WHERE status = ?').get('active') as any)?.count || 0;
902
+ const sources = (db.prepare('SELECT COUNT(*) as count FROM sources').get() as any)?.count || 0;
903
+
904
+ // Get agents from chunks, sources, and capture_state tables
905
+ const chunkAgentRows = db.prepare('SELECT DISTINCT agent_id FROM chunks WHERE agent_id IS NOT NULL').all() as any[];
906
+ const sourceAgentRows = db.prepare('SELECT DISTINCT agent_id FROM sources').all() as any[];
907
+ const captureAgentRows = db.prepare('SELECT DISTINCT agent_id FROM capture_state').all() as any[];
908
+ const agents = [...new Set([
909
+ ...chunkAgentRows.map((r: any) => r.agent_id),
910
+ ...sourceAgentRows.map((r: any) => r.agent_id),
911
+ ...captureAgentRows.map((r: any) => r.agent_id),
912
+ ])];
913
+
914
+ // Capture state summary
915
+ const captureInfo = db.prepare(
916
+ 'SELECT COUNT(*) as count, MAX(last_capture_at) as latest FROM capture_state'
917
+ ).get() as any;
918
+
919
+ return {
920
+ chunks,
921
+ memories,
922
+ sources,
923
+ agents,
924
+ oldestChunk: oldest,
925
+ newestChunk: newest,
926
+ embeddingProvider: this.config.embeddingProvider,
927
+ dataDir: this.config.dataDir,
928
+ capturedSessions: captureInfo?.count || 0,
929
+ latestCapture: captureInfo?.latest || null,
930
+ };
931
+ }
932
+
933
+ // ── Capture State (for incremental ingestion) ──
934
+
935
+ getCaptureState(agentId: string, sourceId: string): { lastMessageCount: number; captureCount: number } {
936
+ const db = this.sqliteDb!;
937
+ const row = db.prepare('SELECT last_message_count, capture_count FROM capture_state WHERE agent_id = ? AND source_id = ?')
938
+ .get(agentId, sourceId) as any;
939
+ if (!row) return { lastMessageCount: 0, captureCount: 0 };
940
+ return {
941
+ lastMessageCount: row.last_message_count,
942
+ captureCount: row.capture_count,
943
+ };
944
+ }
945
+
946
+ setCaptureState(agentId: string, sourceId: string, messageCount: number, captureCount: number): void {
947
+ const db = this.sqliteDb!;
948
+ db.prepare(`
949
+ INSERT OR REPLACE INTO capture_state (agent_id, source_id, last_message_count, capture_count, last_capture_at)
950
+ VALUES (?, ?, ?, ?, ?)
951
+ `).run(agentId, sourceId, messageCount, captureCount, new Date().toISOString());
952
+ }
953
+
954
+ // ── Source File Indexing (optional feature) ──
955
+ //
956
+ // Add directories as "collections", sync to index/re-index changed files.
957
+ // All source chunks get source_type='file' so they're searchable alongside
958
+ // conversations and memories. Nothing here is required... you can use MC
959
+ // without ever touching sources.
960
+
961
  // Default patterns for files worth indexing.
  // NOTE(review): scanDirectory's simplified matcher only understands the
  // shapes "**/*.ext" and "**/ExactName"; entries like '**/*.env.example'
  // and '**/*.gitignore' fit neither shape and appear to be silently
  // dropped by it — TODO confirm intent.
  private static readonly DEFAULT_INCLUDE = [
    // Source code
    '**/*.ts', '**/*.js', '**/*.tsx', '**/*.jsx',
    '**/*.py', '**/*.rs', '**/*.go', '**/*.java',
    // Docs, config, scripts, markup
    '**/*.md', '**/*.txt', '**/*.json', '**/*.yaml', '**/*.yml',
    '**/*.toml', '**/*.sh', '**/*.bash', '**/*.zsh',
    '**/*.css', '**/*.html', '**/*.svg',
    '**/*.sql', '**/*.graphql',
    '**/*.c', '**/*.cpp', '**/*.h', '**/*.hpp',
    '**/*.swift', '**/*.kt', '**/*.rb',
    '**/*.env.example', '**/*.gitignore',
    // Exact-name build/manifest files
    '**/Makefile', '**/Dockerfile', '**/Cargo.toml',
    '**/package.json', '**/tsconfig.json',
  ];
975
+
976
  // Default patterns to skip during source indexing.
  private static readonly DEFAULT_IGNORE = [
    // Dependency and build-output directories
    '**/node_modules/**', '**/.git/**', '**/dist/**', '**/build/**',
    '**/.next/**', '**/.cache/**', '**/coverage/**', '**/__pycache__/**',
    '**/target/**', '**/vendor/**', '**/.venv/**',
    // Lockfiles and generated bundles
    '**/*.lock', '**/package-lock.json', '**/yarn.lock', '**/bun.lockb',
    '**/*.min.js', '**/*.min.css', '**/*.map',
    // Binary media and fonts
    '**/*.png', '**/*.jpg', '**/*.jpeg', '**/*.gif', '**/*.ico', '**/*.webp',
    '**/*.woff', '**/*.woff2', '**/*.ttf', '**/*.eot',
    '**/*.mp3', '**/*.mp4', '**/*.wav', '**/*.ogg', '**/*.webm',
    // Archives
    '**/*.zip', '**/*.tar', '**/*.gz', '**/*.br',
    // Databases and data dumps
    '**/*.sqlite', '**/*.db', '**/*.lance/**',
    '**/*.jsonl',
    // Never index secrets
    '**/secrets/**', '**/.env',
  ];
990
+
991
+ /** Add a directory as a source collection for indexing. */
992
+ async sourcesAdd(rootPath: string, name: string, options?: {
993
+ include?: string[];
994
+ ignore?: string[];
995
+ }): Promise<SourceCollection> {
996
+ const db = this.sqliteDb!;
997
+ const now = new Date().toISOString();
998
+ const includePatterns = JSON.stringify(options?.include || Crystal.DEFAULT_INCLUDE);
999
+ const ignorePatterns = JSON.stringify(options?.ignore || Crystal.DEFAULT_IGNORE);
1000
+
1001
+ // Check if collection already exists
1002
+ const existing = db.prepare('SELECT * FROM source_collections WHERE name = ?').get(name) as any;
1003
+ if (existing) {
1004
+ throw new Error(`Collection "${name}" already exists. Use sourcesSync() to update it.`);
1005
+ }
1006
+
1007
+ db.prepare(`
1008
+ INSERT INTO source_collections (name, root_path, glob_patterns, ignore_patterns, created_at)
1009
+ VALUES (?, ?, ?, ?, ?)
1010
+ `).run(name, rootPath, includePatterns, ignorePatterns, now);
1011
+
1012
+ const row = db.prepare('SELECT * FROM source_collections WHERE name = ?').get(name) as any;
1013
+ return row as SourceCollection;
1014
+ }
1015
+
1016
+ /** Remove a source collection and its file records. Chunks remain in LanceDB. */
1017
+ sourcesRemove(name: string): boolean {
1018
+ const db = this.sqliteDb!;
1019
+ const col = db.prepare('SELECT id FROM source_collections WHERE name = ?').get(name) as any;
1020
+ if (!col) return false;
1021
+ db.prepare('DELETE FROM source_files WHERE collection_id = ?').run(col.id);
1022
+ db.prepare('DELETE FROM source_collections WHERE id = ?').run(col.id);
1023
+ return true;
1024
+ }
1025
+
1026
+ /** Sync a collection: scan files, detect changes, re-index what changed. */
1027
+ async sourcesSync(name: string, options?: { dryRun?: boolean; batchSize?: number }): Promise<SyncResult> {
1028
+ const db = this.sqliteDb!;
1029
+ const startTime = Date.now();
1030
+ const batchSize = options?.batchSize || 20;
1031
+
1032
+ const col = db.prepare('SELECT * FROM source_collections WHERE name = ?').get(name) as any;
1033
+ if (!col) throw new Error(`Collection "${name}" not found. Add it first with sourcesAdd().`);
1034
+
1035
+ const includePatterns: string[] = JSON.parse(col.glob_patterns);
1036
+ const ignorePatterns: string[] = JSON.parse(col.ignore_patterns);
1037
+
1038
+ // Scan the directory for matching files
1039
+ const files = this.scanDirectory(col.root_path, includePatterns, ignorePatterns);
1040
+
1041
+ // Get existing file records
1042
+ const existingFiles = new Map<string, { id: number; file_hash: string }>();
1043
+ const rows = db.prepare('SELECT id, file_path, file_hash FROM source_files WHERE collection_id = ?').all(col.id) as any[];
1044
+ for (const row of rows) {
1045
+ existingFiles.set(row.file_path, { id: row.id, file_hash: row.file_hash });
1046
+ }
1047
+
1048
+ let added = 0;
1049
+ let updated = 0;
1050
+ let removed = 0;
1051
+ let chunksAdded = 0;
1052
+ const now = new Date().toISOString();
1053
+
1054
+ // Collect files that need indexing
1055
+ const toIndex: Array<{ relPath: string; absPath: string; hash: string; size: number; isUpdate: boolean }> = [];
1056
+
1057
+ for (const absPath of files) {
1058
+ const relPath = relative(col.root_path, absPath);
1059
+ let content: string;
1060
+ try {
1061
+ content = readFileSync(absPath, 'utf-8');
1062
+ } catch {
1063
+ continue; // skip binary or unreadable files
1064
+ }
1065
+
1066
+ // Skip files > 500KB (likely generated or data)
1067
+ const stat = statSync(absPath);
1068
+ if (stat.size > 500 * 1024) continue;
1069
+
1070
+ const hash = createHash('sha256').update(content).digest('hex');
1071
+ const existing = existingFiles.get(relPath);
1072
+
1073
+ if (existing) {
1074
+ existingFiles.delete(relPath); // mark as seen
1075
+ if (existing.file_hash === hash) continue; // unchanged
1076
+ toIndex.push({ relPath, absPath, hash, size: stat.size, isUpdate: true });
1077
+ } else {
1078
+ toIndex.push({ relPath, absPath, hash, size: stat.size, isUpdate: false });
1079
+ }
1080
+ }
1081
+
1082
+ if (options?.dryRun) {
1083
+ const newFiles = toIndex.filter(f => !f.isUpdate).length;
1084
+ const updatedFiles = toIndex.filter(f => f.isUpdate).length;
1085
+ return {
1086
+ collection: name,
1087
+ added: newFiles,
1088
+ updated: updatedFiles,
1089
+ removed: existingFiles.size,
1090
+ chunks_added: 0,
1091
+ duration_ms: Date.now() - startTime,
1092
+ };
1093
+ }
1094
+
1095
+ // Process files in batches
1096
+ for (let i = 0; i < toIndex.length; i += batchSize) {
1097
+ const batch = toIndex.slice(i, i + batchSize);
1098
+ const allChunks: Chunk[] = [];
1099
+
1100
+ for (const file of batch) {
1101
+ const content = readFileSync(file.absPath, 'utf-8');
1102
+ const ext = extname(file.absPath);
1103
+ const fileName = basename(file.absPath);
1104
+
1105
+ // Prepend file path context to help search
1106
+ const header = `File: ${file.relPath}\n\n`;
1107
+ const textChunks = this.chunkText(header + content, 400, 80);
1108
+ const fileChunks: Chunk[] = textChunks.map(text => ({
1109
+ text,
1110
+ role: 'system' as const,
1111
+ source_type: 'file',
1112
+ source_id: `file:${name}:${file.relPath}`,
1113
+ agent_id: 'system',
1114
+ token_count: Math.ceil(text.length / 4),
1115
+ created_at: now,
1116
+ }));
1117
+
1118
+ allChunks.push(...fileChunks);
1119
+
1120
+ // Update or insert file record
1121
+ if (file.isUpdate) {
1122
+ db.prepare(`
1123
+ UPDATE source_files SET file_hash = ?, file_size = ?, chunk_count = ?, last_indexed_at = ?
1124
+ WHERE collection_id = ? AND file_path = ?
1125
+ `).run(file.hash, file.size, fileChunks.length, now, col.id, file.relPath);
1126
+ updated++;
1127
+ } else {
1128
+ db.prepare(`
1129
+ INSERT INTO source_files (collection_id, file_path, file_hash, file_size, chunk_count, last_indexed_at)
1130
+ VALUES (?, ?, ?, ?, ?, ?)
1131
+ `).run(col.id, file.relPath, file.hash, file.size, fileChunks.length, now);
1132
+ added++;
1133
+ }
1134
+ }
1135
+
1136
+ // Embed and ingest the batch
1137
+ if (allChunks.length > 0) {
1138
+ const ingested = await this.ingest(allChunks);
1139
+ chunksAdded += ingested;
1140
+ }
1141
+ }
1142
+
1143
+ // Remove files that no longer exist on disk
1144
+ for (const [relPath, { id }] of existingFiles) {
1145
+ db.prepare('DELETE FROM source_files WHERE id = ?').run(id);
1146
+ removed++;
1147
+ }
1148
+
1149
+ // Update collection stats
1150
+ const fileCount = (db.prepare('SELECT COUNT(*) as count FROM source_files WHERE collection_id = ?').get(col.id) as any).count;
1151
+ const chunkCount = (db.prepare('SELECT SUM(chunk_count) as total FROM source_files WHERE collection_id = ?').get(col.id) as any).total || 0;
1152
+ db.prepare('UPDATE source_collections SET file_count = ?, chunk_count = ?, last_sync_at = ? WHERE id = ?')
1153
+ .run(fileCount, chunkCount, now, col.id);
1154
+
1155
+ return {
1156
+ collection: name,
1157
+ added,
1158
+ updated,
1159
+ removed,
1160
+ chunks_added: chunksAdded,
1161
+ duration_ms: Date.now() - startTime,
1162
+ };
1163
+ }
1164
+
1165
+ /** Get status of all source collections. */
1166
+ sourcesStatus(): SourcesStatus {
1167
+ const db = this.sqliteDb!;
1168
+ const collections = db.prepare('SELECT name, root_path, file_count, chunk_count, last_sync_at FROM source_collections').all() as any[];
1169
+ const totalFiles = collections.reduce((sum, c) => sum + c.file_count, 0);
1170
+ const totalChunks = collections.reduce((sum, c) => sum + c.chunk_count, 0);
1171
+
1172
+ return {
1173
+ collections: collections.map(c => ({
1174
+ name: c.name,
1175
+ root_path: c.root_path,
1176
+ file_count: c.file_count,
1177
+ chunk_count: c.chunk_count,
1178
+ last_sync_at: c.last_sync_at,
1179
+ })),
1180
+ total_files: totalFiles,
1181
+ total_chunks: totalChunks,
1182
+ };
1183
+ }
1184
+
1185
  /** Scan a directory recursively, matching include/ignore patterns. */
  // Deliberately simplified matcher, not a full glob engine. Supported shapes:
  //   include: "**/*.ext"  (match by file extension)
  //            "**/Name"   (match by exact filename, e.g. Makefile)
  //   ignore:  "**/dir/**" (skip directory by name), "**/*.ext", "**/Name"
  // NOTE(review): include entries outside these shapes (e.g. '**/*.env.example',
  // '**/*.gitignore' — double extension / leading dot) match neither extraction
  // regex below and are silently dropped — TODO confirm intent.
  private scanDirectory(rootPath: string, includePatterns: string[], ignorePatterns: string[]): string[] {
    const results: string[] = [];

    // Build sets of allowed extensions and ignored directory names for fast filtering
    const allowedExtensions = new Set<string>();
    const allowedExactNames = new Set<string>();
    for (const pattern of includePatterns) {
      // Extract extension from patterns like "**/*.ts"
      const extMatch = pattern.match(/\*\*\/\*(\.\w+)$/);
      if (extMatch) {
        allowedExtensions.add(extMatch[1]);
      }
      // Exact filenames like "**/Makefile"
      const nameMatch = pattern.match(/\*\*\/([^*]+)$/);
      if (nameMatch && !nameMatch[1].startsWith('*.')) {
        allowedExactNames.add(nameMatch[1]);
      }
    }

    const ignoreDirs = new Set<string>();
    for (const pattern of ignorePatterns) {
      // Extract directory names from patterns like "**/node_modules/**"
      const dirMatch = pattern.match(/\*\*\/([^/*]+)\/\*\*$/);
      if (dirMatch) {
        ignoreDirs.add(dirMatch[1]);
      }
    }

    const ignoreFiles = new Set<string>();
    for (const pattern of ignorePatterns) {
      // Extract filenames/extensions to ignore
      const fileMatch = pattern.match(/\*\*\/\*(\.\w+)$/);
      if (fileMatch) {
        ignoreFiles.add(fileMatch[1]);
      }
      const exactMatch = pattern.match(/\*\*\/([^*]+)$/);
      if (exactMatch && !exactMatch[1].includes('/')) {
        ignoreFiles.add(exactMatch[1]);
      }
    }

    // Depth-first walk; unreadable directories and files are skipped silently.
    const walk = (dir: string) => {
      let entries: string[];
      try {
        entries = readdirSync(dir);
      } catch {
        return;
      }

      for (const entry of entries) {
        const fullPath = join(dir, entry);
        let stat;
        try {
          stat = statSync(fullPath);
        } catch {
          continue;
        }

        if (stat.isDirectory()) {
          if (ignoreDirs.has(entry)) continue;
          if (entry.startsWith('.')) continue; // skip hidden dirs
          walk(fullPath);
        } else if (stat.isFile()) {
          const ext = extname(entry);
          // Ignore checks run first (by extension, then by exact name).
          if (ignoreFiles.has(ext)) continue;
          if (ignoreFiles.has(entry)) continue;

          // Keep the file if its extension or exact name was whitelisted above.
          if (allowedExtensions.has(ext) || allowedExactNames.has(entry)) {
            results.push(fullPath);
          }
        }
      }
    };

    walk(rootPath);
    return results;
  }
1263
+
1264
+ // ── Cleanup ──
1265
+
1266
+ close(): void {
1267
+ this.sqliteDb?.close();
1268
+ // LanceDB connection doesn't need explicit close
1269
+ }
1270
+ }
1271
+
1272
+ // ─── Config Resolution ─────────────────────────────────────────────────────
1273
+ //
1274
+ // Key resolution order:
1275
+ // 1. Explicit overrides (programmatic)
1276
+ // 2. process.env (set by op-secrets plugin inside OpenClaw, or by user)
1277
+ // 3. .env file in data dir (~/.openclaw/memory-crystal/.env)
1278
+ // 4. 1Password via op CLI (if SA token exists at ~/.openclaw/secrets/op-sa-token)
1279
+ //
1280
+ // Two setup paths:
1281
+ // • .env file: cp .env.example ~/.openclaw/memory-crystal/.env && edit
1282
+ // • 1Password: keys auto-resolved from "Agent Secrets" vault
1283
+
1284
+ export function resolveConfig(overrides?: Partial<CrystalConfig>): CrystalConfig {
1285
+ const openclawHome = process.env.OPENCLAW_HOME || join(process.env.HOME || '', '.openclaw');
1286
+
1287
+ // dataDir resolution order:
1288
+ // 1. Explicit override (always wins)
1289
+ // 2. CRYSTAL_DATA_DIR env var (for testing)
1290
+ // 3. ~/.ldm/memory/ if crystal.db exists there (post-migration)
1291
+ // 4. Legacy ~/.openclaw/memory-crystal/ (pre-migration fallback)
1292
+ let dataDir = overrides?.dataDir || process.env.CRYSTAL_DATA_DIR;
1293
+ if (!dataDir) {
1294
+ const ldmMemory = join(process.env.HOME || '', '.ldm', 'memory');
1295
+ if (existsSync(join(ldmMemory, 'crystal.db'))) {
1296
+ dataDir = ldmMemory;
1297
+ } else {
1298
+ dataDir = join(openclawHome, 'memory-crystal');
1299
+ }
1300
+ }
1301
+
1302
+ // Load .env file if it exists (doesn't override existing env vars)
1303
+ loadEnvFile(join(dataDir, '.env'));
1304
+
1305
+ // Resolve API keys: env/.env first, then 1Password fallback
1306
+ const openaiApiKey = overrides?.openaiApiKey || process.env.OPENAI_API_KEY || opRead(openclawHome, 'OpenAI API', 'api key');
1307
+ const googleApiKey = overrides?.googleApiKey || process.env.GOOGLE_API_KEY || opRead(openclawHome, 'Google AI', 'api key');
1308
+ const remoteToken = overrides?.remoteToken || process.env.CRYSTAL_REMOTE_TOKEN || opRead(openclawHome, 'Memory Crystal Remote', 'token');
1309
+
1310
+ return {
1311
+ dataDir,
1312
+ embeddingProvider: (overrides?.embeddingProvider || process.env.CRYSTAL_EMBEDDING_PROVIDER || 'openai') as CrystalConfig['embeddingProvider'],
1313
+ openaiApiKey,
1314
+ openaiModel: overrides?.openaiModel || process.env.CRYSTAL_OPENAI_MODEL || 'text-embedding-3-small',
1315
+ ollamaHost: overrides?.ollamaHost || process.env.CRYSTAL_OLLAMA_HOST || 'http://localhost:11434',
1316
+ ollamaModel: overrides?.ollamaModel || process.env.CRYSTAL_OLLAMA_MODEL || 'nomic-embed-text',
1317
+ googleApiKey,
1318
+ googleModel: overrides?.googleModel || process.env.CRYSTAL_GOOGLE_MODEL || 'text-embedding-004',
1319
+ remoteUrl: overrides?.remoteUrl || process.env.CRYSTAL_REMOTE_URL,
1320
+ remoteToken,
1321
+ };
1322
+ }
1323
+
1324
+ /** Load a .env file into process.env. Does NOT override existing vars. */
1325
+ function loadEnvFile(path: string): void {
1326
+ if (!existsSync(path)) return;
1327
+ const content = readFileSync(path, 'utf8');
1328
+ for (const line of content.split('\n')) {
1329
+ const trimmed = line.trim();
1330
+ if (!trimmed || trimmed.startsWith('#')) continue;
1331
+ const eqIdx = trimmed.indexOf('=');
1332
+ if (eqIdx === -1) continue;
1333
+ const key = trimmed.slice(0, eqIdx).trim();
1334
+ let value = trimmed.slice(eqIdx + 1).trim();
1335
+ // Strip surrounding quotes
1336
+ if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
1337
+ value = value.slice(1, -1);
1338
+ }
1339
+ if (key && !process.env[key]) {
1340
+ process.env[key] = value;
1341
+ }
1342
+ }
1343
+ }
1344
+
1345
+ /** Read a secret from 1Password via op CLI. Falls back silently on failure. */
1346
+ function opRead(openclawHome: string, item: string, field: string): string | undefined {
1347
+ try {
1348
+ const saTokenPath = join(openclawHome, 'secrets', 'op-sa-token');
1349
+ if (!existsSync(saTokenPath)) return undefined;
1350
+ const saToken = readFileSync(saTokenPath, 'utf8').trim();
1351
+ return execSync(`op read "op://Agent Secrets/${item}/${field}" 2>/dev/null`, {
1352
+ encoding: 'utf8',
1353
+ env: { ...process.env, OP_SERVICE_ACCOUNT_TOKEN: saToken },
1354
+ timeout: 10000,
1355
+ }).trim() || undefined;
1356
+ } catch {
1357
+ return undefined;
1358
+ }
1359
+ }
1360
+
1361
+ // ─── Remote Crystal (Cloud Mirror Mode) ────────────────────────────────────
1362
+ // When remoteUrl is set, this class talks to the Cloudflare Worker instead
1363
+ // of local SQLite. Same interface as Crystal for search/remember/forget/status/ingest.
1364
+
1365
+ export class RemoteCrystal {
1366
+ private url: string;
1367
+ private token: string;
1368
+
1369
+ constructor(url: string, token: string) {
1370
+ this.url = url.replace(/\/$/, '');
1371
+ this.token = token;
1372
+ }
1373
+
1374
+ async init(): Promise<void> {
1375
+ // No local DB to initialize — just verify the Worker is reachable
1376
+ const resp = await fetch(`${this.url}/health`);
1377
+ if (!resp.ok) {
1378
+ throw new Error(`Remote crystal unreachable: ${resp.status}`);
1379
+ }
1380
+ }
1381
+
1382
+ private async request(path: string, body?: any): Promise<any> {
1383
+ const resp = await fetch(`${this.url}${path}`, {
1384
+ method: body ? 'POST' : 'GET',
1385
+ headers: {
1386
+ 'Authorization': `Bearer ${this.token}`,
1387
+ 'Content-Type': 'application/json',
1388
+ },
1389
+ ...(body ? { body: JSON.stringify(body) } : {}),
1390
+ });
1391
+
1392
+ if (!resp.ok) {
1393
+ const err = await resp.text();
1394
+ throw new Error(`Remote crystal error ${resp.status}: ${err}`);
1395
+ }
1396
+
1397
+ return resp.json();
1398
+ }
1399
+
1400
+ async search(query: string, limit = 5, filter?: { agent_id?: string }): Promise<SearchResult[]> {
1401
+ const data = await this.request('/search', { query, limit, agent_id: filter?.agent_id });
1402
+ return data.results || [];
1403
+ }
1404
+
1405
+ async ingest(chunks: Chunk[]): Promise<number> {
1406
+ const data = await this.request('/ingest', { chunks });
1407
+ return data.ingested || 0;
1408
+ }
1409
+
1410
+ async remember(text: string, category: Memory['category'] = 'fact'): Promise<number> {
1411
+ const data = await this.request('/remember', { text, category });
1412
+ return data.id;
1413
+ }
1414
+
1415
+ forget(memoryId: number): Promise<boolean> {
1416
+ return this.request('/forget', { id: memoryId }).then(d => d.ok);
1417
+ }
1418
+
1419
+ async status(): Promise<CrystalStatus> {
1420
+ const data = await this.request('/status');
1421
+ return {
1422
+ chunks: data.chunks || 0,
1423
+ memories: data.memories || 0,
1424
+ sources: 0,
1425
+ agents: data.agents || [],
1426
+ oldestChunk: data.oldestChunk,
1427
+ newestChunk: data.newestChunk,
1428
+ embeddingProvider: 'remote',
1429
+ dataDir: this.url,
1430
+ capturedSessions: data.capturedSessions || 0,
1431
+ latestCapture: data.newestChunk,
1432
+ };
1433
+ }
1434
+
1435
+ // Expose chunkText from a local Crystal instance for cc-hook to use
1436
+ chunkText(text: string): string[] {
1437
+ // Simple chunking for remote mode — matches Crystal.chunkText() logic
1438
+ const targetChars = 400 * 4; // 400 tokens * ~4 chars
1439
+ const overlapChars = 80 * 4;
1440
+
1441
+ if (text.length <= targetChars) return [text];
1442
+
1443
+ const chunks: string[] = [];
1444
+ let start = 0;
1445
+ while (start < text.length) {
1446
+ let end = start + targetChars;
1447
+ if (end >= text.length) {
1448
+ chunks.push(text.slice(start));
1449
+ break;
1450
+ }
1451
+ // Try to break at paragraph
1452
+ const paraBreak = text.lastIndexOf('\n\n', end);
1453
+ if (paraBreak > start + targetChars * 0.5) end = paraBreak;
1454
+ else {
1455
+ // Try sentence break
1456
+ const sentBreak = text.lastIndexOf('. ', end);
1457
+ if (sentBreak > start + targetChars * 0.5) end = sentBreak + 1;
1458
+ }
1459
+ chunks.push(text.slice(start, end));
1460
+ start = end - overlapChars;
1461
+ }
1462
+ return chunks;
1463
+ }
1464
+ }
1465
+
1466
+ /** Create the appropriate Crystal instance based on config. */
1467
+ export function createCrystal(config: CrystalConfig): Crystal | RemoteCrystal {
1468
+ if (config.remoteUrl && config.remoteToken) {
1469
+ return new RemoteCrystal(config.remoteUrl, config.remoteToken);
1470
+ }
1471
+ return new Crystal(config);
1472
+ }