clawmem 0.1.8 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/entity.ts ADDED
@@ -0,0 +1,497 @@
1
+ /**
2
+ * Entity Resolution + Co-occurrence Graph
3
+ *
4
+ * Extracts entities from documents, resolves to canonical forms,
5
+ * tracks mentions and co-occurrences for entity-aware retrieval.
6
+ *
7
+ * Pattern F from ENHANCEMENT-PLAN.md (source: Hindsight entity_resolver.py)
8
+ */
9
+
10
+ import type { Database } from "bun:sqlite";
11
+ import type { LlamaCpp } from "./llm.ts";
12
+ import { extractJsonFromLLM } from "./amem.ts";
13
+
14
+ // =============================================================================
15
+ // Types
16
+ // =============================================================================
17
+
18
/**
 * A raw entity as produced by LLM extraction, before it has been resolved
 * against the stored entity graph.
 */
export interface ExtractedEntity {
  /** Entity name exactly as returned by the extractor. */
  name: string;
  /** Entity category: person, project, service, tool, concept, org, location. */
  type: string;
}
22
+
23
/**
 * An entity together with its stored identity.
 */
export interface ResolvedEntity {
  /** Identifier of this entity row. */
  entity_id: string;
  /** Entity name. */
  name: string;
  /** Entity category (same vocabulary as `ExtractedEntity.type`). */
  entity_type: string;
  /** When this entity is an alias, the ID of its canonical form; otherwise null. */
  canonical_id: string | null;
}
29
+
30
/**
 * One edge of the entity co-occurrence graph: a pair of entities and how
 * often they have been seen together.
 */
export interface EntityCooccurrence {
  /** First entity ID of the pair. */
  entity_a: string;
  /** Second entity ID of the pair. */
  entity_b: string;
  /** Number of times the pair has been recorded together. */
  count: number;
  /** Timestamp of the most recent co-occurrence (format presumably SQLite `datetime('now')` — confirm against schema). */
  last_cooccurred: string;
}
36
+
37
+ // =============================================================================
38
+ // Levenshtein Distance (for fuzzy entity matching)
39
+ // =============================================================================
40
+
41
/**
 * Levenshtein edit distance between two strings: the minimum number of
 * single-unit insertions, deletions and substitutions turning `a` into `b`.
 *
 * Comparison is done on UTF-16 code units (the same units `string.length`
 * counts), so a character outside the BMP counts as two units.
 *
 * Uses one rolling row instead of a full (|a|+1) x (|b|+1) matrix, so extra
 * memory is proportional to the shorter input only.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns The edit distance (0 when the strings are equal).
 */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;

  // The distance is symmetric, so keep the shorter string on the row axis.
  const outer = a.length >= b.length ? a : b;
  const inner = a.length >= b.length ? b : a;
  if (inner.length === 0) return outer.length;

  // row[j] holds the distance between outer[0..i-1] and inner[0..j].
  const row: number[] = Array.from({ length: inner.length + 1 }, (_, j) => j);
  let left = 0;

  for (let i = 1; i <= outer.length; i++) {
    const outerUnit = outer.charCodeAt(i - 1);
    let diagonal = i - 1; // value of the previous row at column 0
    left = i; // value of the current row at column 0
    row[0] = left;

    for (let j = 1; j <= inner.length; j++) {
      // Index is always within bounds: row has inner.length + 1 slots.
      const above = row[j]!;
      const cost = outerUnit === inner.charCodeAt(j - 1) ? 0 : 1;
      const cell = Math.min(
        above + 1, // deletion
        left + 1, // insertion
        diagonal + cost // substitution
      );
      row[j] = cell;
      diagonal = above;
      left = cell;
    }
  }

  // After the final pass `left` is the last cell of the last row.
  return left;
}
68
+
69
/**
 * Normalized Levenshtein similarity in the range [0, 1]:
 * `1 - distance(a, b) / max(a.length, b.length)`.
 *
 * 1.0 means the strings are identical (two empty strings count as
 * identical); 0.0 means every position differs.
 *
 * Note: this is NOT the same measure as Python's
 * `difflib.SequenceMatcher.ratio()`, which is `2 * M / T` over matching
 * blocks; thresholds tuned for one do not carry over to the other.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns Similarity score between 0 and 1 inclusive.
 */
function similarityRatio(a: string, b: string): number {
  const maxLen = Math.max(a.length, b.length);
  if (maxLen === 0) return 1.0;
  return 1 - levenshtein(a, b) / maxLen;
}
78
+
79
+ // =============================================================================
80
+ // Entity ID Generation
81
+ // =============================================================================
82
+
83
/**
 * Generate a stable entity ID of the form `vault:type:slug`.
 *
 * The slug is the lower-cased name with every run of characters that are
 * not letters, combining marks or digits collapsed to a single `_`, and
 * leading/trailing `_` removed. The character classes are Unicode-aware so
 * that names written in non-Latin scripts keep a distinguishing slug instead
 * of collapsing to an empty string (which would make unrelated entities
 * share one ID). For pure-ASCII names the result is the same as with an
 * `[a-z0-9]` class.
 *
 * The ID is vault-qualified to prevent cross-vault entity merges.
 *
 * @param name - Entity display name.
 * @param type - Entity category.
 * @param vault - Vault the entity belongs to.
 * @returns The vault-qualified entity ID.
 */
function makeEntityId(name: string, type: string, vault: string = 'default'): string {
  const slug = name
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, '_')
    .replace(/^_|_$/g, '');
  return `${vault}:${type}:${slug}`;
}
91
+
92
+ // =============================================================================
93
+ // Entity Extraction (LLM-based)
94
+ // =============================================================================
95
+
96
/**
 * Extract named entities from document content using the LLM.
 *
 * Only the first 2000 characters of `content` are sent to the model. The
 * model's reply is parsed as JSON and each element is validated
 * individually: it must be an object with a string `name` of 2-100
 * characters and a string `type` from the allowed category list. Invalid
 * elements (including `null` or primitives) are dropped without affecting
 * the valid ones. At most 15 entities are returned.
 *
 * @param llm - LLM client used for generation.
 * @param title - Document title, included in the prompt.
 * @param content - Document body.
 * @returns Validated entities; an empty array when the model returns
 *   nothing usable or generation fails (the failure is logged).
 */
export async function extractEntities(
  llm: LlamaCpp,
  title: string,
  content: string
): Promise<ExtractedEntity[]> {
  const allowedTypes = new Set(['person', 'project', 'service', 'tool', 'concept', 'org', 'location']);

  // Narrow one untrusted JSON element to a well-formed entity.
  const isValidEntity = (candidate: unknown): candidate is ExtractedEntity => {
    if (typeof candidate !== 'object' || candidate === null) return false;
    const { name, type } = candidate as Record<string, unknown>;
    return (
      typeof name === 'string' &&
      typeof type === 'string' &&
      name.length >= 2 &&
      name.length <= 100 &&
      allowedTypes.has(type)
    );
  };

  const truncated = content.slice(0, 2000);

  const prompt = `Extract named entities from this document. Include people, projects, services, tools, organizations, and specific technical components.

Title: ${title}

Content:
${truncated}

Return ONLY valid JSON array:
[
{"name": "Entity Name", "type": "person|project|service|tool|concept|org|location"}
]

Rules:
- Only include specific, named entities (not generic concepts like "database" or "testing")
- Normalize names: "VM 202" not "vm202", "ClawMem" not "clawmem"
- 3-15 entities maximum
- Include the most specific type for each entity
Return ONLY the JSON array. /no_think`;

  try {
    const result = await llm.generate(prompt, {
      temperature: 0.2,
      maxTokens: 400,
    });

    if (!result) return [];

    // Model output is untrusted: treat it as unknown until validated.
    const parsed: unknown = extractJsonFromLLM(result.text);
    if (!Array.isArray(parsed)) return [];

    return parsed.filter(isValidEntity).slice(0, 15);
  } catch (err) {
    console.log(`[entity] LLM extraction failed:`, err);
    return [];
  }
}
152
+
153
+ // =============================================================================
154
+ // Entity Resolution (canonical normalization)
155
+ // =============================================================================
156
+
157
/**
 * Resolve an entity name to an existing canonical entity, if one is close
 * enough.
 *
 * Candidates are fetched from the `entities_fts` index (any word of the
 * name may match), restricted to the same entity type and vault. If the FTS
 * query throws, a substring `LIKE` search on `entity_nodes` is used instead.
 * Candidates are then ranked by normalized Levenshtein similarity against
 * the lower-cased, trimmed name; the best one at or above `threshold` wins.
 *
 * Scoped per vault to prevent cross-vault false merges.
 *
 * @param db - Database handle.
 * @param name - Entity name to resolve.
 * @param type - Entity category; only candidates of this type are considered.
 * @param vault - Vault to search in.
 * @param threshold - Minimum similarity (0-1) for a candidate to be accepted.
 * @returns entity_id of the best match, or null if there is none (new entity).
 */
export function resolveEntityCanonical(
  db: Database,
  name: string,
  type: string,
  vault: string = 'default',
  threshold: number = 0.75
): string | null {
  type Candidate = { entity_id: string; name: string; entity_type: string };

  const normalizedName = name.toLowerCase().trim();
  // A blank name has no terms to search for and cannot be a meaningful match.
  if (normalizedName.length === 0) return null;

  // Step 1: FTS5 candidate lookup — join to entity_nodes for vault scoping.
  // Each word becomes a quoted FTS5 string; an embedded `"` is escaped by
  // doubling it so ordinary punctuation does not force the fallback path.
  const ftsQuery = normalizedName
    .split(/\s+/)
    .map(term => `"${term.replace(/"/g, '""')}"`)
    .join(' OR ');

  let candidates: Candidate[] = [];
  try {
    candidates = db.prepare(`
      SELECT f.entity_id, f.name, f.entity_type
      FROM entities_fts f
      JOIN entity_nodes e ON e.entity_id = f.entity_id
      WHERE entities_fts MATCH ? AND f.entity_type = ? AND e.vault = ?
      LIMIT 20
    `).all(ftsQuery, type, vault) as Candidate[];
  } catch {
    // FTS5 may still be unavailable or reject the query — fall back to LIKE
    // on entity_nodes directly. `%`, `_` and `\` in the name are escaped so
    // they match literally instead of acting as wildcards.
    const likePattern = `%${normalizedName.replace(/[\\%_]/g, '\\$&')}%`;
    candidates = db.prepare(`
      SELECT entity_id, name, entity_type
      FROM entity_nodes
      WHERE LOWER(name) LIKE ? ESCAPE '\\' AND entity_type = ? AND vault = ?
      LIMIT 20
    `).all(likePattern, type, vault) as Candidate[];
  }

  if (candidates.length === 0) return null;

  // Step 2: Fuzzy rank candidates by Levenshtein similarity. Ties keep the
  // earlier candidate.
  let bestMatch: { entity_id: string; score: number } | null = null;
  for (const candidate of candidates) {
    const score = similarityRatio(normalizedName, candidate.name.toLowerCase());
    if (score >= threshold && (!bestMatch || score > bestMatch.score)) {
      bestMatch = { entity_id: candidate.entity_id, score };
    }
  }

  return bestMatch?.entity_id ?? null;
}
207
+
208
+ // =============================================================================
209
+ // Entity Storage + Mentions + Co-occurrences
210
+ // =============================================================================
211
+
212
/**
 * Upsert an entity into entity_nodes and entities_fts.
 *
 * Resolution order:
 * 1. If a canonical match exists, bump its mention count / last_seen.
 * 2. Otherwise derive a vault-qualified ID from the name. If a row with that
 *    ID already exists (the name differs too much to fuzzy-match but reduces
 *    to the same slug), bump that row instead of silently dropping the
 *    mention.
 * 3. Otherwise insert a new row.
 *
 * For steps 2 and 3 the FTS index gets a row for the entity only if it does
 * not have one yet, so repeated calls cannot pile up duplicate index rows.
 * FTS maintenance is best-effort: failures there are ignored.
 *
 * @param db - Database handle.
 * @param name - Entity name.
 * @param type - Entity category.
 * @param vault - Vault the entity belongs to.
 * @returns The entity_id (canonical, pre-existing, or newly created).
 */
export function upsertEntity(
  db: Database,
  name: string,
  type: string,
  vault: string = 'default'
): string {
  const touchEntity = db.prepare(`
    UPDATE entity_nodes
    SET mention_count = mention_count + 1,
        last_seen = datetime('now')
    WHERE entity_id = ?
  `);

  // Check for canonical match first
  const canonicalId = resolveEntityCanonical(db, name, type, vault);
  if (canonicalId) {
    touchEntity.run(canonicalId);
    return canonicalId;
  }

  // No fuzzy match — derive the ID (vault-qualified to prevent cross-vault merges)
  const entityId = makeEntityId(name, type, vault);

  const alreadyStored =
    db.prepare(`SELECT 1 FROM entity_nodes WHERE entity_id = ?`).get(entityId) != null;

  if (alreadyStored) {
    // Same slug as an existing entity: count the mention on that row.
    touchEntity.run(entityId);
  } else {
    db.prepare(`
      INSERT OR IGNORE INTO entity_nodes (entity_id, entity_type, name, description, created_at, mention_count, last_seen, vault)
      VALUES (?, ?, ?, NULL, datetime('now'), 1, datetime('now'), ?)
    `).run(entityId, type, name, vault);
  }

  // Make sure the entity is present in the FTS index exactly once. FTS5
  // tables enforce no uniqueness, so existence is checked explicitly.
  try {
    const indexed =
      db.prepare(`SELECT 1 FROM entities_fts WHERE entity_id = ? LIMIT 1`).get(entityId) != null;
    if (!indexed) {
      db.prepare(`
        INSERT OR IGNORE INTO entities_fts (entity_id, name, entity_type)
        VALUES (?, ?, ?)
      `).run(entityId, name.toLowerCase(), type);
    }
  } catch {
    // FTS access may fail if table doesn't exist yet — non-fatal
  }

  return entityId;
}
256
+
257
/**
 * Store the fact that a document mentions an entity.
 *
 * The row is written with INSERT OR IGNORE, so a conflicting existing row is
 * left untouched. A missing or empty `mentionText` is stored as NULL.
 *
 * @param db - Database handle.
 * @param entityId - ID of the mentioned entity.
 * @param docId - ID of the document containing the mention.
 * @param mentionText - Surface form of the mention, if known.
 */
export function recordEntityMention(
  db: Database,
  entityId: string,
  docId: number,
  mentionText?: string
): void {
  const storedText = mentionText ? mentionText : null;

  const insertMention = db.prepare(`
    INSERT OR IGNORE INTO entity_mentions (entity_id, doc_id, mention_text, created_at)
    VALUES (?, ?, ?, datetime('now'))
  `);

  insertMention.run(entityId, docId, storedText);
}
271
+
272
/**
 * Track co-occurrence between entity pairs that appear in the same document.
 *
 * Every unordered pair of distinct entity IDs is upserted once: a new pair
 * starts at count 1, an existing pair has its count incremented and its
 * last_cooccurred timestamp refreshed. Each pair is stored with the smaller
 * ID first so (a, b) and (b, a) land on the same row.
 *
 * Repeated IDs in the input are collapsed first; otherwise an ID listed
 * twice would be paired with itself and would double-count its other pairs.
 *
 * @param db - Database handle.
 * @param entityIds - Entity IDs found in one document (duplicates allowed).
 */
export function trackCoOccurrences(
  db: Database,
  entityIds: string[]
): void {
  const uniqueIds = [...new Set(entityIds)];
  if (uniqueIds.length < 2) return;

  const stmt = db.prepare(`
    INSERT INTO entity_cooccurrences (entity_a, entity_b, count, last_cooccurred)
    VALUES (?, ?, 1, datetime('now'))
    ON CONFLICT(entity_a, entity_b) DO UPDATE SET
      count = count + 1,
      last_cooccurred = datetime('now')
  `);

  // All pairs, ordered for a consistent key
  for (let i = 0; i < uniqueIds.length; i++) {
    const first = uniqueIds[i]!;
    for (let j = i + 1; j < uniqueIds.length; j++) {
      const second = uniqueIds[j]!;
      if (first < second) {
        stmt.run(first, second);
      } else {
        stmt.run(second, first);
      }
    }
  }
}
297
+
298
+ // =============================================================================
299
+ // Entity Enrichment Pipeline (called during A-MEM postIndexEnrich)
300
+ // =============================================================================
301
+
302
/**
 * Full entity enrichment for a document:
 * 1. Extract entities via LLM
 * 2. Resolve each to canonical form (upserting new ones)
 * 3. Record mentions
 * 4. Track co-occurrences between the distinct resolved entities
 * 5. Link this document to other documents that mention the same entities
 *
 * Extracted entities are deduplicated by type + normalized name before
 * resolution, and the resolved IDs are deduplicated again afterwards:
 * different surface names can resolve to the same canonical entity, and
 * counting that entity twice would create self-pairs and inflate pair
 * counts.
 *
 * Any error is logged and reported as 0; this function does not throw.
 *
 * @param db - Database handle.
 * @param llm - LLM client used for extraction.
 * @param docId - ID of the document to enrich (must be active).
 * @param vault - Vault the document's entities belong to.
 * @returns Number of distinct entities resolved for the document.
 */
export async function enrichDocumentEntities(
  db: Database,
  llm: LlamaCpp,
  docId: number,
  vault: string = 'default'
): Promise<number> {
  try {
    // Get document content
    const doc = db.prepare(`
      SELECT d.title, c.doc as body
      FROM documents d
      JOIN content c ON c.hash = d.hash
      WHERE d.id = ? AND d.active = 1
    `).get(docId) as { title: string; body: string } | null;

    if (!doc) {
      console.log(`[entity] Document ${docId} not found or inactive`);
      return 0;
    }

    // Step 1: Extract entities
    const entities = await extractEntities(llm, doc.title, doc.body);
    if (entities.length === 0) {
      console.log(`[entity] No entities found in docId ${docId}`);
      return 0;
    }

    // Step 2-3: Skip repeated name+type, resolve the rest and record mentions.
    // A Set keeps first-seen order while collapsing aliases that resolve to
    // the same entity ID.
    const seenKeys = new Set<string>();
    const resolvedIdSet = new Set<string>();
    for (const entity of entities) {
      const key = `${entity.type}:${entity.name.toLowerCase().trim()}`;
      if (seenKeys.has(key)) continue;
      seenKeys.add(key);

      const entityId = upsertEntity(db, entity.name, entity.type, vault);
      recordEntityMention(db, entityId, docId, entity.name);
      resolvedIdSet.add(entityId);
    }
    const resolvedIds = [...resolvedIdSet];

    // Step 4: Track co-occurrences (distinct IDs prevent inflated pair counts)
    trackCoOccurrences(db, resolvedIds);

    // Step 5: Create entity edges in memory_relations
    const findOtherDocs = db.prepare(`
      SELECT doc_id FROM entity_mentions
      WHERE entity_id = ? AND doc_id != ?
      LIMIT 10
    `);
    // Unidirectional edge; graph traversal handles inbound for entity/semantic types
    const insertRelation = db.prepare(`
      INSERT OR IGNORE INTO memory_relations (source_id, target_id, relation_type, weight, metadata, created_at)
      VALUES (?, ?, 'entity', 0.7, ?, datetime('now'))
    `);

    for (const entityId of resolvedIds) {
      // Other documents mentioning this entity
      const otherDocs = findOtherDocs.all(entityId, docId) as { doc_id: number }[];
      for (const other of otherDocs) {
        insertRelation.run(docId, other.doc_id, JSON.stringify({ entity: entityId }));
      }
    }

    console.log(`[entity] Enriched docId ${docId}: ${resolvedIds.length} entities, ${entities.length} extracted`);
    return resolvedIds.length;
  } catch (err) {
    console.log(`[entity] Error enriching docId ${docId}:`, err);
    return 0;
  }
}
384
+
385
+ // =============================================================================
386
+ // Entity Graph Traversal (for intent_search ENTITY queries)
387
+ // =============================================================================
388
+
389
/**
 * Find documents related to a set of seed documents through the entity
 * co-occurrence graph.
 *
 * 1. Collect the entities mentioned in the seed documents.
 * 2. Take up to 30 co-occurrence edges touching those entities, strongest
 *    first, and read the entity on the other end of each edge.
 * 3. For each such neighbor entity, take up to 10 documents mentioning it.
 *
 * Seed documents are never returned, and each document is returned once,
 * attributed to the strongest edge that reached it. The score is
 * `min(1, ln(1 + count) / 5)` of that edge.
 *
 * @param db - Database handle.
 * @param seedDocIds - IDs of the documents to start from.
 * @param limit - Maximum number of results.
 * @returns Neighbor documents sorted by descending score.
 */
export function getEntityGraphNeighbors(
  db: Database,
  seedDocIds: number[],
  limit: number = 20
): { docId: number; score: number; viaEntity: string }[] {
  if (seedDocIds.length === 0) return [];

  // Step 1: Find entities mentioned in seed documents
  const placeholders = seedDocIds.map(() => '?').join(',');
  const seedEntities = db.prepare(`
    SELECT DISTINCT entity_id FROM entity_mentions
    WHERE doc_id IN (${placeholders})
  `).all(...seedDocIds) as { entity_id: string }[];

  if (seedEntities.length === 0) return [];

  // Step 2: Find co-occurring entities. The ID list is bound three times,
  // once per IN (...) clause.
  const entityIds = seedEntities.map(e => e.entity_id);
  const entityPlaceholders = entityIds.map(() => '?').join(',');

  const cooccurring = db.prepare(`
    SELECT
      CASE WHEN entity_a IN (${entityPlaceholders}) THEN entity_b ELSE entity_a END as neighbor_entity,
      count
    FROM entity_cooccurrences
    WHERE entity_a IN (${entityPlaceholders}) OR entity_b IN (${entityPlaceholders})
    ORDER BY count DESC
    LIMIT 30
  `).all(...entityIds, ...entityIds, ...entityIds) as { neighbor_entity: string; count: number }[];

  if (cooccurring.length === 0) return [];

  // Step 3: Find documents mentioning co-occurring entities
  const docsForEntity = db.prepare(`
    SELECT doc_id FROM entity_mentions
    WHERE entity_id = ?
    LIMIT 10
  `);

  const results: { docId: number; score: number; viaEntity: string }[] = [];
  const seenDocs = new Set(seedDocIds);
  const visitedEntities = new Set<string>();

  for (const co of cooccurring) {
    // Rows are ordered by count, so the first edge reaching an entity is the
    // strongest; later edges to the same entity cannot add new documents.
    if (visitedEntities.has(co.neighbor_entity)) continue;
    visitedEntities.add(co.neighbor_entity);

    // Score: normalized co-occurrence count (log scale)
    const score = Math.min(1.0, Math.log1p(co.count) / 5);
    const docs = docsForEntity.all(co.neighbor_entity) as { doc_id: number }[];

    for (const doc of docs) {
      if (seenDocs.has(doc.doc_id)) continue;
      seenDocs.add(doc.doc_id);
      results.push({ docId: doc.doc_id, score, viaEntity: co.neighbor_entity });
    }
  }

  return results.sort((a, b) => b.score - a.score).slice(0, limit);
}
448
+
449
/**
 * Search for entities by name (for MCP tool exposure).
 *
 * The lower-cased, trimmed query is split on whitespace and matched against
 * the `entities_fts` index (any word may match). If the FTS query throws, a
 * substring `LIKE` search on `entity_nodes` is used instead. Results are
 * ordered by mention count, most mentioned first, and each is enriched with
 * the summed count of all co-occurrence edges touching it.
 *
 * The search is not restricted to a vault.
 *
 * @param db - Database handle.
 * @param query - Free-text entity name query.
 * @param limit - Maximum number of entities to return.
 * @returns Matching entities with mention and co-occurrence totals.
 */
export function searchEntities(
  db: Database,
  query: string,
  limit: number = 10
): { entity_id: string; name: string; type: string; mention_count: number; cooccurrence_count: number }[] {
  type EntityRow = { entity_id: string; name: string; entity_type: string; mention_count: number | null };

  const normalizedQuery = query.toLowerCase().trim();

  // Each word becomes a quoted FTS5 string; an embedded `"` is escaped by
  // doubling it so it cannot break the query syntax.
  const ftsQuery = normalizedQuery
    .split(/\s+/)
    .map(term => `"${term.replace(/"/g, '""')}"`)
    .join(' OR ');

  // Try FTS first. DISTINCT guards against an entity that has more than one
  // row in the FTS index showing up repeatedly.
  let rows: EntityRow[] = [];
  try {
    rows = db.prepare(`
      SELECT DISTINCT e.entity_id, e.name, e.entity_type, e.mention_count
      FROM entities_fts f
      JOIN entity_nodes e ON e.entity_id = f.entity_id
      WHERE entities_fts MATCH ?
      ORDER BY e.mention_count DESC
      LIMIT ?
    `).all(ftsQuery, limit) as EntityRow[];
  } catch {
    // Fallback to LIKE; `%`, `_` and `\` in the query are matched literally.
    const likePattern = `%${normalizedQuery.replace(/[\\%_]/g, '\\$&')}%`;
    rows = db.prepare(`
      SELECT entity_id, name, entity_type, mention_count
      FROM entity_nodes
      WHERE LOWER(name) LIKE ? ESCAPE '\\'
      ORDER BY mention_count DESC
      LIMIT ?
    `).all(likePattern, limit) as EntityRow[];
  }

  // Enrich with co-occurrence count (statement prepared once, run per row)
  const cooccurrenceTotal = db.prepare(`
    SELECT COALESCE(SUM(count), 0) as total
    FROM entity_cooccurrences
    WHERE entity_a = ? OR entity_b = ?
  `);

  return rows.map(row => {
    const { total } = cooccurrenceTotal.get(row.entity_id, row.entity_id) as { total: number };
    return {
      entity_id: row.entity_id,
      name: row.name,
      type: row.entity_type,
      mention_count: row.mention_count ?? 0,
      cooccurrence_count: total,
    };
  });
}