clawmem 0.2.1 → 0.2.3

package/AGENTS.md CHANGED
@@ -89,7 +89,7 @@ curl http://host:8090/v1/models
89
89
  | `CLAWMEM_EMBED_MAX_CHARS` | `6000` | Max chars per embedding input. Default fits EmbeddingGemma (2048 tokens). Set to `1100` for granite-278m (512 tokens). Cloud providers skip truncation. |
90
90
  | `CLAWMEM_EMBED_TPM_LIMIT` | `100000` | Tokens-per-minute limit for cloud embedding pacing. Match to your provider tier: Free 100000, Paid 2000000, Premium 50000000. |
91
91
  | `CLAWMEM_EMBED_DIMENSIONS` | (none) | Output dimensions for OpenAI `text-embedding-3-*` Matryoshka models (e.g. `512`, `1024`). Only sent when URL contains `openai.com`. |
92
- | `CLAWMEM_LLM_URL` | `http://localhost:8089` | LLM server for intent, expansion, A-MEM. Falls back to `node-llama-cpp` if unset + `NO_LOCAL_MODELS=false`. |
92
+ | `CLAWMEM_LLM_URL` | `http://localhost:8089` | LLM server for intent, expansion, A-MEM, and entity extraction. Falls back to `node-llama-cpp` if unset + `NO_LOCAL_MODELS=false`. For better entity extraction quality, point at a 7B+ model or cloud API during `reindex --enrich` (see `docs/internals/entity-resolution.md`). |
93
93
  | `CLAWMEM_RERANK_URL` | `http://localhost:8090` | Reranker server. Falls back to `node-llama-cpp` if unset + `NO_LOCAL_MODELS=false`. |
94
94
  | `CLAWMEM_NO_LOCAL_MODELS` | `false` | Blocks `node-llama-cpp` from auto-downloading GGUF models. Set `true` for remote-only setups. |
95
95
  | `CLAWMEM_VAULTS` | (none) | JSON map of vault name → SQLite path for multi-vault mode. E.g. `{"work":"~/.cache/clawmem/work.sqlite"}` |
@@ -485,7 +485,7 @@ The `memory_relations` table is populated by multiple independent sources:
485
485
  | Beads `syncBeadsIssues()` | causal, supporting, semantic | `beads_sync` MCP tool or watcher (.beads/ change) | Queries `bd` CLI (Dolt backend). Maps beads deps: blocks→causal, discovered-from→supporting, relates-to→semantic, plus conditional-blocks→causal, caused-by→causal, supersedes→supporting. Metadata: `{origin: "beads"}`. |
486
486
  | `buildTemporalBackbone()` | temporal | `build_graphs` MCP tool (manual) | Creation-order edges between all active docs. |
487
487
  | `buildSemanticGraph()` | semantic | `build_graphs` MCP tool (manual) | Pure cosine similarity. PK collision: `INSERT OR IGNORE` means A-MEM semantic edges take precedence if they exist first. |
488
- | Entity co-occurrence graph | entity | A-MEM enrichment (indexing) | LLM entity extraction → canonical normalization (FTS5 + Levenshtein) → `entity_mentions` + `entity_cooccurrences` tables. Feeds ENTITY intent queries and MPFP `[entity, semantic]` patterns. |
488
+ | Entity co-occurrence graph | entity | A-MEM enrichment (indexing) | LLM entity extraction → quality filters (title/length/blocklist/location validation) → type-agnostic canonical resolution within compatibility buckets (person, org, location, tech=project/service/tool/concept) → `entity_mentions` + `entity_cooccurrences` tables. Entity edges use IDF-based specificity scoring. Feeds ENTITY intent queries and MPFP `[entity, semantic]` patterns. |
489
489
  | `consolidated_observations` | supporting | Consolidation worker (background) | 3-tier consolidation: facts → observations → mental models. Observations track `proof_count`, `trend` (STABLE/STRENGTHENING/WEAKENING/STALE), and source links. |
490
490
 
491
491
  **Edge collision:** Both `generateMemoryLinks()` and `buildSemanticGraph()` insert `relation_type='semantic'`. PK is `(source_id, target_id, relation_type)` — first writer wins.
@@ -555,28 +555,6 @@ User Query → Intent Classification (WHY/WHEN/ENTITY/WHAT)
555
555
  | `candidateLimit` | Yes (default 30) | No |
556
556
  | Best for | Most queries, progressive disclosure | Causal chains spanning multiple docs |
557
557
 
558
- ## Operational Issue Tracking
559
-
560
- When encountering tool failures, instruction contradictions, retrieval gaps, or workflow friction that would benefit from a fix:
561
-
562
- Write to `docs/issues/YYYY-MM-DD-<slug>.md` with: category, severity, what happened, what was expected, context, suggested fix.
563
-
564
- **File structure:**
565
- ```
566
- # <title>
567
- - Category: tool-failure | instruction-gap | workflow-friction | retrieval-gap | inconsistency
568
- - Severity: critical | high | medium
569
- - Status: open | resolved
570
-
571
- ## Observed
572
- ## Expected
573
- ## Context
574
- ## Suggested Fix
575
- ```
576
-
577
- **Triggers:** repeated tool error, instruction that contradicts observed behavior, retrieval consistently missing known content, workflow requiring unnecessary steps.
578
-
579
- **Do NOT log:** one-off transient errors, user-caused issues, issues already recorded.
580
558
 
581
559
  ## Troubleshooting
582
560
 
@@ -684,7 +662,7 @@ clawmem consolidate [--dry-run] # Find and archive duplicate low-confidence docu
684
662
  ## Integration Notes
685
663
 
686
664
  - **Memory nudge (v0.2.0):** Every N prompts (default 15) without a lifecycle MCP tool call (`memory_pin`/`memory_forget`/`memory_snooze`), context-surfacing appends `<vault-nudge>` prompting proactive memory management. Counter resets on lifecycle tool use. Configure via `CLAWMEM_NUDGE_INTERVAL` (0 to disable).
687
- - **Entity resolution (v0.2.0):** A-MEM enrichment now extracts named entities via LLM, resolves to canonical forms using `entities_fts` + Levenshtein fuzzy matching, and tracks co-occurrence. Entity graph traversal available for ENTITY intent queries via `intent_search`.
665
+ - **Entity resolution (v0.2.0+):** A-MEM enrichment extracts named entities via LLM, resolves to canonical forms using FTS5 + Levenshtein fuzzy matching with **type-agnostic compatibility buckets** (person, org, location stay separate; project/service/tool/concept merge freely as "tech" bucket). Quality filters reject title-as-entity, long names, template placeholders, and invalid locations. Entity edges use IDF-based specificity scoring (rare entities create edges; ubiquitous entities alone cannot). See `docs/internals/entity-resolution.md` for customization (extending type vocabulary and buckets).
688
666
  - QMD retrieval (BM25, vector, RRF, rerank, query expansion) is forked into ClawMem. Do not call standalone QMD tools.
689
667
  - SAME (composite scoring), MAGMA (intent + graph), A-MEM (self-evolving notes) layer on top of QMD substrate.
690
668
  - Three `llama-server` instances (embedding, LLM, reranker) on local or remote GPU. Wrapper defaults to `localhost:8088/8089/8090`.
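
The compatibility buckets mentioned in the integration notes above reduce to a small lookup table. A condensed TypeScript sketch of the rule (it mirrors the `ENTITY_BUCKETS` map added in `src/entity.ts` further down in this diff; the `bucketOf` helper name is illustrative):

```ts
// Types in the same bucket may merge during canonical resolution; cross-bucket matches are rejected.
const ENTITY_BUCKETS: Record<string, string> = {
  person: 'person',
  org: 'org',
  location: 'location',
  project: 'tech',
  service: 'tech',
  tool: 'tech',
  concept: 'tech',
};

// Unknown types fall back to their own isolated bucket, so they never merge by accident.
const bucketOf = (type: string): string => ENTITY_BUCKETS[type] ?? type;

// "Redis" extracted once as a tool and once as a service can resolve to one canonical entity,
// while a person named "Andrea" never merges with a project named "Andrea".
console.log(bucketOf('tool') === bucketOf('service'));   // true  -> may merge
console.log(bucketOf('person') === bucketOf('project')); // false -> never merges
```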
package/CLAUDE.md CHANGED
@@ -89,7 +89,7 @@ curl http://host:8090/v1/models
89
89
  | `CLAWMEM_EMBED_MAX_CHARS` | `6000` | Max chars per embedding input. Default fits EmbeddingGemma (2048 tokens). Set to `1100` for granite-278m (512 tokens). Cloud providers skip truncation. |
90
90
  | `CLAWMEM_EMBED_TPM_LIMIT` | `100000` | Tokens-per-minute limit for cloud embedding pacing. Match to your provider tier: Free 100000, Paid 2000000, Premium 50000000. |
91
91
  | `CLAWMEM_EMBED_DIMENSIONS` | (none) | Output dimensions for OpenAI `text-embedding-3-*` Matryoshka models (e.g. `512`, `1024`). Only sent when URL contains `openai.com`. |
92
- | `CLAWMEM_LLM_URL` | `http://localhost:8089` | LLM server for intent, expansion, A-MEM. Falls back to `node-llama-cpp` if unset + `NO_LOCAL_MODELS=false`. |
92
+ | `CLAWMEM_LLM_URL` | `http://localhost:8089` | LLM server for intent, expansion, A-MEM, and entity extraction. Falls back to `node-llama-cpp` if unset + `NO_LOCAL_MODELS=false`. For better entity extraction quality, point at a 7B+ model or cloud API during `reindex --enrich` (see `docs/internals/entity-resolution.md`). |
93
93
  | `CLAWMEM_RERANK_URL` | `http://localhost:8090` | Reranker server. Falls back to `node-llama-cpp` if unset + `NO_LOCAL_MODELS=false`. |
94
94
  | `CLAWMEM_NO_LOCAL_MODELS` | `false` | Blocks `node-llama-cpp` from auto-downloading GGUF models. Set `true` for remote-only setups. |
95
95
  | `CLAWMEM_VAULTS` | (none) | JSON map of vault name → SQLite path for multi-vault mode. E.g. `{"work":"~/.cache/clawmem/work.sqlite"}` |
@@ -485,7 +485,7 @@ The `memory_relations` table is populated by multiple independent sources:
485
485
  | Beads `syncBeadsIssues()` | causal, supporting, semantic | `beads_sync` MCP tool or watcher (.beads/ change) | Queries `bd` CLI (Dolt backend). Maps beads deps: blocks→causal, discovered-from→supporting, relates-to→semantic, plus conditional-blocks→causal, caused-by→causal, supersedes→supporting. Metadata: `{origin: "beads"}`. |
486
486
  | `buildTemporalBackbone()` | temporal | `build_graphs` MCP tool (manual) | Creation-order edges between all active docs. |
487
487
  | `buildSemanticGraph()` | semantic | `build_graphs` MCP tool (manual) | Pure cosine similarity. PK collision: `INSERT OR IGNORE` means A-MEM semantic edges take precedence if they exist first. |
488
- | Entity co-occurrence graph | entity | A-MEM enrichment (indexing) | LLM entity extraction → canonical normalization (FTS5 + Levenshtein) → `entity_mentions` + `entity_cooccurrences` tables. Feeds ENTITY intent queries and MPFP `[entity, semantic]` patterns. |
488
+ | Entity co-occurrence graph | entity | A-MEM enrichment (indexing) | LLM entity extraction → quality filters (title/length/blocklist/location validation) → type-agnostic canonical resolution within compatibility buckets (person, org, location, tech=project/service/tool/concept) → `entity_mentions` + `entity_cooccurrences` tables. Entity edges use IDF-based specificity scoring. Feeds ENTITY intent queries and MPFP `[entity, semantic]` patterns. |
489
489
  | `consolidated_observations` | supporting | Consolidation worker (background) | 3-tier consolidation: facts → observations → mental models. Observations track `proof_count`, `trend` (STABLE/STRENGTHENING/WEAKENING/STALE), and source links. |
490
490
 
491
491
  **Edge collision:** Both `generateMemoryLinks()` and `buildSemanticGraph()` insert `relation_type='semantic'`. PK is `(source_id, target_id, relation_type)` — first writer wins.
@@ -555,28 +555,6 @@ User Query → Intent Classification (WHY/WHEN/ENTITY/WHAT)
555
555
  | `candidateLimit` | Yes (default 30) | No |
556
556
  | Best for | Most queries, progressive disclosure | Causal chains spanning multiple docs |
557
557
 
558
- ## Operational Issue Tracking
559
-
560
- When encountering tool failures, instruction contradictions, retrieval gaps, or workflow friction that would benefit from a fix:
561
-
562
- Write to `docs/issues/YYYY-MM-DD-<slug>.md` with: category, severity, what happened, what was expected, context, suggested fix.
563
-
564
- **File structure:**
565
- ```
566
- # <title>
567
- - Category: tool-failure | instruction-gap | workflow-friction | retrieval-gap | inconsistency
568
- - Severity: critical | high | medium
569
- - Status: open | resolved
570
-
571
- ## Observed
572
- ## Expected
573
- ## Context
574
- ## Suggested Fix
575
- ```
576
-
577
- **Triggers:** repeated tool error, instruction that contradicts observed behavior, retrieval consistently missing known content, workflow requiring unnecessary steps.
578
-
579
- **Do NOT log:** one-off transient errors, user-caused issues, issues already recorded.
580
558
 
581
559
  ## Troubleshooting
582
560
 
@@ -684,7 +662,7 @@ clawmem consolidate [--dry-run] # Find and archive duplicate low-confidence docu
684
662
  ## Integration Notes
685
663
 
686
664
  - **Memory nudge (v0.2.0):** Every N prompts (default 15) without a lifecycle MCP tool call (`memory_pin`/`memory_forget`/`memory_snooze`), context-surfacing appends `<vault-nudge>` prompting proactive memory management. Counter resets on lifecycle tool use. Configure via `CLAWMEM_NUDGE_INTERVAL` (0 to disable).
687
- - **Entity resolution (v0.2.0):** A-MEM enrichment now extracts named entities via LLM, resolves to canonical forms using `entities_fts` + Levenshtein fuzzy matching, and tracks co-occurrence. Entity graph traversal available for ENTITY intent queries via `intent_search`.
665
+ - **Entity resolution (v0.2.0+):** A-MEM enrichment extracts named entities via LLM, resolves to canonical forms using FTS5 + Levenshtein fuzzy matching with **type-agnostic compatibility buckets** (person, org, location stay separate; project/service/tool/concept merge freely as "tech" bucket). Quality filters reject title-as-entity, long names, template placeholders, and invalid locations. Entity edges use IDF-based specificity scoring (rare entities create edges; ubiquitous entities alone cannot). See `docs/internals/entity-resolution.md` for customization (extending type vocabulary and buckets).
688
666
  - QMD retrieval (BM25, vector, RRF, rerank, query expansion) is forked into ClawMem. Do not call standalone QMD tools.
689
667
  - SAME (composite scoring), MAGMA (intent + graph), A-MEM (self-evolving notes) layer on top of QMD substrate.
690
668
  - Three `llama-server` instances (embedding, LLM, reranker) on local or remote GPU. Wrapper defaults to `localhost:8088/8089/8090`.
package/README.md CHANGED
@@ -44,9 +44,7 @@ Runs fully local with no API keys and no cloud services. Integrates via Claude C
44
44
 
45
45
  ### v0.2.0 Enhancements
46
46
 
47
- Seven patterns extracted from competitor analysis ([Hindsight](https://github.com/vectorize-io/hindsight), [Hermes Agent](https://github.com/NousResearch/hermes-agent), [claude-mem](https://github.com/thedotmack/claude-mem)):
48
-
49
- - **Entity resolution + co-occurrence graph** — LLM entity extraction during A-MEM enrichment, canonical normalization via FTS5 + Levenshtein fuzzy matching, co-occurrence tracking, entity graph traversal for ENTITY intent queries
47
+ - **Entity resolution + co-occurrence graph** — LLM entity extraction with quality filters, type-agnostic canonical resolution within [compatibility buckets](docs/internals/entity-resolution.md) (extensible type vocabulary), IDF-based entity edge scoring, co-occurrence tracking, entity graph traversal for ENTITY intent queries
50
48
  - **MPFP graph retrieval** — Multi-Path Fact Propagation with meta-path patterns per intent, hop-synchronized edge cache, Forward Push with α=0.15 teleport probability. Replaces single-beam traversal for causal/entity/temporal queries.
51
49
  - **Temporal query extraction** — regex-based date range extraction from natural language queries ("last week", "March 2026"), wired as WHERE filters into BM25 and vector search
52
50
  - **4-way parallel retrieval** — temporal proximity and entity graph channels added as parallel RRF legs in `query` tool (Tier 3 only), alongside existing BM25 + vector channels
@@ -1046,6 +1044,8 @@ Built on the shoulders of:
1046
1044
  - [Beads](https://github.com/steveyegge/beads) — Dolt-backed issue tracker for AI agents
1047
1045
  - [claude-mem](https://github.com/thedotmack/claude-mem) — Claude Code memory integration reference
1048
1046
  - [Engram](https://github.com/Gentleman-Programming/engram) — observation dedup window, topic-key upsert pattern, temporal timeline navigation, duplicate metadata scoring signals
1047
+ - [Hermes Agent](https://github.com/NousResearch/hermes-agent) — memory nudge system (periodic lifecycle tool prompting)
1048
+ - [Hindsight](https://github.com/vectorize-io/hindsight) — entity resolution, MPFP graph traversal, temporal extraction, 3-tier consolidation, observation invalidation, 4-way parallel retrieval
1049
1049
  - [MAGMA](https://arxiv.org/abs/2501.13956) — multi-graph memory agent
1050
1050
  - [memory-lancedb-pro](https://github.com/CortexReach/memory-lancedb-pro) — retrieval gate, length normalization, MMR diversity, access reinforcement algorithms
1051
1051
  - [OpenViking](https://github.com/volcengine/OpenViking) — query decomposition patterns, collection-scoped retrieval, transaction-safe indexing
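
The 4-way parallel retrieval bullet above fuses BM25, vector, temporal-proximity, and entity-graph results with reciprocal rank fusion. A minimal, generic RRF sketch in TypeScript; the constant `k = 60`, the equal channel weighting, and the doc IDs are illustrative assumptions, not ClawMem's actual tuning:

```ts
// Generic reciprocal rank fusion over four ranked lists, one per retrieval channel.
function rrfFuse(channels: number[][], k = 60): Map<number, number> {
  const scores = new Map<number, number>();
  for (const ranking of channels) {
    ranking.forEach((docId, rank) => {
      scores.set(docId, (scores.get(docId) ?? 0) + 1 / (k + rank + 1));
    });
  }
  return scores; // higher score = surfaced earlier after fusion
}

// Example: doc 7 ranks well in the vector and entity-graph legs, so it outranks
// doc 3, which only the BM25 leg returned.
const fused = rrfFuse([
  [3, 9, 7], // bm25
  [7, 4],    // vector
  [12, 9],   // temporal proximity
  [7, 12],   // entity graph
]);
console.log(fused);
```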
package/SKILL.md CHANGED
@@ -677,29 +677,6 @@ clawmem consolidate [--dry-run] # Find and archive duplicate low-confidence docu
677
677
  # Jaccard similarity within same collection
678
678
  ```
679
679
 
680
- ---
681
-
682
- ## Operational Issue Tracking
683
-
684
- When encountering tool failures, instruction contradictions, retrieval gaps, or workflow friction:
685
-
686
- Write to `docs/issues/YYYY-MM-DD-<slug>.md`:
687
-
688
- ```
689
- # <title>
690
- - Category: tool-failure | instruction-gap | workflow-friction | retrieval-gap | inconsistency
691
- - Severity: critical | high | medium
692
- - Status: open | resolved
693
-
694
- ## Observed
695
- ## Expected
696
- ## Context
697
- ## Suggested Fix
698
- ```
699
-
700
- **Triggers:** repeated tool error, instruction contradicting observed behavior, retrieval consistently missing known content.
701
-
702
- **Do NOT log:** one-off transient errors, user-caused issues, already recorded issues.
703
680
 
704
681
  ---
705
682
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmem",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "On-device context engine and memory for AI agents. Claude Code and OpenClaw. Hooks + MCP server + hybrid RAG search.",
5
5
  "type": "module",
6
6
  "bin": {
package/src/entity.ts CHANGED
@@ -8,7 +8,8 @@
8
8
  */
9
9
 
10
10
  import type { Database } from "bun:sqlite";
11
- import type { LlamaCpp } from "./llm.ts";
11
+ import { createHash } from "crypto";
12
+ import type { LLM } from "./llm.ts";
12
13
  import { extractJsonFromLLM } from "./amem.ts";
13
14
 
14
15
  // =============================================================================
@@ -76,6 +77,77 @@ function similarityRatio(a: string, b: string): number {
76
77
  return 1 - levenshtein(a, b) / maxLen;
77
78
  }
78
79
 
80
+ // =============================================================================
81
+ // Quality Filters
82
+ // =============================================================================
83
+
84
+ const ENTITY_BLOCKLIST = new Set([
85
+ 'entity name', 'entity type', 'description', 'example',
86
+ 'name', 'type', 'value', 'item',
87
+ 'exampletool', 'jane smith', // prompt examples the LLM echoes
88
+ ]);
89
+
90
+ /**
91
+ * Check if an extracted entity is low quality and should be rejected.
92
+ * Catches: title-as-entity, long names, template placeholders, heading labels.
93
+ */
94
+ function isLowQualityEntity(name: string, type: string, docTitle: string): boolean {
95
+ const normalized = name.toLowerCase().trim();
96
+ const normalizedTitle = docTitle.toLowerCase().trim();
97
+
98
+ // Exact or near-exact title match (Levenshtein > 0.85)
99
+ if (normalizedTitle.length > 0 && similarityRatio(normalized, normalizedTitle) > 0.85) return true;
100
+
101
+ // Too long — likely a title or sentence fragment
102
+ if (name.length > 60) return true;
103
+
104
+ // Template placeholders / generic words
105
+ if (ENTITY_BLOCKLIST.has(normalized)) return true;
106
+
107
+ // Heading labels (trailing colon)
108
+ if (name.endsWith(':')) return true;
109
+
110
+ // Location low-trust: if type is location, validate it
111
+ if (type === 'location' && !isValidLocation(name)) return true;
112
+
113
+ return false;
114
+ }
115
+
116
+ /**
117
+ * Validate that a location entity is actually geographic / infrastructure.
118
+ * Rejects long non-geographic names that the LLM mistyped as location.
119
+ */
120
+ function isValidLocation(name: string): boolean {
121
+ // IP addresses
122
+ if (/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/.test(name)) return true;
123
+ // VM identifiers (e.g., "VM 202", "VM 200")
124
+ if (/^VM\s+\d+/i.test(name)) return true;
125
+ // Positive-signal only — no length-based or FQDN fallback
126
+ return false;
127
+ }
128
+
129
+ // =============================================================================
130
+ // Compatibility Buckets (for type-agnostic canonical resolution)
131
+ // =============================================================================
132
+
133
+ // Each bucket contains types that are semantically interchangeable for merging.
134
+ // Types in the same bucket can merge; cross-bucket merges are rejected.
135
+ // The 'tech' bucket captures the common LLM confusion between project/service/tool/concept.
136
+ // Unknown types default to their own isolated bucket (no false merges).
137
+ const ENTITY_BUCKETS: Record<string, string> = {
138
+ person: 'person',
139
+ org: 'org',
140
+ location: 'location',
141
+ project: 'tech',
142
+ service: 'tech',
143
+ tool: 'tech',
144
+ concept: 'tech',
145
+ };
146
+
147
+ function getEntityBucket(type: string): string {
148
+ return ENTITY_BUCKETS[type] ?? type; // unknown types form their own bucket
149
+ }
150
+
79
151
  // =============================================================================
80
152
  // Entity ID Generation
81
153
  // =============================================================================
@@ -98,7 +170,7 @@ function makeEntityId(name: string, type: string, vault: string = 'default'): st
98
170
  * Returns a list of (name, type) pairs.
99
171
  */
100
172
  export async function extractEntities(
101
- llm: LlamaCpp,
173
+ llm: LLM,
102
174
  title: string,
103
175
  content: string
104
176
  ): Promise<ExtractedEntity[]> {
@@ -112,15 +184,16 @@ Content:
112
184
  ${truncated}
113
185
 
114
186
  Return ONLY valid JSON array:
115
- [
116
- {"name": "Entity Name", "type": "person|project|service|tool|concept|org|location"}
117
- ]
187
+ [{"name": "...", "type": "person|project|service|tool|concept|org|location"}]
118
188
 
119
189
  Rules:
120
190
  - Only include specific, named entities (not generic concepts like "database" or "testing")
121
191
  - Normalize names: "VM 202" not "vm202", "ClawMem" not "clawmem"
122
- - 3-15 entities maximum
192
+ - 0-10 entities. Return empty array [] if no specific entities found
123
193
  - Include the most specific type for each entity
194
+ - Do NOT extract the document's title as an entity
195
+ - Do NOT extract heading labels, section names, or sentence fragments
196
+ - Only extract entities that could meaningfully appear in OTHER documents
124
197
  Return ONLY the JSON array. /no_think`;
125
198
 
126
199
  try {
@@ -134,7 +207,7 @@ Return ONLY the JSON array. /no_think`;
134
207
  const parsed = extractJsonFromLLM(result.text) as ExtractedEntity[] | null;
135
208
  if (!Array.isArray(parsed)) return [];
136
209
 
137
- // Validate and filter
210
+ // Validate, filter, and quality-check
138
211
  return parsed
139
212
  .filter(e =>
140
213
  typeof e.name === 'string' &&
@@ -143,7 +216,8 @@ Return ONLY the JSON array. /no_think`;
143
216
  e.name.length <= 100 &&
144
217
  ['person', 'project', 'service', 'tool', 'concept', 'org', 'location'].includes(e.type)
145
218
  )
146
- .slice(0, 15);
219
+ .filter(e => !isLowQualityEntity(e.name, e.type, title))
220
+ .slice(0, 10);
147
221
  } catch (err) {
148
222
  console.log(`[entity] LLM extraction failed:`, err);
149
223
  return [];
@@ -158,7 +232,13 @@ Return ONLY the JSON array. /no_think`;
158
232
  * Resolve an entity name to its canonical form.
159
233
  * Uses FTS5 candidate lookup + Levenshtein fuzzy matching.
160
234
  *
161
- * Scoped per vault (via vault parameter) to prevent cross-vault false merges.
235
+ * Type-agnostic within compatibility buckets:
236
+ * - person: only merges with person
237
+ * - org: only merges with org
238
+ * - location: only merges with location
239
+ * - tech (project/service/tool/concept): merges freely within bucket
240
+ *
241
+ * Scoped per vault to prevent cross-vault false merges.
162
242
  *
163
243
  * @returns entity_id of canonical match, or null if no match (new entity)
164
244
  */
@@ -170,34 +250,41 @@ export function resolveEntityCanonical(
170
250
  threshold: number = 0.75
171
251
  ): string | null {
172
252
  const normalizedName = name.toLowerCase().trim();
253
+ const inputBucket = getEntityBucket(type);
254
+
255
+ // Use lower threshold for person names (enables "Andre (Dre) Konrad" ↔ "Dre Konrad")
256
+ const effectiveThreshold = inputBucket === 'person' ? 0.65 : threshold;
173
257
 
174
- // Step 1: FTS5 candidate lookup — join to entity_nodes for vault scoping
258
+ // Step 1: FTS5 candidate lookup — type-agnostic, vault-scoped
175
259
  let candidates: { entity_id: string; name: string; entity_type: string }[] = [];
176
260
  try {
177
261
  candidates = db.prepare(`
178
262
  SELECT f.entity_id, f.name, f.entity_type
179
263
  FROM entities_fts f
180
264
  JOIN entity_nodes e ON e.entity_id = f.entity_id
181
- WHERE entities_fts MATCH ? AND f.entity_type = ? AND e.vault = ?
265
+ WHERE entities_fts MATCH ? AND e.vault = ?
182
266
  LIMIT 20
183
- `).all(normalizedName.split(/\s+/).map(w => `"${w}"`).join(' OR '), type, vault) as typeof candidates;
267
+ `).all(normalizedName.split(/\s+/).map(w => `"${w}"`).join(' OR '), vault) as typeof candidates;
184
268
  } catch {
185
269
  // FTS5 match may fail on special chars — fall back to LIKE on entity_nodes directly
186
270
  candidates = db.prepare(`
187
271
  SELECT entity_id, name, entity_type
188
272
  FROM entity_nodes
189
- WHERE LOWER(name) LIKE ? AND entity_type = ? AND vault = ?
273
+ WHERE LOWER(name) LIKE ? AND vault = ?
190
274
  LIMIT 20
191
- `).all(`%${normalizedName}%`, type, vault) as typeof candidates;
275
+ `).all(`%${normalizedName}%`, vault) as typeof candidates;
192
276
  }
193
277
 
194
278
  if (candidates.length === 0) return null;
195
279
 
196
- // Step 2: Fuzzy rank candidates by Levenshtein similarity
280
+ // Step 2: Fuzzy rank candidates, filtering by bucket compatibility
197
281
  let bestMatch: { entity_id: string; score: number } | null = null;
198
282
  for (const candidate of candidates) {
283
+ // Reject cross-bucket matches (e.g., don't merge "Andrea" person with "Andrea" project)
284
+ if (getEntityBucket(candidate.entity_type) !== inputBucket) continue;
285
+
199
286
  const score = similarityRatio(normalizedName, candidate.name.toLowerCase());
200
- if (score >= threshold && (!bestMatch || score > bestMatch.score)) {
287
+ if (score >= effectiveThreshold && (!bestMatch || score > bestMatch.score)) {
201
288
  bestMatch = { entity_id: candidate.entity_id, score };
202
289
  }
203
290
  }
@@ -299,23 +386,74 @@ export function trackCoOccurrences(
299
386
  // Entity Enrichment Pipeline (called during A-MEM postIndexEnrich)
300
387
  // =============================================================================
301
388
 
389
+ /**
390
+ * Compute extraction input hash from title + body.
391
+ * Captures the actual input to the LLM prompt — changes to either trigger re-extraction.
392
+ */
393
+ function computeInputHash(title: string, body: string): string {
394
+ return createHash('sha256').update(title + '\0' + body).digest('hex');
395
+ }
396
+
397
+ /**
398
+ * Clear all derived entity state for a document:
399
+ * mentions, co-occurrence contributions, entity edges, and mention counts.
400
+ */
401
+ function clearDocEntityState(db: Database, docId: number): void {
402
+ // Get entity IDs this doc mentions (before deletion)
403
+ const oldMentions = db.prepare(
404
+ `SELECT entity_id FROM entity_mentions WHERE doc_id = ?`
405
+ ).all(docId) as { entity_id: string }[];
406
+
407
+ // Delete mentions
408
+ db.prepare(`DELETE FROM entity_mentions WHERE doc_id = ?`).run(docId);
409
+
410
+ // Decrement mention_count for each entity
411
+ for (const m of oldMentions) {
412
+ db.prepare(`
413
+ UPDATE entity_nodes SET mention_count = MAX(0, mention_count - 1) WHERE entity_id = ?
414
+ `).run(m.entity_id);
415
+ }
416
+
417
+ // Remove entity edges involving this doc
418
+ db.prepare(`
419
+ DELETE FROM memory_relations WHERE (source_id = ? OR target_id = ?) AND relation_type = 'entity'
420
+ `).run(docId, docId);
421
+
422
+ // Decrement co-occurrence counts for entity pairs from this doc
423
+ if (oldMentions.length >= 2) {
424
+ const ids = oldMentions.map(m => m.entity_id);
425
+ for (let i = 0; i < ids.length; i++) {
426
+ for (let j = i + 1; j < ids.length; j++) {
427
+ const sorted = [ids[i]!, ids[j]!].sort();
428
+ db.prepare(`
429
+ UPDATE entity_cooccurrences SET count = MAX(0, count - 1)
430
+ WHERE entity_a = ? AND entity_b = ?
431
+ `).run(sorted[0]!, sorted[1]!);
432
+ }
433
+ }
434
+ // Clean up zero-count rows
435
+ db.prepare(`DELETE FROM entity_cooccurrences WHERE count <= 0`).run();
436
+ }
437
+ }
438
+
302
439
  /**
303
440
  * Full entity enrichment for a document:
304
- * 1. Extract entities via LLM
305
- * 2. Resolve each to canonical form
306
- * 3. Record mentions
307
- * 4. Track co-occurrences
441
+ * 1. Check enrichment state (skip if input unchanged)
442
+ * 2. Extract entities via LLM
443
+ * 3. Resolve each to canonical form
444
+ * 4. Record mentions + co-occurrences + entity edges
445
+ * 5. Persist enrichment state for idempotency
308
446
  *
309
447
  * @returns Number of entities resolved
310
448
  */
311
449
  export async function enrichDocumentEntities(
312
450
  db: Database,
313
- llm: LlamaCpp,
451
+ llm: LLM,
314
452
  docId: number,
315
453
  vault: string = 'default'
316
454
  ): Promise<number> {
317
455
  try {
318
- // Get document content
456
+ // Get document content (snapshot for extraction)
319
457
  const doc = db.prepare(`
320
458
  SELECT d.title, c.doc as body
321
459
  FROM documents d
@@ -328,14 +466,34 @@ export async function enrichDocumentEntities(
328
466
  return 0;
329
467
  }
330
468
 
331
- // Step 1: Extract entities
469
+ // Compute extraction input hash (title + body — the actual LLM prompt input)
470
+ const inputHash = computeInputHash(doc.title, doc.body);
471
+
472
+ // Check enrichment state — skip if already enriched with same input
473
+ const existingState = db.prepare(
474
+ `SELECT input_hash FROM entity_enrichment_state WHERE doc_id = ?`
475
+ ).get(docId) as { input_hash: string } | undefined;
476
+
477
+ if (existingState?.input_hash === inputHash) {
478
+ return 0; // Same input, already enriched — skip
479
+ }
480
+
481
+ // Step 1: Extract entities via LLM
332
482
  const entities = await extractEntities(llm, doc.title, doc.body);
333
- if (entities.length === 0) {
334
- console.log(`[entity] No entities found in docId ${docId}`);
483
+
484
+ // Recheck input hash before writing — abort if content changed during LLM call
485
+ const recheckHash = db.prepare(`
486
+ SELECT d.title, c.doc as body FROM documents d
487
+ JOIN content c ON c.hash = d.hash WHERE d.id = ? AND d.active = 1
488
+ `).get(docId) as { title: string; body: string } | null;
489
+
490
+ if (!recheckHash || computeInputHash(recheckHash.title, recheckHash.body) !== inputHash) {
491
+ console.log(`[entity] Document ${docId} changed during extraction — aborting`);
335
492
  return 0;
336
493
  }
337
494
 
338
- // Step 2-3: Deduplicate entities by name+type, then resolve and record mentions
495
+ // Step 3: Deduplicate entities by surface form, then resolve canonical IDs
496
+ // Done BEFORE transaction to avoid calling upsertEntity (which mutates counters) for dupes
339
497
  const seenKeys = new Set<string>();
340
498
  const uniqueEntities: ExtractedEntity[] = [];
341
499
  for (const entity of entities) {
@@ -346,36 +504,149 @@ export async function enrichDocumentEntities(
346
504
  }
347
505
  }
348
506
 
349
- const resolvedIds: string[] = [];
507
+ // Resolve canonical IDs first (read-only lookups, no counter mutation yet)
508
+ const resolvedPairs: { entity: ExtractedEntity; canonicalId: string }[] = [];
509
+ const seenCanonicalIds = new Set<string>();
350
510
  for (const entity of uniqueEntities) {
351
- const entityId = upsertEntity(db, entity.name, entity.type, vault);
352
- resolvedIds.push(entityId);
353
- recordEntityMention(db, entityId, docId, entity.name);
511
+ const canonicalId = resolveEntityCanonical(db, entity.name, entity.type, vault)
512
+ || makeEntityId(entity.name, entity.type, vault);
513
+ if (!seenCanonicalIds.has(canonicalId)) {
514
+ seenCanonicalIds.add(canonicalId);
515
+ resolvedPairs.push({ entity, canonicalId });
516
+ }
354
517
  }
355
518
 
356
- // Step 4: Track co-occurrences (deduplicated IDs prevent inflated pair counts)
357
- trackCoOccurrences(db, resolvedIds);
519
+ // All writes in a transaction partial failure rolls back cleanly
520
+ try {
521
+ db.exec("BEGIN");
358
522
 
359
- // Step 5: Create entity edges in memory_relations
360
- for (const entityId of resolvedIds) {
361
- // Find other documents mentioning this entity
362
- const otherDocs = db.prepare(`
363
- SELECT doc_id FROM entity_mentions
364
- WHERE entity_id = ? AND doc_id != ?
365
- LIMIT 10
366
- `).all(entityId, docId) as { doc_id: number }[];
523
+ // Re-check enrichment state inside transaction (prevents concurrent overcount)
524
+ const txState = db.prepare(
525
+ `SELECT input_hash FROM entity_enrichment_state WHERE doc_id = ?`
526
+ ).get(docId) as { input_hash: string } | undefined;
527
+
528
+ if (txState?.input_hash === inputHash) {
529
+ db.exec("ROLLBACK");
530
+ return 0; // Another worker already committed this exact enrichment
531
+ }
532
+
533
+ // Clear old derived state if re-enriching (content changed or state was externally wiped)
534
+ const hasOldMentions = db.prepare(
535
+ `SELECT 1 FROM entity_mentions WHERE doc_id = ? LIMIT 1`
536
+ ).get(docId);
537
+ if (txState || existingState || hasOldMentions) {
538
+ clearDocEntityState(db, docId);
539
+ }
540
+
541
+ if (entities.length === 0) {
542
+ db.prepare(`
543
+ INSERT OR REPLACE INTO entity_enrichment_state (doc_id, input_hash, enriched_at)
544
+ VALUES (?, ?, datetime('now'))
545
+ `).run(docId, inputHash);
546
+ db.exec("COMMIT");
547
+ console.log(`[entity] No entities found in docId ${docId}`);
548
+ return 0;
549
+ }
550
+
551
+ // Mutate counters using precomputed canonical IDs (no redundant re-resolution)
552
+ const resolvedIds: string[] = [];
553
+ for (const { entity, canonicalId } of resolvedPairs) {
554
+ // Check if canonical entity already exists
555
+ const existing = db.prepare(
556
+ `SELECT entity_id FROM entity_nodes WHERE entity_id = ?`
557
+ ).get(canonicalId) as { entity_id: string } | undefined;
558
+
559
+ if (existing) {
560
+ // Existing canonical — increment count
561
+ db.prepare(`
562
+ UPDATE entity_nodes SET mention_count = mention_count + 1, last_seen = datetime('now')
563
+ WHERE entity_id = ?
564
+ `).run(canonicalId);
565
+ } else {
566
+ // New entity — insert
567
+ db.prepare(`
568
+ INSERT OR IGNORE INTO entity_nodes (entity_id, entity_type, name, description, created_at, mention_count, last_seen, vault)
569
+ VALUES (?, ?, ?, NULL, datetime('now'), 1, datetime('now'), ?)
570
+ `).run(canonicalId, entity.type, entity.name, vault);
571
+ try {
572
+ db.prepare(`
573
+ INSERT OR IGNORE INTO entities_fts (entity_id, name, entity_type)
574
+ VALUES (?, ?, ?)
575
+ `).run(canonicalId, entity.name.toLowerCase(), entity.type);
576
+ } catch { /* FTS insert non-fatal */ }
577
+ }
578
+
579
+ resolvedIds.push(canonicalId);
580
+ recordEntityMention(db, canonicalId, docId, entity.name);
581
+ }
582
+
583
+ // Step 4: Track co-occurrences (deduplicate resolvedIds to prevent self-pairs)
584
+ const uniqueResolvedIds = [...new Set(resolvedIds)];
585
+ trackCoOccurrences(db, uniqueResolvedIds);
586
+
587
+ // Step 5: Create entity edges with IDF-based specificity scoring
588
+ // Rare entities justify edges; ubiquitous entities alone cannot
589
+ const totalDocs = (db.prepare(`SELECT COUNT(*) as cnt FROM documents WHERE active = 1`).get() as { cnt: number }).cnt;
590
+
591
+ // Collect candidate target docs and their shared entities
592
+ const targetEntityMap = new Map<number, string[]>(); // docId → [entityIds]
593
+ for (const entityId of resolvedIds) {
594
+ const otherDocs = db.prepare(`
595
+ SELECT doc_id FROM entity_mentions
596
+ WHERE entity_id = ? AND doc_id != ?
597
+ LIMIT 20
598
+ `).all(entityId, docId) as { doc_id: number }[];
599
+
600
+ for (const other of otherDocs) {
601
+ const existing = targetEntityMap.get(other.doc_id) || [];
602
+ existing.push(entityId);
603
+ targetEntityMap.set(other.doc_id, existing);
604
+ }
605
+ }
606
+
607
+ // Compute IDF per entity (cache for this enrichment)
608
+ const entityIdf = new Map<string, number>();
609
+ for (const entityId of resolvedIds) {
610
+ if (!entityIdf.has(entityId)) {
611
+ const docFreq = (db.prepare(
612
+ `SELECT COUNT(DISTINCT doc_id) as cnt FROM entity_mentions WHERE entity_id = ?`
613
+ ).get(entityId) as { cnt: number }).cnt;
614
+ entityIdf.set(entityId, Math.log((totalDocs + 1) / (docFreq + 1)));
615
+ }
616
+ }
617
+
618
+ // Create edges only when max entity IDF exceeds threshold
619
+ const idfThreshold = 3.0; // ln-based: filters entities in >5% of docs (e.g., 13+ docs in 262-doc corpus)
620
+ for (const [targetDocId, sharedEntities] of targetEntityMap) {
621
+ const maxIdf = Math.max(...sharedEntities.map(eid => entityIdf.get(eid) || 0));
622
+ if (maxIdf < idfThreshold) continue; // Skip — only ubiquitous entities shared
623
+
624
+ // Weight: IDF specificity + shared-count bonus (multi-entity overlap outranks single)
625
+ const sharedBonus = Math.min(0.15, 0.05 * (sharedEntities.length - 1));
626
+ const weight = Math.min(1.0, 0.3 + 0.12 * maxIdf + sharedBonus);
627
+ const bestEntity = sharedEntities.reduce((best, eid) =>
628
+ (entityIdf.get(eid) || 0) > (entityIdf.get(best) || 0) ? eid : best
629
+ );
367
630
 
368
- for (const other of otherDocs) {
369
- // Insert entity relation (unidirectional; graph traversal handles inbound for entity/semantic types)
370
631
  db.prepare(`
371
632
  INSERT OR IGNORE INTO memory_relations (source_id, target_id, relation_type, weight, metadata, created_at)
372
- VALUES (?, ?, 'entity', 0.7, ?, datetime('now'))
373
- `).run(docId, other.doc_id, JSON.stringify({ entity: entityId }));
633
+ VALUES (?, ?, 'entity', ?, ?, datetime('now'))
634
+ `).run(docId, targetDocId, weight, JSON.stringify({ entity: bestEntity, shared: sharedEntities.length }));
374
635
  }
375
- }
376
636
 
377
- console.log(`[entity] Enriched docId ${docId}: ${resolvedIds.length} entities, ${entities.length} extracted`);
378
- return resolvedIds.length;
637
+ // Persist enrichment state LAST only after all derived data written
638
+ db.prepare(`
639
+ INSERT OR REPLACE INTO entity_enrichment_state (doc_id, input_hash, enriched_at)
640
+ VALUES (?, ?, datetime('now'))
641
+ `).run(docId, inputHash);
642
+
643
+ db.exec("COMMIT");
644
+ console.log(`[entity] Enriched docId ${docId}: ${resolvedIds.length} entities, ${entities.length} extracted`);
645
+ return resolvedIds.length;
646
+ } catch (txErr) {
647
+ try { db.exec("ROLLBACK"); } catch { /* already rolled back */ }
648
+ throw txErr; // re-throw to outer catch
649
+ }
379
650
  } catch (err) {
380
651
  console.log(`[entity] Error enriching docId ${docId}:`, err);
381
652
  return 0;
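
To make the new edge scoring concrete, here is the weight computation from `enrichDocumentEntities` above applied to made-up numbers (the corpus size and document frequencies are hypothetical):

```ts
// Mirrors the constants added above: idfThreshold = 3.0 and
// weight = min(1.0, 0.3 + 0.12 * maxIdf + min(0.15, 0.05 * (shared - 1))).
const totalDocs = 262;                       // hypothetical corpus size
const idf = (docFreq: number) => Math.log((totalDocs + 1) / (docFreq + 1));

const rare = idf(3);        // ≈ 4.19 — an entity in 3 docs clears the 3.0 threshold, so it can justify an edge
const ubiquitous = idf(40); // ≈ 1.86 — an entity in 40 docs cannot create an edge on its own

// Two docs sharing that rare entity plus one other entity (shared = 2):
const sharedBonus = Math.min(0.15, 0.05 * (2 - 1));            // 0.05
const weight = Math.min(1.0, 0.3 + 0.12 * rare + sharedBonus); // ≈ 0.85
console.log({ rare, ubiquitous, weight });
```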
package/src/intent.ts CHANGED
@@ -10,7 +10,7 @@
10
10
 
11
11
  import type { Database } from "bun:sqlite";
12
12
  import { createHash } from "crypto";
13
- import type { LlamaCpp } from "./llm.ts";
13
+ import type { LLM } from "./llm.ts";
14
14
 
15
15
  export type IntentType = 'WHY' | 'WHEN' | 'ENTITY' | 'WHAT';
16
16
 
@@ -179,7 +179,7 @@ function classifyIntentHeuristic(query: string): IntentResult {
179
179
  */
180
180
  export async function classifyIntent(
181
181
  query: string,
182
- llm: LlamaCpp,
182
+ llm: LLM,
183
183
  db: Database
184
184
  ): Promise<IntentResult> {
185
185
  // Check cache first (1 hour TTL)
@@ -268,7 +268,7 @@ export type QueryClause = {
268
268
  */
269
269
  export async function decomposeQuery(
270
270
  query: string,
271
- llm: LlamaCpp,
271
+ llm: LLM,
272
272
  db: Database,
273
273
  sessionContext?: string
274
274
  ): Promise<QueryClause[]> {
package/src/store.ts CHANGED
@@ -711,6 +711,16 @@ function initializeDatabase(db: Database): void {
711
711
  // Entity FTS5 for fuzzy name lookup
712
712
  db.exec(`CREATE VIRTUAL TABLE IF NOT EXISTS entities_fts USING fts5(entity_id, name, entity_type)`);
713
713
 
714
+ // Entity enrichment state: tracks what input was used for extraction (idempotent --enrich)
715
+ db.exec(`
716
+ CREATE TABLE IF NOT EXISTS entity_enrichment_state (
717
+ doc_id INTEGER PRIMARY KEY,
718
+ input_hash TEXT NOT NULL,
719
+ enriched_at TEXT NOT NULL,
720
+ FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
721
+ )
722
+ `);
723
+
714
724
  // 3-tier consolidation: observations synthesized from clusters of related facts
715
725
  db.exec(`
716
726
  CREATE TABLE IF NOT EXISTS consolidated_observations (
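
A practical consequence of the new `entity_enrichment_state` table is that a re-run of enrichment can cheaply restrict itself to documents with no enrichment row, relying on the per-document `input_hash` check inside `enrichDocumentEntities` to skip everything else. A minimal sketch against the schema above; the `enrichPending` helper and its lack of batching are assumptions, while the table and column names come from this diff:

```ts
import { Database } from "bun:sqlite";
import { enrichDocumentEntities } from "./entity.ts";
import type { LLM } from "./llm.ts";

// Active documents that have never been enriched; documents whose content changed
// are still caught by the input_hash comparison inside enrichDocumentEntities.
async function enrichPending(db: Database, llm: LLM): Promise<void> {
  const pending = db.prepare(`
    SELECT d.id FROM documents d
    LEFT JOIN entity_enrichment_state s ON s.doc_id = d.id
    WHERE d.active = 1 AND s.doc_id IS NULL
  `).all() as { id: number }[];

  for (const { id } of pending) {
    await enrichDocumentEntities(db, llm, id); // hash recheck + transaction handled inside
  }
}
```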