@geravant/sinain 1.13.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/src/buffers/feed-buffer.ts +6 -4
  6. package/sinain-core/src/index.ts +50 -19
  7. package/sinain-memory/graph_query.py +12 -3
  8. package/sinain-memory/knowledge_integrator.py +194 -10
  9. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  10. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  11. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  12. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  13. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  14. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  15. package/sinain-memory/eval/__init__.py +0 -0
  16. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  17. package/sinain-memory/eval/assertions.py +0 -267
  18. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  19. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  20. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  21. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  22. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  31. package/sinain-memory/eval/benchmarks/config.py +0 -23
  32. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  33. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  34. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  35. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  38. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  39. package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
  40. package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
  41. package/sinain-memory/eval/benchmarks/query.py +0 -193
  42. package/sinain-memory/eval/benchmarks/report.py +0 -87
  43. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
  44. package/sinain-memory/eval/benchmarks/runner.py +0 -283
  45. package/sinain-memory/eval/judges/__init__.py +0 -0
  46. package/sinain-memory/eval/judges/base_judge.py +0 -61
  47. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  48. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  49. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  50. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  51. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  52. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  53. package/sinain-memory/eval/schemas.py +0 -247
  54. package/sinain-memory/tests/__init__.py +0 -0
  55. package/sinain-memory/tests/conftest.py +0 -189
  56. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  57. package/sinain-memory/tests/test_embedder.py +0 -210
  58. package/sinain-memory/tests/test_extract_json.py +0 -124
  59. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  60. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  61. package/sinain-memory/tests/test_module_management.py +0 -458
  62. package/sinain-memory/tests/test_parsers.py +0 -96
  63. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  64. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  65. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  66. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  67. package/sinain-memory/tests/test_triplestore.py +0 -248
package/.env.example CHANGED
@@ -23,9 +23,11 @@ PRIVACY_MODE=standard # off | standard | strict | paranoid
23
23
  # paranoid: almost nothing leaves your machine
24
24
 
25
25
  # ── Agent ────────────────────────────────────────────────────────────────────
26
- SINAIN_AGENT=claude # claude | codex | junie | goose | aider | <custom command>
27
- # MCP agents (claude, codex, junie, goose) call sinain tools directly
26
+ SINAIN_AGENT=claude # claude | openclaude | codex | junie | goose | aider | <custom command>
27
+ # MCP agents (claude, openclaude, codex, junie, goose) call sinain tools directly
28
28
  # Pipe agents (aider, custom) receive escalation text on stdin
29
+ # openclaude: set OPENAI_BASE_URL=http://localhost:11434/v1 + OPENAI_MODEL=<ollama-model>
30
+ # to route through local Ollama. Run.sh auto-warms the model on startup.
29
31
  SINAIN_CORE_URL=http://localhost:9500
30
32
  SINAIN_POLL_INTERVAL=5 # seconds between escalation polls
31
33
  SINAIN_HEARTBEAT_INTERVAL=900 # seconds between heartbeat ticks (15 min)
package/config-shared.js CHANGED
@@ -459,6 +459,7 @@ export async function stepAgent(existing, label = "Bare agent") {
459
459
  message: label,
460
460
  options: [
461
461
  { value: "claude", label: "Claude Code", hint: "Calls sinain tools directly — recommended" },
462
+ { value: "openclaude", label: "OpenClaude", hint: "Claude Code clone, local-first (Ollama/OpenAI-compat)" },
462
463
  { value: "codex", label: "Codex", hint: "Calls sinain tools directly" },
463
464
  { value: "goose", label: "Goose", hint: "Calls sinain tools directly" },
464
465
  { value: "junie", label: "Junie", hint: "JetBrains IDE agent" },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@geravant/sinain",
3
- "version": "1.13.0",
3
+ "version": "1.14.0",
4
4
  "description": "Ambient intelligence that sees what you see, hears what you hear, and acts on your behalf",
5
5
  "type": "module",
6
6
  "bin": {
@@ -25,6 +25,9 @@
25
25
  "index.ts",
26
26
  "openclaw.plugin.json",
27
27
  "sinain-memory",
28
+ "!sinain-memory/eval",
29
+ "!sinain-memory/tests",
30
+ "!sinain-memory/**/__pycache__",
28
31
  "sinain-knowledge",
29
32
  "sinain-core/src",
30
33
  "sinain-core/package.json",
package/sinain-agent/run.sh CHANGED
@@ -55,7 +55,7 @@ fi
55
55
  JUNIE_HAS_MCP=false # set during startup checks
56
56
  agent_has_mcp() {
57
57
  case "$AGENT" in
58
- claude|codex|goose) return 0 ;;
58
+ claude|openclaude|codex|goose) return 0 ;;
59
59
  junie) $JUNIE_HAS_MCP ;;
60
60
  *) return 1 ;;
61
61
  esac
@@ -67,11 +67,11 @@ agent_has_mcp() {
67
67
  invoke_agent() {
68
68
  local prompt="$1"
69
69
  case "$AGENT" in
70
- claude)
70
+ claude|openclaude)
71
71
  local turns="${2:-$AGENT_MAX_TURNS}"
72
72
  if [ -n "${SINAIN_SPAWN:-}" ]; then
73
73
  # Spawn: PreToolUse hook routes permission prompts to overlay HUD
74
- claude \
74
+ "$AGENT" \
75
75
  --mcp-config "$MCP_CONFIG" \
76
76
  --settings "$SCRIPT_DIR/.claude/settings.json" \
77
77
  ${ALLOWED_TOOLS:+--allowedTools $ALLOWED_TOOLS} \
@@ -79,7 +79,7 @@ invoke_agent() {
79
79
  -p "$prompt"
80
80
  else
81
81
  # Escalation: auto-approve for speed (short-lived, read-heavy)
82
- claude --enable-auto-mode \
82
+ "$AGENT" --enable-auto-mode \
83
83
  --mcp-config "$MCP_CONFIG" \
84
84
  ${ALLOWED_TOOLS:+--allowedTools $ALLOWED_TOOLS} \
85
85
  --max-turns "$turns" --output-format text \
@@ -227,6 +227,38 @@ print(' sinain extension added to ' + config_path)
227
227
  fi
228
228
  fi
229
229
 
230
+ # Ollama warmup — pin the backing model so each agent invocation hits hot weights.
231
+ # openclaude + Ollama via the OpenAI-compat endpoint does NOT forward keep_alive,
232
+ # so we ping Ollama's native /api/generate once with keep_alive=-1 (persistent).
233
+ # Applies to any agent pointed at an Ollama-compatible endpoint via OPENAI_BASE_URL.
234
+ OLLAMA_WARMUP="${OLLAMA_WARMUP:-true}"
235
+ if [ "$OLLAMA_WARMUP" = "true" ] && [ -n "${OPENAI_BASE_URL:-}" ]; then
236
+ if [[ "$OPENAI_BASE_URL" == *"11434"* ]] || [[ "$OPENAI_BASE_URL" == *"ollama"* ]]; then
237
+ # Derive Ollama host by stripping /v1 suffix from OPENAI_BASE_URL
238
+ OLLAMA_HOST="${OLLAMA_HOST:-${OPENAI_BASE_URL%/v1*}}"
239
+ OLLAMA_MODEL="${OLLAMA_MODEL:-${OPENAI_MODEL:-}}"
240
+ OLLAMA_KEEP_ALIVE="${OLLAMA_KEEP_ALIVE:--1}" # -1 = persistent, or "24h", "30m", etc.
241
+ if [ -n "$OLLAMA_MODEL" ]; then
242
+ echo "Warming Ollama model $OLLAMA_MODEL at $OLLAMA_HOST (keep_alive=$OLLAMA_KEEP_ALIVE)..."
243
+ # Ollama accepts keep_alive as int (-1 = persistent) or duration string ("24h", "30m").
244
+ if [[ "$OLLAMA_KEEP_ALIVE" =~ ^-?[0-9]+$ ]]; then
245
+ WARMUP_PAYLOAD="{\"model\":\"$OLLAMA_MODEL\",\"prompt\":\"\",\"keep_alive\":$OLLAMA_KEEP_ALIVE,\"stream\":false}"
246
+ else
247
+ WARMUP_PAYLOAD="{\"model\":\"$OLLAMA_MODEL\",\"prompt\":\"\",\"keep_alive\":\"$OLLAMA_KEEP_ALIVE\",\"stream\":false}"
248
+ fi
249
+ if curl -sf -m 60 -X POST "$OLLAMA_HOST/api/generate" \
250
+ -H 'Content-Type: application/json' \
251
+ -d "$WARMUP_PAYLOAD" >/dev/null 2>&1; then
252
+ echo " ✓ Model pinned in memory"
253
+ else
254
+ echo " ⚠ Warmup failed — first request will cold-start the model"
255
+ fi
256
+ else
257
+ echo " ⚠ OLLAMA_WARMUP=true but OPENAI_MODEL not set — skipping warmup"
258
+ fi
259
+ fi
260
+ fi
261
+
230
262
  # Agent mode label
231
263
  if agent_has_mcp; then
232
264
  AGENT_MODE="MCP"
package/sinain-core/src/buffers/feed-buffer.ts CHANGED
@@ -48,12 +48,14 @@ export class FeedBuffer {
48
48
  this.items.push(item);
49
49
  if (this.items.length > this._hwm) this._hwm = this.items.length;
50
50
 
51
- // Fire onFull when buffer is at capacity AND enough new items have arrived
52
- // since the last distillation (at least half the buffer replaced)
51
+ // Fire when enough new items have arrived since last distillation.
52
+ // 20 items ≈ 1.7 min of audio at ~12 items/min transcription rate.
53
+ // Distillation takes ~7s, so 20-item threshold gives 100s gap — safe margin.
54
+ // This means ~35 passes/hour, leaving <20 items undistilled at shutdown.
53
55
  const newSinceRearm = this._version - this._onFullVersion;
54
- if (this.items.length >= this.maxSize
56
+ if (this.items.length >= 20
55
57
  && this._onFullCb && this._onFullArmed
56
- && newSinceRearm >= Math.floor(this.maxSize / 2)) {
58
+ && newSinceRearm >= 20) {
57
59
  this._onFullArmed = false;
58
60
  const snapshot = [...this.items];
59
61
  queueMicrotask(() => this._onFullCb!(snapshot));
package/sinain-core/src/index.ts CHANGED
@@ -67,35 +67,66 @@ async function queryKnowledgeFactsMulti(entities: string[], maxFacts: number): P
67
67
  ];
68
68
  const scriptPath = scriptCandidates.find(p => existsSync(p)) || scriptCandidates[0];
69
69
 
70
- const results: string[] = [];
70
+ // Step 1: Get candidates from Python (RRF-ranked, no embedding — avoids deadlock)
71
+ // Request 2x candidates in JSON for re-ranking in Node.js
72
+ const candidateFacts: Array<Record<string, string>> = [];
71
73
  for (const dbPath of dbPaths) {
72
74
  if (!existsSync(dbPath)) continue;
73
75
  try {
74
- const args = [scriptPath, "--db", dbPath, "--max-facts", String(maxFacts), "--format", "compact"];
76
+ const args = [scriptPath, "--db", dbPath, "--max-facts", String(maxFacts * 2), "--format", "json"];
75
77
  if (entities.length > 0) args.push("--entities", JSON.stringify(entities));
76
78
  const out = execFileSync("python3", args, { timeout: 5000, encoding: "utf-8" }).trim();
77
- if (out) results.push(out);
79
+ if (out) {
80
+ const parsed = JSON.parse(out);
81
+ const facts = parsed.facts || parsed;
82
+ if (Array.isArray(facts)) candidateFacts.push(...facts);
83
+ }
78
84
  } catch { /* skip failed db */ }
79
85
  }
80
86
 
81
- if (results.length === 0) return "";
82
- if (results.length === 1) return results[0];
87
+ if (candidateFacts.length === 0) return "";
83
88
 
84
- // Merge and deduplicate lines from both sources
85
- const seen = new Set<string>();
86
- const merged: string[] = [];
87
- for (const block of results) {
88
- for (const line of block.split("\n")) {
89
- const key = line.replace(/\(confidence:.*$/, "").trim();
90
- if (key && !seen.has(key)) {
91
- seen.add(key);
92
- merged.push(line);
93
- }
89
+ // Step 2: Re-rank by embedding similarity in-process (no deadlock — model is in this process)
90
+ const queryText = entities.join(" ");
91
+ try {
92
+ if (embeddingService?.ready) {
93
+ const allTexts = [queryText, ...candidateFacts.map(f => f.value || "")];
94
+ const embeddings = await embeddingService.embed(allTexts);
95
+ const queryEmb = embeddings[0];
96
+ const scored = candidateFacts.map((f, i) => ({
97
+ fact: f,
98
+ sim: EmbeddingService.cosine(queryEmb, embeddings[i + 1]),
99
+ }));
100
+ scored.sort((a, b) => b.sim - a.sim);
101
+ candidateFacts.length = 0;
102
+ candidateFacts.push(...scored.slice(0, maxFacts).map(s => s.fact));
94
103
  }
104
+ } catch { /* embedding unavailable — use RRF order */ }
105
+
106
+ // Step 3: Format as compact text
107
+ const seen = new Set<string>();
108
+ const lines: string[] = [];
109
+ let total = 0;
110
+ const maxChars = 1200;
111
+ for (const f of candidateFacts.slice(0, maxFacts)) {
112
+ const eid = ((f as any).entity_id || (f as any).entityId || "").split(":").pop()?.slice(0, 20) || "?";
113
+ const value = (f as any).value || "";
114
+ const conf = (f as any).confidence || "?";
115
+ const count = (f as any).reinforce_count || "1";
116
+ const line = `${eid}: ${value} (${conf},${count}x)`;
117
+ const key = value.slice(0, 60);
118
+ if (seen.has(key)) continue;
119
+ seen.add(key);
120
+ if (total + line.length + 2 > maxChars) break;
121
+ lines.push(line);
122
+ total += line.length + 2;
95
123
  }
96
- return merged.slice(0, maxFacts).join("\n");
124
+ return lines.join("; ");
97
125
  }
98
126
 
127
+ // Reference to embedding service — set during init
128
+ let embeddingService: import("./embedding/service.js").EmbeddingService | null = null;
129
+
99
130
  /** List all entities from both local and workspace knowledge graphs. */
100
131
  async function listKnowledgeEntitiesMulti(max: number): Promise<string> {
101
132
  const { execFileSync } = await import("node:child_process");
@@ -340,7 +371,7 @@ async function main() {
340
371
  : null;
341
372
 
342
373
  // ── Initialize embedding service (non-blocking) ──
343
- const embeddingService = new EmbeddingService();
374
+ embeddingService = new EmbeddingService();
344
375
  embeddingService.loadAsync(); // ~9s background load, server starts immediately
345
376
 
346
377
  // ── Initialize local knowledge pipeline ──
@@ -683,8 +714,8 @@ async function main() {
683
714
  },
684
715
  getSpawnPending: () => escalator.getSpawnPending(),
685
716
  respondSpawn: (id: string, result: string) => escalator.respondSpawn(id, result),
686
- embedTexts: (texts: string[]) => embeddingService.embed(texts),
687
- isEmbeddingReady: () => embeddingService.ready,
717
+ embedTexts: (texts: string[]) => embeddingService!.embed(texts),
718
+ isEmbeddingReady: () => embeddingService?.ready ?? false,
688
719
  });
689
720
 
690
721
  // ── Wire overlay profiling ──
package/sinain-memory/graph_query.py CHANGED
@@ -330,6 +330,10 @@ def query_facts_hybrid(
330
330
  if eid and eid not in fact_map:
331
331
  fact_map[eid] = f
332
332
 
333
+ # Return top RRF candidates. Embedding re-ranking is done by the caller
334
+ # (sinain-core Node.js) to avoid deadlock — the Python subprocess can't call
335
+ # back to sinain-core's /embed endpoint while sinain-core is blocked waiting
336
+ # for the subprocess.
333
337
  results = [fact_map[eid] for eid in sorted_ids[:max_facts] if eid in fact_map]
334
338
 
335
339
  # Expand top results with 1-hop graph neighbors
@@ -396,7 +400,7 @@ def format_facts_text(facts: list[dict], max_chars: int = 500) -> str:
396
400
  return "\n".join(lines)
397
401
 
398
402
 
399
- def format_facts_compact(facts: list[dict], max_chars: int = 400) -> str:
403
+ def format_facts_compact(facts: list[dict], max_chars: int = 1200) -> str:
400
404
  """Encode facts for efficient escalation context injection.
401
405
 
402
406
  Compact format: domain/entity: value (conf, Nx)
@@ -409,7 +413,7 @@ def format_facts_compact(facts: list[dict], max_chars: int = 400) -> str:
409
413
  total = 0
410
414
  for f in facts:
411
415
  entity = f.get("entityId", "").split(":")[-1][:20]
412
- value = f.get("value", "")[:60]
416
+ value = f.get("value", "")
413
417
  conf = f.get("confidence", "?")
414
418
  count = f.get("reinforce_count", "1")
415
419
  domain = f.get("domain", "")
@@ -469,7 +473,12 @@ def main() -> None:
469
473
  facts = query_top_facts(args.db, limit=args.top)
470
474
  elif args.entities:
471
475
  entities = json.loads(args.entities)
472
- facts = query_facts_by_entities(args.db, entities, max_facts=args.max_facts)
476
+ # Use hybrid retrieval (FTS5 + tags + entity graph + RRF) for best results
477
+ query_text = " ".join(entities)
478
+ facts = query_facts_hybrid(args.db, query_text, max_facts=args.max_facts)
479
+ # Fallback to tag-only if hybrid returns nothing
480
+ if not facts:
481
+ facts = query_facts_by_entities(args.db, entities, max_facts=args.max_facts)
473
482
  else:
474
483
  facts = query_top_facts(args.db, limit=args.max_facts)
475
484
 
package/sinain-memory/knowledge_integrator.py CHANGED
@@ -21,7 +21,9 @@ import json
21
21
  import re
22
22
  import shutil
23
23
  import sys
24
+ import unicodedata
24
25
  from datetime import datetime, timezone
26
+ from difflib import SequenceMatcher
25
27
  from pathlib import Path
26
28
 
27
29
  from common import (
@@ -121,9 +123,50 @@ def _fact_id(entity: str, attribute: str, value: str) -> str:
121
123
  return f"fact:{slug}-{h}"
122
124
 
123
125
 
126
+ _UNICODE_PRE_MAP = str.maketrans({"ß": "ss", "ẞ": "SS"})
127
+
128
+
124
129
  def _normalize_entity(name: str) -> str:
125
- """Normalize entity name to canonical form: lowercase, hyphenated, no punctuation."""
126
- return re.sub(r"[^a-z0-9-]", "", name.lower().replace(" ", "-").replace("_", "-"))
130
+ """Normalize entity name to canonical form: lowercase, hyphenated, ASCII-transliterated."""
131
+ s = name.translate(_UNICODE_PRE_MAP)
132
+ s = unicodedata.normalize("NFKD", s)
133
+ s = s.encode("ascii", "ignore").decode("ascii")
134
+ s = s.lower().replace(" ", "-").replace("_", "-")
135
+ s = re.sub(r"[^a-z0-9-]", "", s)
136
+ s = re.sub(r"-{2,}", "-", s)
137
+ return s.strip("-")
138
+
139
+
140
+ def _find_matching_entity(
141
+ name: str,
142
+ existing_names: dict[str, str],
143
+ ) -> str | None:
144
+ """Find an existing entity that fuzzy-matches `name`. Returns entity_node_id or None."""
145
+ if name in existing_names:
146
+ return existing_names[name]
147
+
148
+ # Hyphen-insensitive exact match (chatgpt == chat-gpt)
149
+ name_compact = name.replace("-", "")
150
+ for existing_name, node_id in existing_names.items():
151
+ if existing_name.replace("-", "") == name_compact:
152
+ return node_id
153
+
154
+ # Edit-distance fuzzy match
155
+ if len(name) < 3:
156
+ return None
157
+ threshold = 0.90
158
+ best_match = None
159
+ best_ratio = threshold
160
+ for existing_name, node_id in existing_names.items():
161
+ if len(existing_name) < 3:
162
+ continue
163
+ if frozenset({name, existing_name}) in _DEDUP_SKIP_PAIRS:
164
+ continue
165
+ ratio = SequenceMatcher(None, name, existing_name).ratio()
166
+ if ratio >= best_ratio:
167
+ best_ratio = ratio
168
+ best_match = node_id
169
+ return best_match
127
170
 
128
171
 
129
172
  def _canonicalize_ops(ops: list[dict], existing_entities: list[str], existing_facts: list[dict]) -> list[dict]:
@@ -528,7 +571,14 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
528
571
  # --- Build entity graph layer (two-layer model) ---
529
572
  if digest_entities and stats["asserted"] > 0:
530
573
  try:
531
- # Create entity:* nodes from digest entities
574
+ # Load existing entity names for fuzzy matching
575
+ all_entity_nodes: dict[str, str] = {} # {name: entity_node_id}
576
+ for r in store.entities_with_attr("name"):
577
+ if r[0].startswith("entity:"):
578
+ all_entity_nodes[r[1]] = r[0]
579
+
580
+ # Create entity:* nodes from digest entities (with fuzzy dedup)
581
+ entity_resolve: dict[str, str] = {} # {normalized_name: resolved_node_id}
532
582
  for ent in (digest_entities or []):
533
583
  if isinstance(ent, dict):
534
584
  ename = _normalize_entity(ent.get("name", ""))
@@ -539,12 +589,22 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
539
589
  if not ename or len(ename) < 2:
540
590
  continue
541
591
 
592
+ # Check for fuzzy match against existing entities
593
+ matched_id = _find_matching_entity(ename, all_entity_nodes)
594
+ if matched_id:
595
+ entity_resolve[ename] = matched_id
596
+ if matched_id != f"entity:{ename}":
597
+ print(f" [graph] alias: \"{ename}\" → {matched_id}", file=sys.stderr)
598
+ continue
599
+
542
600
  entity_node_id = f"entity:{ename}"
543
601
  existing = store.entity(entity_node_id)
544
602
  if not existing:
545
603
  tx = store.begin_tx("entity_graph")
546
604
  store.assert_triple(tx, entity_node_id, "name", ename)
547
605
  store.assert_triple(tx, entity_node_id, "type", etype)
606
+ all_entity_nodes[ename] = entity_node_id
607
+ entity_resolve[ename] = entity_node_id
548
608
 
549
609
  # Link facts to their entity nodes via "about" ref edges
550
610
  for op_data in ops:
@@ -554,18 +614,13 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
554
614
  value = op_data.get("value", "")
555
615
  attribute = op_data.get("attribute", "")
556
616
  fact_eid = _fact_id(entity, attribute, value)
557
- entity_node_id = f"entity:{_normalize_entity(entity)}"
617
+ norm_entity = _normalize_entity(entity)
618
+ entity_node_id = entity_resolve.get(norm_entity, f"entity:{norm_entity}")
558
619
  # Only link if entity node exists
559
620
  if store.entity(entity_node_id):
560
621
  tx = store.begin_tx("entity_graph")
561
622
  store.assert_triple(tx, fact_eid, "about", entity_node_id, value_type="ref")
562
623
 
563
- # Infer cross-entity refs from fact content
564
- all_entity_nodes = {}
565
- for r in store.entities_with_attr("name"):
566
- if r[0].startswith("entity:"):
567
- all_entity_nodes[r[1]] = r[0] # {name: entity_id}
568
-
569
624
  ref_count = 0
570
625
  for fact_eid_row in store.entities_with_attr("value"):
571
626
  fact_eid = fact_eid_row[0]
@@ -695,17 +750,146 @@ def _bootstrap_graph(memory_dir: str, db_path: str) -> dict:
695
750
  return {"bootstrapped": stats.get("asserted", 0)}
696
751
 
697
752
 
753
+ # Pairs that fuzzy matching incorrectly clusters — reviewed and confirmed distinct.
754
+ _DEDUP_SKIP_PAIRS = {
755
+ frozenset({"ai-driven-development", "spac-driven-development"}),
756
+ frozenset({"german", "germany"}),
757
+ frozenset({"llama", "ollama"}),
758
+ frozenset({"gemma", "gemma4"}),
759
+ }
760
+
761
+
762
+ def merge_entity_duplicates(db_path: str, dry_run: bool = True) -> dict:
763
+ """Merge fragmented entity nodes using fuzzy matching.
764
+
765
+ Idempotent: checks for migration:entity-dedup-v1 stamp.
766
+ """
767
+ from triplestore import TripleStore
768
+ store = TripleStore(db_path)
769
+
770
+ # Idempotency check
771
+ stamp = store.entity("migration:entity-dedup-v1")
772
+ if stamp:
773
+ print("migration:entity-dedup-v1 already applied — skipping", file=sys.stderr)
774
+ return {"status": "already_applied"}
775
+
776
+ # Load all entity nodes
777
+ all_entities: dict[str, str] = {} # {name: entity_node_id}
778
+ for entity_id, name in store.entities_with_attr("name"):
779
+ if entity_id.startswith("entity:"):
780
+ all_entities[name] = entity_id
781
+
782
+ print(f"Total entity nodes: {len(all_entities)}", file=sys.stderr)
783
+
784
+ # Build clusters via greedy matching
785
+ remaining = dict(all_entities) # copy
786
+ clusters: list[list[tuple[str, str]]] = [] # [[( name, node_id ), ...], ...]
787
+
788
+ while remaining:
789
+ seed_name, seed_id = next(iter(remaining.items()))
790
+ cluster = [(seed_name, seed_id)]
791
+ del remaining[seed_name]
792
+
793
+ # Find all matches for this seed
794
+ to_remove = []
795
+ for other_name, other_id in remaining.items():
796
+ matched = _find_matching_entity(other_name, {seed_name: seed_id})
797
+ if matched:
798
+ cluster.append((other_name, other_id))
799
+ to_remove.append(other_name)
800
+ for name in to_remove:
801
+ del remaining[name]
802
+
803
+ if len(cluster) > 1:
804
+ # Filter out known false-positive pairs
805
+ names_set = {n for n, _ in cluster}
806
+ if any(pair <= names_set for pair in _DEDUP_SKIP_PAIRS):
807
+ continue
808
+ clusters.append(cluster)
809
+
810
+ print(f"Found {len(clusters)} duplicate clusters", file=sys.stderr)
811
+
812
+ merge_count = 0
813
+ repoint_count = 0
814
+
815
+ for cluster in clusters:
816
+ # Canonical selection: if any entity has significantly more backrefs (5+),
817
+ # use it. Otherwise prefer longest name (most complete spelling).
818
+ max_refs = max(len(store.backrefs(nid)) for _, nid in cluster)
819
+ if max_refs >= 5:
820
+ cluster.sort(key=lambda x: (-len(store.backrefs(x[1])), -len(x[0]), x[0]))
821
+ else:
822
+ cluster.sort(key=lambda x: (-len(x[0]), x[0]))
823
+ canonical_name, canonical_id = cluster[0]
824
+ duplicates = cluster[1:]
825
+
826
+ dup_names = [d[0] for d in duplicates]
827
+ print(f" cluster: {canonical_name} ← {dup_names}", file=sys.stderr)
828
+
829
+ if dry_run:
830
+ merge_count += len(duplicates)
831
+ continue
832
+
833
+ for dup_name, dup_id in duplicates:
834
+ # Re-point all refs pointing to this duplicate
835
+ refs = store.backrefs(dup_id)
836
+ for src_entity, attr in refs:
837
+ tx = store.begin_tx("entity_dedup")
838
+ store.retract_triple(tx, src_entity, attr, dup_id)
839
+ store.assert_triple(tx, src_entity, attr, canonical_id, value_type="ref")
840
+ repoint_count += 1
841
+
842
+ # Retract all triples of the duplicate entity itself
843
+ dup_attrs = store.entity(dup_id)
844
+ tx = store.begin_tx("entity_dedup")
845
+ for attr, values in dup_attrs.items():
846
+ if not isinstance(values, list):
847
+ values = [values]
848
+ for val in values:
849
+ store.retract_triple(tx, dup_id, attr, str(val))
850
+
851
+ merge_count += 1
852
+
853
+ # Stamp migration
854
+ if not dry_run and clusters:
855
+ tx = store.begin_tx("entity_dedup")
856
+ store.assert_triple(tx, "migration:entity-dedup-v1", "applied_at",
857
+ datetime.now(timezone.utc).isoformat())
858
+ store.assert_triple(tx, "migration:entity-dedup-v1", "clusters_merged",
859
+ str(len(clusters)))
860
+
861
+ result = {
862
+ "status": "dry_run" if dry_run else "applied",
863
+ "clusters": len(clusters),
864
+ "entities_merged": merge_count,
865
+ "refs_repointed": repoint_count,
866
+ }
867
+ print(json.dumps(result, indent=2), file=sys.stderr)
868
+ return result
869
+
870
+
698
871
  def main() -> None:
699
872
  parser = argparse.ArgumentParser(description="Knowledge Integrator")
700
873
  parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
701
874
  parser.add_argument("--digest", default=None, help="SessionDigest JSON string")
702
875
  parser.add_argument("--bootstrap", action="store_true", help="One-time: seed graph from playbook")
703
876
  parser.add_argument("--retag", action="store_true", help="Re-extract tags for all existing facts")
877
+ parser.add_argument("--dedup-entities", action="store_true", help="Merge fragmented entity nodes")
878
+ parser.add_argument("--dry-run", action="store_true", help="Preview changes without applying")
704
879
  args = parser.parse_args()
705
880
 
706
881
  memory_dir = args.memory_dir
707
882
  db_path = str(Path(memory_dir) / "knowledge-graph.db")
708
883
 
884
+ # Entity dedup mode: merge fragmented entity nodes
885
+ if args.dedup_entities:
886
+ if not Path(db_path).exists():
887
+ output_json({"error": "knowledge-graph.db not found"})
888
+ return
889
+ result = merge_entity_duplicates(db_path, dry_run=args.dry_run)
890
+ output_json(result)
891
+ return
892
+
709
893
  # Bootstrap mode: seed graph from current playbook
710
894
  if args.bootstrap:
711
895
  result = _bootstrap_graph(memory_dir, db_path)
File without changes