@geravant/sinain 1.12.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/package-lock.json +963 -0
  6. package/sinain-core/package.json +1 -0
  7. package/sinain-core/src/buffers/feed-buffer.ts +34 -0
  8. package/sinain-core/src/embedding/service.ts +66 -0
  9. package/sinain-core/src/index.ts +65 -17
  10. package/sinain-core/src/learning/local-curation.ts +137 -7
  11. package/sinain-core/src/server.ts +31 -0
  12. package/sinain-memory/README.md +105 -0
  13. package/sinain-memory/embed_client.py +117 -0
  14. package/sinain-memory/graph_query.py +269 -18
  15. package/sinain-memory/knowledge_integrator.py +551 -74
  16. package/sinain-memory/memory-config.json +1 -1
  17. package/sinain-memory/session_distiller.py +43 -19
  18. package/sinain-memory/triplestore.py +60 -0
  19. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  20. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  21. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  22. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  23. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/__init__.py +0 -0
  25. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/assertions.py +0 -267
  27. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  38. package/sinain-memory/eval/benchmarks/config.py +0 -23
  39. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  40. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  41. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  42. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  43. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  44. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  45. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  46. package/sinain-memory/eval/benchmarks/query.py +0 -172
  47. package/sinain-memory/eval/benchmarks/report.py +0 -87
  48. package/sinain-memory/eval/benchmarks/runner.py +0 -276
  49. package/sinain-memory/eval/judges/__init__.py +0 -0
  50. package/sinain-memory/eval/judges/base_judge.py +0 -61
  51. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  52. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  53. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  54. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  55. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  56. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  57. package/sinain-memory/eval/schemas.py +0 -247
  58. package/sinain-memory/tests/__init__.py +0 -0
  59. package/sinain-memory/tests/conftest.py +0 -189
  60. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  61. package/sinain-memory/tests/test_embedder.py +0 -210
  62. package/sinain-memory/tests/test_extract_json.py +0 -124
  63. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  64. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  65. package/sinain-memory/tests/test_module_management.py +0 -458
  66. package/sinain-memory/tests/test_parsers.py +0 -96
  67. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  68. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  69. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  70. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  71. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -21,7 +21,9 @@ import json
21
21
  import re
22
22
  import shutil
23
23
  import sys
24
+ import unicodedata
24
25
  from datetime import datetime, timezone
26
+ from difflib import SequenceMatcher
25
27
  from pathlib import Path
26
28
 
27
29
  from common import (
@@ -55,29 +57,33 @@ FOR THE PLAYBOOK:
55
57
  - Three Laws: (1) don't remove error-prevention patterns, (2) preserve high-scoring approaches, (3) then evolve
56
58
 
57
59
  FOR THE KNOWLEDGE GRAPH:
58
- - ASSERT new durable facts (error→fix mappings, domain knowledge, user expertise)
60
+ - ASSERT every concrete fact from the digest: factual claims, decisions, relationships, numbers
59
61
  - REINFORCE existing facts confirmed by the session (list their entity_ids)
60
62
  - RETRACT facts contradicted by session evidence (list their entity_ids)
61
- - Each fact needs: entity (domain/tool/workflow), attribute (relationship type), value (the knowledge), confidence (0.0-1.0), domain (for module scoping)
62
- - Entity naming: use lowercase-hyphenated slugs (e.g., "react-native", "metro-bundler")
63
- - Only assert DURABLE facts — not ephemeral session details
63
+ - Each fact needs: entity (real name from content), attribute (relationship type), value (self-contained sentence), confidence (0.0-1.0), domain (for scoping)
64
+ - Entity naming: use actual names as lowercase-hyphenated slugs
65
+ Good: "citibank", "al-futaim-group", "artom", "intellij-idea"
66
+ Bad: "ai-solutions", "client-understanding", "tool-usage"
67
+ - The value field must be a complete, self-contained sentence that answers a question on its own
68
+ - Assert BOTH durable facts AND time-bound decisions/action items (mark decisions with confidence 0.7)
64
69
 
65
70
  If the session was empty/idle, return minimal changes.
66
71
 
67
- Respond with ONLY a JSON object:
72
+ Respond with ONLY a JSON object. IMPORTANT: put graphOps FIRST (before playbook) — \
73
+ graphOps are the most valuable output and must not be truncated.
68
74
  {
69
- "updatedPlaybook": "full playbook body text (between header and footer comments)",
75
+ "graphOps": [
76
+ {"op": "assert", "entity": "entity-slug", "attribute": "attr-name", "value": "fact text", "confidence": 0.8, "domain": "domain-name"},
77
+ {"op": "reinforce", "entityId": "fact:existing-slug"},
78
+ {"op": "retract", "entityId": "fact:existing-slug", "reason": "why"}
79
+ ],
70
80
  "changes": {
71
81
  "added": ["pattern text", ...],
72
82
  "pruned": ["pattern text", ...],
73
83
  "promoted": ["pattern text", ...],
74
84
  "reinforced": ["pattern text", ...]
75
85
  },
76
- "graphOps": [
77
- {"op": "assert", "entity": "entity-slug", "attribute": "attr-name", "value": "fact text", "confidence": 0.8, "domain": "domain-name"},
78
- {"op": "reinforce", "entityId": "fact:existing-slug"},
79
- {"op": "retract", "entityId": "fact:existing-slug", "reason": "why"}
80
- ]
86
+ "updatedPlaybook": "full playbook body text (between header and footer comments)"
81
87
  }"""
82
88
 
83
89
 
@@ -117,51 +123,127 @@ def _fact_id(entity: str, attribute: str, value: str) -> str:
117
123
  return f"fact:{slug}-{h}"
118
124
 
119
125
 
126
# Characters that NFKD decomposition leaves untouched but that have a
# conventional ASCII spelling; applied before the generic transliteration.
_UNICODE_PRE_MAP = str.maketrans({"ß": "ss", "ẞ": "SS"})


def _normalize_entity(name: str) -> str:
    """Return the canonical slug for an entity name.

    The slug is lowercase ASCII made of ``a-z``, ``0-9`` and single hyphens.
    Accented letters are reduced to their base letter via NFKD decomposition,
    characters with no ASCII form are dropped, spaces and underscores become
    hyphens, every other character is removed, hyphen runs are collapsed and
    leading/trailing hyphens are trimmed.  The result may be the empty string
    (e.g. for a name consisting only of non-ASCII characters).
    """
    decomposed = unicodedata.normalize("NFKD", name.translate(_UNICODE_PRE_MAP))
    ascii_lower = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    hyphenated = ascii_lower.replace(" ", "-").replace("_", "-")
    slug = re.sub(r"[^a-z0-9-]", "", hyphenated)
    return re.sub(r"-{2,}", "-", slug).strip("-")
138
+
139
+
140
+ def _find_matching_entity(
141
+ name: str,
142
+ existing_names: dict[str, str],
143
+ ) -> str | None:
144
+ """Find an existing entity that fuzzy-matches `name`. Returns entity_node_id or None."""
145
+ if name in existing_names:
146
+ return existing_names[name]
147
+
148
+ # Hyphen-insensitive exact match (chatgpt == chat-gpt)
149
+ name_compact = name.replace("-", "")
150
+ for existing_name, node_id in existing_names.items():
151
+ if existing_name.replace("-", "") == name_compact:
152
+ return node_id
153
+
154
+ # Edit-distance fuzzy match
155
+ if len(name) < 3:
156
+ return None
157
+ threshold = 0.90
158
+ best_match = None
159
+ best_ratio = threshold
160
+ for existing_name, node_id in existing_names.items():
161
+ if len(existing_name) < 3:
162
+ continue
163
+ if frozenset({name, existing_name}) in _DEDUP_SKIP_PAIRS:
164
+ continue
165
+ ratio = SequenceMatcher(None, name, existing_name).ratio()
166
+ if ratio >= best_ratio:
167
+ best_ratio = ratio
168
+ best_match = node_id
169
+ return best_match
123
170
 
124
171
 
125
def _canonicalize_ops(
    ops: list[dict],
    existing_entities: list[str],
    existing_facts: list[dict],
) -> list[dict]:
    """Deduplicate graph ops against the stored facts and within the batch.

    Non-assert ops pass through untouched and in place.  Each ``assert`` op
    is checked in this order:

      1. Exact id match: if its ``_fact_id`` is already stored it becomes a
         ``reinforce``; if the same id was already emitted in this batch it
         is dropped.
      2. Semantic match: if ``embed_client.find_duplicates_batch`` (one call
         for the whole batch) mapped it to an existing fact, it becomes a
         ``reinforce`` of that fact.
      3. Intra-batch match: dropped if an earlier assert in this batch had
         the same ``value`` text.

    If the embedding lookup fails for any reason, only steps 1 and 3 apply.

    Args:
        ops: graph operations in execution order.
        existing_entities: ids of facts already in the store.
        existing_facts: dicts carrying ``value`` and ``entityId`` or
            ``entity_id``; entries missing either are ignored.

    Returns:
        A new list of ops in the original relative order.
    """
    existing_id_set = set(existing_entities)

    # Parallel lists: existing_texts[i] is the value of fact existing_ids[i].
    existing_texts: list[str] = []
    existing_ids: list[str] = []
    for f in existing_facts:
        val = f.get("value", "")
        eid = f.get("entityId", f.get("entity_id", ""))
        if val and eid:
            existing_texts.append(val)
            existing_ids.append(eid)

    # Values of the assert ops, in order; index k here is "assert position k".
    new_values = [op.get("value", "") for op in ops if op.get("op") == "assert"]

    # Batch embedding dedup: single call for all new facts.
    dedup_map: dict[int, int] = {}  # assert position -> index into existing_ids
    if new_values and existing_texts:
        try:
            from embed_client import find_duplicates_batch

            dedup_map = find_duplicates_batch(new_values, existing_texts) or {}
            if dedup_map:
                print(f" [dedup] found {len(dedup_map)} semantic duplicates in batch", file=sys.stderr)
        except Exception as e:
            # Best-effort: fall through to exact matching, but say so.
            print(f" [dedup] embedding dedup unavailable, exact matching only: {e}", file=sys.stderr)
            dedup_map = {}

    result: list[dict] = []
    seen_fact_ids: set[str] = set()
    seen_values_set: set[str] = set()

    assert_pos = -1  # running position among assert ops, matches new_values
    for op in ops:
        if op.get("op") != "assert":
            result.append(op)
            continue
        assert_pos += 1

        entity = op.get("entity", "")
        attribute = op.get("attribute", "")
        value = op.get("value", "")
        fact_id = _fact_id(entity, attribute, value)

        # Exact hash match
        if fact_id in existing_id_set:
            result.append({"op": "reinforce", "entityId": fact_id})
            print(f" [dedup] exact → reinforce '{fact_id}'", file=sys.stderr)
            continue
        if fact_id in seen_fact_ids:
            continue

        # Batch embedding dedup result; ignore indices that do not point at
        # a known existing fact instead of raising.
        dup_existing_idx = dedup_map.get(assert_pos)
        if isinstance(dup_existing_idx, int) and 0 <= dup_existing_idx < len(existing_ids):
            matched_id = existing_ids[dup_existing_idx]
            result.append({"op": "reinforce", "entityId": matched_id})
            print(f" [dedup] semantic → reinforce '{matched_id}'", file=sys.stderr)
            continue

        # Intra-batch dedup (by value text)
        if value in seen_values_set:
            continue

        result.append(op)
        seen_fact_ids.add(fact_id)
        seen_values_set.add(value)

    return result
167
249
 
@@ -179,7 +261,14 @@ def _load_graph_facts(db_path: str, entities: list[str] | None = None, limit: in
179
261
  if entities:
180
262
  # Tag-based search: find facts whose tags match any of the keywords
181
263
  # Normalize keywords to lowercase for tag matching
182
- keywords = [e.lower().replace(" ", "-") for e in entities]
264
+ # Handle both old-style string entities and new-style dict entities
265
+ keywords = []
266
+ for e in entities:
267
+ if isinstance(e, dict):
268
+ keywords.append(e.get("name", "").lower().replace(" ", "-"))
269
+ else:
270
+ keywords.append(str(e).lower().replace(" ", "-"))
271
+ keywords = [k for k in keywords if k]
183
272
  placeholders = ",".join(["?" for _ in keywords])
184
273
  rows = store._conn.execute(
185
274
  f"""SELECT entity_id, COUNT(*) as matches
@@ -221,8 +310,156 @@ def _load_graph_facts(db_path: str, entities: list[str] | None = None, limit: in
221
310
  return []
222
311
 
223
312
 
224
- def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
225
- """Execute graph operations (assert/reinforce/retract) on the knowledge graph."""
313
def _consolidate_entity_facts(db_path: str, min_facts: int = 3) -> int:
    """Merge multiple facts about the same entity into consolidated facts.

    Pure code, no LLM.  For every entity with at least `min_facts` stored
    ``fact:*`` entries, near-duplicate values are dropped (word overlap above
    0.7), the remainder is joined with "; " (truncated to 500 characters plus
    "..."), written as a new fact with attribute ``consolidated``, and the
    original facts' attributes are retracted.  Entities that already hold a
    value containing ";" and longer than 100 characters are treated as
    already consolidated and skipped.

    The original author's note says this is meant to run at shutdown only,
    not on incremental passes.

    Args:
        db_path: path handed to ``TripleStore``.
        min_facts: minimum number of facts an entity needs to be considered.

    Returns:
        Number of entities consolidated, or 0 if anything failed (the error
        is printed to stderr; this function does not raise).
    """
    store = None
    try:
        from triplestore import TripleStore
        store = TripleStore(db_path)

        # Group facts by entity name
        entity_facts: dict[str, list[tuple[str, str]]] = {}  # entity → [(fact_id, value)]
        for r in store.entities_with_attr("entity"):
            fact_id, entity_name = r[0], r[1]
            if not fact_id.startswith("fact:") or isinstance(entity_name, list):
                continue
            attrs = store.entity(fact_id)
            if not attrs or "value" not in attrs:
                continue
            raw = attrs["value"]
            if isinstance(raw, list):
                if not raw:
                    continue  # attribute present but empty; nothing to merge
                val = raw[0]
            else:
                val = str(raw)
            entity_facts.setdefault(entity_name, []).append((fact_id, val))

        consolidated = 0
        for entity_name, facts in entity_facts.items():
            if len(facts) < min_facts:
                continue

            # Check if a consolidated fact already exists
            if any(";" in val and len(val) > 100 for _, val in facts):
                continue  # already consolidated

            # Deduplicate values (same fact stated differently).  Overlap is
            # shared lowercase words divided by the candidate's word count.
            kept: list[tuple[str, set[str]]] = []
            for _, val in facts:
                tokens = set(val.lower().split())
                denom = max(len(val.split()), 1)
                if not any(len(tokens & prior) / denom > 0.7 for _, prior in kept):
                    kept.append((val, tokens))
            seen_values = [val for val, _ in kept]

            if len(seen_values) < 2:
                continue  # nothing to consolidate after dedup

            merged_value = "; ".join(seen_values)
            if len(merged_value) > 500:
                merged_value = merged_value[:500] + "..."

            # Create consolidated fact, retract originals
            tx = store.begin_tx("consolidation")
            new_eid = _fact_id(entity_name, "consolidated", merged_value)
            store.assert_triple(tx, new_eid, "entity", entity_name)
            store.assert_triple(tx, new_eid, "attribute", "consolidated")
            store.assert_triple(tx, new_eid, "value", merged_value)
            store.assert_triple(tx, new_eid, "confidence", "0.95")
            store.assert_triple(tx, new_eid, "first_seen", _now_iso())
            store.assert_triple(tx, new_eid, "reinforce_count", str(len(facts)))
            for tag in _extract_tags(merged_value):
                store.assert_triple(tx, new_eid, "tag", tag)

            # Retract original individual facts
            for old_eid, _ in facts:
                if old_eid == new_eid:
                    continue  # never retract the fact we just wrote
                for attr_name in list((store.entity(old_eid) or {}).keys()):
                    store.retract_triple(tx, old_eid, attr_name)

            consolidated += 1
            print(f" [consolidate] {entity_name}: {len(facts)} facts → 1 ({len(merged_value)} chars)", file=sys.stderr)

        return consolidated
    except Exception as e:
        print(f" [consolidate] failed: {e}", file=sys.stderr)
        return 0
    finally:
        # Release the store on every path, including failures part-way through.
        if store is not None:
            try:
                store.close()
            except Exception as e:
                print(f" [consolidate] close failed: {e}", file=sys.stderr)
382
+
383
+
384
def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS.ffffffZ``."""
    from datetime import datetime, timezone

    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
387
+
388
+
389
def _extract_entity_from_fact(fact_text: str, known_entities: list) -> str:
    """Pick the entity slug a fact sentence is most likely about.

    Resolution order:
      1. Known entities (strings, or dicts with a ``name``) whose name occurs
         in the fact text, compared case-insensitively with hyphens treated
         as spaces; the longest name is preferred as the most specific.
      2. The first run of two or more capitalised ASCII words.
      3. The first word longer than three characters that starts uppercase.
      4. The literal ``"general"``.

    A candidate whose normalised slug is empty (for instance a name made
    only of non-ASCII characters) is skipped, so the result is never empty.
    Entries in `known_entities` that are neither ``str`` nor ``dict`` are
    ignored.
    """
    haystack = fact_text.lower().replace("-", " ")  # hoisted: loop-invariant

    # Check which known entities appear in the fact text
    candidates = []
    for ent in known_entities:
        if isinstance(ent, str):
            ename = ent
        elif isinstance(ent, dict):
            ename = ent.get("name", "")
        else:
            continue
        if isinstance(ename, str) and ename and ename.lower().replace("-", " ") in haystack:
            candidates.append(ename)

    # Longest match first; the stable sort keeps the earliest among equals.
    for ename in sorted(candidates, key=len, reverse=True):
        slug = _normalize_entity(ename)
        if slug:
            return slug

    # Fallback: first capitalized multi-word phrase
    match = re.search(r"[A-Z][a-z]+(?: [A-Z][a-z]+)+", fact_text)
    if match:
        slug = _normalize_entity(match.group())
        if slug:
            return slug

    # Last resort: first significant word that yields a usable slug
    for word in fact_text.split():
        if len(word) > 3 and word[0].isupper():
            slug = _normalize_entity(word)
            if slug:
                return slug

    return "general"
418
+
419
+
420
def _facts_to_graph_ops(digest: dict) -> list[dict]:
    """Convert distiller facts and decisions directly into assert ops.

    DETERMINISTIC, no LLM needed.  Every string of at least five characters
    under ``digest["facts"]`` becomes an ``assert`` with attribute ``fact``
    and confidence 0.9; every such string under ``digest["decisions"]``
    becomes an ``assert`` with attribute ``decision`` and confidence 0.7
    (time-bound, hence lower).  The entity is chosen by
    ``_extract_entity_from_fact`` using ``digest["entities"]``.

    Missing or ``None`` sections and non-string items are skipped rather
    than raising.

    Returns:
        Ops in order: all facts first, then all decisions.
    """
    known_entities = digest.get("entities") or []

    # (digest key, op attribute, confidence)
    sections = (
        ("facts", "fact", 0.9),
        ("decisions", "decision", 0.7),
    )

    ops: list[dict] = []
    for key, attribute, confidence in sections:
        for text in digest.get(key) or []:
            if not isinstance(text, str) or len(text) < 5:
                continue
            ops.append({
                "op": "assert",
                "entity": _extract_entity_from_fact(text, known_entities),
                "attribute": attribute,
                "value": text,
                "confidence": confidence,
                "domain": "",
            })
    return ops
459
+
460
+
461
+ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_entities: list | None = None) -> dict:
462
+ """Execute graph operations + build entity graph with ref edges."""
226
463
  if not ops:
227
464
  return {"asserted": 0, "reinforced": 0, "retracted": 0}
228
465
 
@@ -230,9 +467,18 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
230
467
  from triplestore import TripleStore
231
468
  store = TripleStore(db_path)
232
469
 
233
- # Canonicalize entity names to prevent fragmentation
470
+ # Deduplicate via embedding similarity (Mem0 pattern)
234
471
  existing_ids = [r[0] for r in store.entities_with_attr("entity")]
235
- ops = _canonicalize_ops(ops, existing_ids)
472
+ # Load existing fact values for semantic comparison
473
+ existing_facts_for_dedup = []
474
+ for eid in existing_ids:
475
+ attrs = store.entity(eid)
476
+ if attrs and "value" in attrs:
477
+ vals = attrs["value"]
478
+ val = vals[0] if isinstance(vals, list) and vals else str(vals) if vals else ""
479
+ if val:
480
+ existing_facts_for_dedup.append({"entity_id": eid, "value": val})
481
+ ops = _canonicalize_ops(ops, existing_ids, existing_facts_for_dedup)
236
482
 
237
483
  stats = {"asserted": 0, "reinforced": 0, "retracted": 0}
238
484
 
@@ -322,10 +568,90 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
322
568
  store.retract_triple(tx, entity_id, attr_name, val)
323
569
  stats["retracted"] += 1
324
570
 
571
+ # --- Build entity graph layer (two-layer model) ---
572
+ if digest_entities and stats["asserted"] > 0:
573
+ try:
574
+ # Load existing entity names for fuzzy matching
575
+ all_entity_nodes: dict[str, str] = {} # {name: entity_node_id}
576
+ for r in store.entities_with_attr("name"):
577
+ if r[0].startswith("entity:"):
578
+ all_entity_nodes[r[1]] = r[0]
579
+
580
+ # Create entity:* nodes from digest entities (with fuzzy dedup)
581
+ entity_resolve: dict[str, str] = {} # {normalized_name: resolved_node_id}
582
+ for ent in (digest_entities or []):
583
+ if isinstance(ent, dict):
584
+ ename = _normalize_entity(ent.get("name", ""))
585
+ etype = ent.get("type", "unknown")
586
+ else:
587
+ ename = _normalize_entity(str(ent))
588
+ etype = "unknown"
589
+ if not ename or len(ename) < 2:
590
+ continue
591
+
592
+ # Check for fuzzy match against existing entities
593
+ matched_id = _find_matching_entity(ename, all_entity_nodes)
594
+ if matched_id:
595
+ entity_resolve[ename] = matched_id
596
+ if matched_id != f"entity:{ename}":
597
+ print(f" [graph] alias: \"{ename}\" → {matched_id}", file=sys.stderr)
598
+ continue
599
+
600
+ entity_node_id = f"entity:{ename}"
601
+ existing = store.entity(entity_node_id)
602
+ if not existing:
603
+ tx = store.begin_tx("entity_graph")
604
+ store.assert_triple(tx, entity_node_id, "name", ename)
605
+ store.assert_triple(tx, entity_node_id, "type", etype)
606
+ all_entity_nodes[ename] = entity_node_id
607
+ entity_resolve[ename] = entity_node_id
608
+
609
+ # Link facts to their entity nodes via "about" ref edges
610
+ for op_data in ops:
611
+ if op_data.get("op") != "assert":
612
+ continue
613
+ entity = op_data.get("entity", "")
614
+ value = op_data.get("value", "")
615
+ attribute = op_data.get("attribute", "")
616
+ fact_eid = _fact_id(entity, attribute, value)
617
+ norm_entity = _normalize_entity(entity)
618
+ entity_node_id = entity_resolve.get(norm_entity, f"entity:{norm_entity}")
619
+ # Only link if entity node exists
620
+ if store.entity(entity_node_id):
621
+ tx = store.begin_tx("entity_graph")
622
+ store.assert_triple(tx, fact_eid, "about", entity_node_id, value_type="ref")
623
+
624
+ ref_count = 0
625
+ for fact_eid_row in store.entities_with_attr("value"):
626
+ fact_eid = fact_eid_row[0]
627
+ if not fact_eid.startswith("fact:"):
628
+ continue
629
+ attrs = store.entity(fact_eid)
630
+ source_entity = (attrs.get("entity", [""])[0] if attrs.get("entity") else "").lower()
631
+ value_lower = (attrs["value"][0] if attrs.get("value") else "").lower()
632
+
633
+ for ename, enode_id in all_entity_nodes.items():
634
+ if ename == source_entity or len(ename) < 4:
635
+ continue
636
+ if ename in value_lower:
637
+ existing_refs = store.backrefs(enode_id, attribute="mentions")
638
+ if not any(r[0] == fact_eid for r in existing_refs):
639
+ tx = store.begin_tx("ref_inference")
640
+ store.assert_triple(tx, fact_eid, "mentions", enode_id, value_type="ref")
641
+ ref_count += 1
642
+
643
+ if ref_count:
644
+ stats["refs_created"] = ref_count
645
+ print(f" [graph] {len(all_entity_nodes)} entity nodes, {ref_count} ref edges", file=sys.stderr)
646
+ except Exception as e:
647
+ print(f" [graph] entity graph failed (non-fatal): {e}", file=sys.stderr)
648
+
325
649
  store.close()
326
650
  return stats
327
651
  except Exception as e:
652
+ import traceback
328
653
  print(f"[warn] Failed to execute graph ops: {e}", file=sys.stderr)
654
+ traceback.print_exc(file=sys.stderr)
329
655
  return {"asserted": 0, "reinforced": 0, "retracted": 0, "error": str(e)}
330
656
 
331
657
 
@@ -424,17 +750,146 @@ def _bootstrap_graph(memory_dir: str, db_path: str) -> dict:
424
750
  return {"bootstrapped": stats.get("asserted", 0)}
425
751
 
426
752
 
753
# Entity-name pairs that the fuzzy matcher would cluster together but that
# were reviewed and confirmed to be distinct; they must never be merged.
_DEDUP_SKIP_PAIRS = {
    frozenset(pair)
    for pair in (
        ("ai-driven-development", "spac-driven-development"),
        ("german", "germany"),
        ("llama", "ollama"),
        ("gemma", "gemma4"),
    )
}
760
+
761
+
762
def merge_entity_duplicates(db_path: str, dry_run: bool = True) -> dict:
    """Merge fragmented ``entity:*`` nodes using fuzzy name matching.

    Entities are greedily clustered around a seed via
    ``_find_matching_entity``; clusters containing a pair from
    ``_DEDUP_SKIP_PAIRS`` are discarded.  In each cluster one node is kept:
    the most-referenced one if any member has five or more back-references,
    otherwise the longest name.  Refs pointing at the other members are
    re-pointed to it and those members' own triples are retracted.

    Idempotent: if ``migration:entity-dedup-v1`` exists nothing is done.  The
    stamp is written only after a non-dry run that found clusters.

    Args:
        db_path: path handed to ``TripleStore``.
        dry_run: when True (default) only report what would be merged.

    Returns:
        ``{"status": "already_applied"}``, or a dict with ``status``
        (``"dry_run"`` / ``"applied"``), ``clusters``, ``entities_merged``
        and ``refs_repointed``.
    """
    from triplestore import TripleStore

    store = TripleStore(db_path)
    try:
        # Idempotency check
        if store.entity("migration:entity-dedup-v1"):
            print("migration:entity-dedup-v1 already applied — skipping", file=sys.stderr)
            return {"status": "already_applied"}

        # Load all entity nodes
        all_entities: dict[str, str] = {}  # {name: entity_node_id}
        for entity_id, name in store.entities_with_attr("name"):
            if entity_id.startswith("entity:"):
                all_entities[name] = entity_id

        print(f"Total entity nodes: {len(all_entities)}", file=sys.stderr)

        # Build clusters via greedy matching
        remaining = dict(all_entities)  # copy
        clusters: list[list[tuple[str, str]]] = []  # [[(name, node_id), ...], ...]

        while remaining:
            seed_name, seed_id = next(iter(remaining.items()))
            cluster = [(seed_name, seed_id)]
            del remaining[seed_name]

            # Find all matches for this seed
            seed_lookup = {seed_name: seed_id}
            matches = [
                (other_name, other_id)
                for other_name, other_id in remaining.items()
                if _find_matching_entity(other_name, seed_lookup)
            ]
            cluster.extend(matches)
            for other_name, _ in matches:
                del remaining[other_name]

            if len(cluster) > 1:
                # Filter out known false-positive pairs
                names_set = {n for n, _ in cluster}
                if any(pair <= names_set for pair in _DEDUP_SKIP_PAIRS):
                    continue
                clusters.append(cluster)

        print(f"Found {len(clusters)} duplicate clusters", file=sys.stderr)

        merge_count = 0
        repoint_count = 0

        for cluster in clusters:
            # Canonical selection: if any entity has significantly more
            # backrefs (5+), use it.  Otherwise prefer longest name (most
            # complete spelling).  Counts are fetched once per node rather
            # than on every sort-key evaluation.
            ref_counts = {nid: len(store.backrefs(nid)) for _, nid in cluster}
            if max(ref_counts.values()) >= 5:
                cluster.sort(key=lambda x: (-ref_counts[x[1]], -len(x[0]), x[0]))
            else:
                cluster.sort(key=lambda x: (-len(x[0]), x[0]))
            canonical_name, canonical_id = cluster[0]
            duplicates = cluster[1:]

            dup_names = [d[0] for d in duplicates]
            print(f" cluster: {canonical_name} ← {dup_names}", file=sys.stderr)

            if dry_run:
                merge_count += len(duplicates)
                continue

            for dup_name, dup_id in duplicates:
                # Re-point all refs pointing to this duplicate
                for src_entity, attr in store.backrefs(dup_id):
                    tx = store.begin_tx("entity_dedup")
                    store.retract_triple(tx, src_entity, attr, dup_id)
                    store.assert_triple(tx, src_entity, attr, canonical_id, value_type="ref")
                    repoint_count += 1

                # Retract all triples of the duplicate entity itself
                dup_attrs = store.entity(dup_id)
                tx = store.begin_tx("entity_dedup")
                for attr, values in dup_attrs.items():
                    if not isinstance(values, list):
                        values = [values]
                    for val in values:
                        store.retract_triple(tx, dup_id, attr, str(val))

                merge_count += 1

        # Stamp migration
        if not dry_run and clusters:
            tx = store.begin_tx("entity_dedup")
            store.assert_triple(tx, "migration:entity-dedup-v1", "applied_at",
                                datetime.now(timezone.utc).isoformat())
            store.assert_triple(tx, "migration:entity-dedup-v1", "clusters_merged",
                                str(len(clusters)))

        result = {
            "status": "dry_run" if dry_run else "applied",
            "clusters": len(clusters),
            "entities_merged": merge_count,
            "refs_repointed": repoint_count,
        }
        print(json.dumps(result, indent=2), file=sys.stderr)
        return result
    finally:
        # The store was previously left open on every path, including the
        # early "already applied" return.
        store.close()
869
+
870
+
427
871
  def main() -> None:
428
872
  parser = argparse.ArgumentParser(description="Knowledge Integrator")
429
873
  parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
430
874
  parser.add_argument("--digest", default=None, help="SessionDigest JSON string")
431
875
  parser.add_argument("--bootstrap", action="store_true", help="One-time: seed graph from playbook")
432
876
  parser.add_argument("--retag", action="store_true", help="Re-extract tags for all existing facts")
877
+ parser.add_argument("--dedup-entities", action="store_true", help="Merge fragmented entity nodes")
878
+ parser.add_argument("--dry-run", action="store_true", help="Preview changes without applying")
433
879
  args = parser.parse_args()
434
880
 
435
881
  memory_dir = args.memory_dir
436
882
  db_path = str(Path(memory_dir) / "knowledge-graph.db")
437
883
 
884
+ # Entity dedup mode: merge fragmented entity nodes
885
+ if args.dedup_entities:
886
+ if not Path(db_path).exists():
887
+ output_json({"error": "knowledge-graph.db not found"})
888
+ return
889
+ result = merge_entity_duplicates(db_path, dry_run=args.dry_run)
890
+ output_json(result)
891
+ return
892
+
438
893
  # Bootstrap mode: seed graph from current playbook
439
894
  if args.bootstrap:
440
895
  result = _bootstrap_graph(memory_dir, db_path)
@@ -506,39 +961,61 @@ def main() -> None:
506
961
  facts_lines.append(f"- [{eid}] ({domain}, confidence={conf}) {val}")
507
962
  facts_text = f"\n\n## Existing Graph Facts (for reference — reinforce or retract as needed)\n" + "\n".join(facts_lines)
508
963
 
509
- user_prompt = f"""## Session Digest
510
- {json.dumps(digest, indent=2, ensure_ascii=False)}
964
+ # ── Step 1: DETERMINISTIC graph ops from distiller output (no LLM needed) ──
965
+ # The distiller already extracted structured facts — conversion is mechanical.
966
+ graph_ops = _facts_to_graph_ops(digest)
967
+ digest_ts = digest.get("ts", datetime.now(timezone.utc).isoformat())
511
968
 
512
- ## Current Playbook Body
513
- {body}{facts_text}"""
969
+ # Dedup + execute
970
+ graph_stats = _execute_graph_ops(db_path, graph_ops, digest_ts, digest_entities=digest_entities)
514
971
 
515
- try:
516
- raw = call_llm_with_fallback(
517
- SYSTEM_PROMPT,
518
- user_prompt,
519
- script="knowledge_integrator",
520
- json_mode=True,
521
- )
522
- result = extract_json(raw)
523
- except (ValueError, LLMError) as e:
524
- print(f"LLM integration failed: {e}", file=sys.stderr)
525
- output_json({"error": str(e)})
526
- return
972
+ # NOTE: Consolidation (merging entity facts) and summaries both HURT retrieval
973
+ # at our scale (<200 facts). Individual facts are more retrievable than merged ones.
974
+ # Keep facts separate — dedup handles true duplicates, different facts stay distinct.
527
975
 
528
- # Archive current playbook before mutation
976
+ # ── Step 2: Automated playbook curation (tag overlap, no LLM) ──
529
977
  archive_path = _archive_playbook(memory_dir)
530
-
531
- # Write updated playbook
532
- updated_body = result.get("updatedPlaybook", body)
978
+ active_tags = set()
979
+ for op in graph_ops:
980
+ active_tags.update(_extract_tags(op.get("value", "")))
981
+
982
+ playbook_lines = [l for l in body.splitlines() if l.strip() and not l.startswith("<!--")]
983
+ changes: dict[str, list[str]] = {"added": [], "pruned": [], "promoted": [], "reinforced": []}
984
+
985
+ # Reinforce playbook lines whose tags overlap with this session
986
+ updated_lines = []
987
+ for line in playbook_lines:
988
+ line_tags = set(_extract_tags(line))
989
+ if line_tags & active_tags:
990
+ # Increment seen count: "... (seen 3)" → "... (seen 4)"
991
+ import re as _re
992
+ seen_match = _re.search(r"\(seen (\d+)\)", line)
993
+ if seen_match:
994
+ old_count = int(seen_match.group(1))
995
+ line = line[:seen_match.start()] + f"(seen {old_count + 1})" + line[seen_match.end():]
996
+ changes["reinforced"].append(line.strip()[:60])
997
+ updated_lines.append(line)
998
+ else:
999
+ updated_lines.append(line)
1000
+
1001
+ # Add novel facts as new playbook lines (no LLM — just format as bullet points)
1002
+ for fact in digest.get("facts", [])[:5]: # cap at 5 new lines per pass
1003
+ fact_tags = set(_extract_tags(fact))
1004
+ # Only add if no existing playbook line covers this
1005
+ if not any(set(_extract_tags(l)) & fact_tags for l in playbook_lines if len(fact_tags) > 1):
1006
+ new_line = f"- {fact} (seen 1)"
1007
+ updated_lines.append(new_line)
1008
+ changes["added"].append(fact[:60])
1009
+
1010
+ # Keep playbook under 50 lines
1011
+ if len(updated_lines) > 50:
1012
+ updated_lines = updated_lines[:50]
1013
+
1014
+ updated_body = "\n".join(updated_lines)
533
1015
  new_playbook = f"{header}\n\n{updated_body}\n\n{footer}".strip() + "\n"
534
1016
  playbook_path = Path(memory_dir) / "sinain-playbook.md"
535
1017
  playbook_path.write_text(new_playbook, encoding="utf-8")
536
1018
 
537
- # Execute graph operations
538
- graph_ops = result.get("graphOps", [])
539
- digest_ts = digest.get("ts", datetime.now(timezone.utc).isoformat())
540
- graph_stats = _execute_graph_ops(db_path, graph_ops, digest_ts)
541
-
542
1019
  # Append digest to session-digests.jsonl
543
1020
  digests_path = Path(memory_dir) / "session-digests.jsonl"
544
1021
  with open(digests_path, "a", encoding="utf-8") as f:
@@ -548,7 +1025,7 @@ def main() -> None:
548
1025
  log_entry = {
549
1026
  "ts": datetime.now(timezone.utc).isoformat(),
550
1027
  "_type": "integration",
551
- "changes": result.get("changes", {}),
1028
+ "changes": changes,
552
1029
  "graphStats": graph_stats,
553
1030
  "digestEntities": digest_entities,
554
1031
  "archivePath": archive_path,
@@ -563,7 +1040,7 @@ def main() -> None:
563
1040
 
564
1041
  output_json({
565
1042
  "status": "ok",
566
- "changes": result.get("changes", {}),
1043
+ "changes": changes,
567
1044
  "graphStats": graph_stats,
568
1045
  "playbookLines": len(new_playbook.splitlines()),
569
1046
  })