@geravant/sinain 1.12.0 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/package-lock.json +963 -0
- package/sinain-core/package.json +1 -0
- package/sinain-core/src/buffers/feed-buffer.ts +32 -0
- package/sinain-core/src/embedding/service.ts +66 -0
- package/sinain-core/src/index.ts +19 -2
- package/sinain-core/src/learning/local-curation.ts +137 -7
- package/sinain-core/src/server.ts +31 -0
- package/sinain-memory/README.md +105 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/embed_client.py +117 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +81 -0
- package/sinain-memory/eval/benchmarks/meeting_runner.py +230 -0
- package/sinain-memory/eval/benchmarks/query.py +37 -16
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +318 -0
- package/sinain-memory/eval/benchmarks/runner.py +10 -3
- package/sinain-memory/graph_query.py +257 -15
- package/sinain-memory/knowledge_integrator.py +365 -72
- package/sinain-memory/memory-config.json +1 -1
- package/sinain-memory/session_distiller.py +43 -19
- package/sinain-memory/triplestore.py +60 -0
|
@@ -55,29 +55,33 @@ FOR THE PLAYBOOK:
|
|
|
55
55
|
- Three Laws: (1) don't remove error-prevention patterns, (2) preserve high-scoring approaches, (3) then evolve
|
|
56
56
|
|
|
57
57
|
FOR THE KNOWLEDGE GRAPH:
|
|
58
|
-
- ASSERT
|
|
58
|
+
- ASSERT every concrete fact from the digest: factual claims, decisions, relationships, numbers
|
|
59
59
|
- REINFORCE existing facts confirmed by the session (list their entity_ids)
|
|
60
60
|
- RETRACT facts contradicted by session evidence (list their entity_ids)
|
|
61
|
-
- Each fact needs: entity (
|
|
62
|
-
- Entity naming: use lowercase-hyphenated slugs
|
|
63
|
-
|
|
61
|
+
- Each fact needs: entity (real name from content), attribute (relationship type), value (self-contained sentence), confidence (0.0-1.0), domain (for scoping)
|
|
62
|
+
- Entity naming: use actual names as lowercase-hyphenated slugs
|
|
63
|
+
Good: "citibank", "al-futaim-group", "artom", "intellij-idea"
|
|
64
|
+
Bad: "ai-solutions", "client-understanding", "tool-usage"
|
|
65
|
+
- The value field must be a complete, self-contained sentence that answers a question on its own
|
|
66
|
+
- Assert BOTH durable facts AND time-bound decisions/action items (mark decisions with confidence 0.7)
|
|
64
67
|
|
|
65
68
|
If the session was empty/idle, return minimal changes.
|
|
66
69
|
|
|
67
|
-
Respond with ONLY a JSON object:
|
|
70
|
+
Respond with ONLY a JSON object. IMPORTANT: put graphOps FIRST (before playbook) — \
|
|
71
|
+
graphOps are the most valuable output and must not be truncated.
|
|
68
72
|
{
|
|
69
|
-
"
|
|
73
|
+
"graphOps": [
|
|
74
|
+
{"op": "assert", "entity": "entity-slug", "attribute": "attr-name", "value": "fact text", "confidence": 0.8, "domain": "domain-name"},
|
|
75
|
+
{"op": "reinforce", "entityId": "fact:existing-slug"},
|
|
76
|
+
{"op": "retract", "entityId": "fact:existing-slug", "reason": "why"}
|
|
77
|
+
],
|
|
70
78
|
"changes": {
|
|
71
79
|
"added": ["pattern text", ...],
|
|
72
80
|
"pruned": ["pattern text", ...],
|
|
73
81
|
"promoted": ["pattern text", ...],
|
|
74
82
|
"reinforced": ["pattern text", ...]
|
|
75
83
|
},
|
|
76
|
-
"
|
|
77
|
-
{"op": "assert", "entity": "entity-slug", "attribute": "attr-name", "value": "fact text", "confidence": 0.8, "domain": "domain-name"},
|
|
78
|
-
{"op": "reinforce", "entityId": "fact:existing-slug"},
|
|
79
|
-
{"op": "retract", "entityId": "fact:existing-slug", "reason": "why"}
|
|
80
|
-
]
|
|
84
|
+
"updatedPlaybook": "full playbook body text (between header and footer comments)"
|
|
81
85
|
}"""
|
|
82
86
|
|
|
83
87
|
|
|
@@ -122,46 +126,81 @@ def _normalize_entity(name: str) -> str:
|
|
|
122
126
|
return re.sub(r"[^a-z0-9-]", "", name.lower().replace(" ", "-").replace("_", "-"))
|
|
123
127
|
|
|
124
128
|
|
|
125
|
-
def _canonicalize_ops(ops: list[dict], existing_entities: list[str]) -> list[dict]:
|
|
126
|
-
"""
|
|
129
|
+
def _canonicalize_ops(ops: list[dict], existing_entities: list[str], existing_facts: list[dict]) -> list[dict]:
|
|
130
|
+
"""Deduplicate graph ops via embedding similarity (Mem0 pattern).
|
|
127
131
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
132
|
+
For each new assertion, check if a semantically equivalent fact already exists
|
|
133
|
+
using cosine similarity (threshold 0.78). If so, reinforce instead of asserting.
|
|
134
|
+
Falls back to exact hash matching if embedding service is unavailable.
|
|
131
135
|
"""
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
+
existing_id_set = set(existing_entities)
|
|
137
|
+
|
|
138
|
+
# Build text→entity_id map for existing facts (for embedding-based dedup)
|
|
139
|
+
existing_texts: list[str] = []
|
|
140
|
+
existing_ids: list[str] = []
|
|
141
|
+
for f in existing_facts:
|
|
142
|
+
val = f.get("value", "")
|
|
143
|
+
eid = f.get("entityId", f.get("entity_id", ""))
|
|
144
|
+
if val and eid:
|
|
145
|
+
existing_texts.append(val)
|
|
146
|
+
existing_ids.append(eid)
|
|
147
|
+
|
|
148
|
+
# Separate assert ops for batch dedup
|
|
149
|
+
assert_ops = [(i, op) for i, op in enumerate(ops) if op.get("op") == "assert"]
|
|
150
|
+
non_assert_ops = [(i, op) for i, op in enumerate(ops) if op.get("op") != "assert"]
|
|
151
|
+
|
|
152
|
+
# Batch embedding dedup: single HTTP call for all new facts
|
|
153
|
+
dedup_map: dict[int, int] = {} # assert_index → existing_index
|
|
154
|
+
if assert_ops and existing_texts:
|
|
155
|
+
try:
|
|
156
|
+
from embed_client import find_duplicates_batch
|
|
157
|
+
new_values = [op.get("value", "") for _, op in assert_ops]
|
|
158
|
+
dedup_map = find_duplicates_batch(new_values, existing_texts)
|
|
159
|
+
if dedup_map:
|
|
160
|
+
print(f" [dedup] found {len(dedup_map)} semantic duplicates in batch", file=sys.stderr)
|
|
161
|
+
except Exception:
|
|
162
|
+
pass # embedding unavailable, fall through to exact matching
|
|
136
163
|
|
|
137
164
|
result = []
|
|
138
|
-
|
|
165
|
+
seen_fact_ids: set[str] = set()
|
|
166
|
+
seen_values_set: set[str] = set()
|
|
167
|
+
|
|
168
|
+
# Re-merge in original order
|
|
169
|
+
all_indexed = non_assert_ops + assert_ops
|
|
170
|
+
all_indexed.sort(key=lambda x: x[0])
|
|
171
|
+
|
|
172
|
+
for orig_idx, op in all_indexed:
|
|
139
173
|
if op.get("op") != "assert":
|
|
140
174
|
result.append(op)
|
|
141
175
|
continue
|
|
142
176
|
|
|
143
177
|
entity = op.get("entity", "")
|
|
144
|
-
|
|
178
|
+
attribute = op.get("attribute", "")
|
|
179
|
+
value = op.get("value", "")
|
|
180
|
+
fact_id = _fact_id(entity, attribute, value)
|
|
181
|
+
|
|
182
|
+
# Exact hash match
|
|
183
|
+
if fact_id in existing_id_set or fact_id in seen_fact_ids:
|
|
184
|
+
if fact_id in existing_id_set:
|
|
185
|
+
result.append({"op": "reinforce", "entityId": fact_id})
|
|
186
|
+
print(f" [dedup] exact → reinforce '{fact_id}'", file=sys.stderr)
|
|
187
|
+
continue
|
|
145
188
|
|
|
146
|
-
# Check
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
if len(normalized) >= 4 and (normalized in existing_norm or existing_norm in normalized):
|
|
154
|
-
matched_id = existing_eid
|
|
155
|
-
break
|
|
189
|
+
# Check batch embedding dedup results
|
|
190
|
+
assert_idx = [i for i, (oi, _) in enumerate(assert_ops) if oi == orig_idx]
|
|
191
|
+
if assert_idx and assert_idx[0] in dedup_map:
|
|
192
|
+
dup_existing_idx = dedup_map[assert_idx[0]]
|
|
193
|
+
result.append({"op": "reinforce", "entityId": existing_ids[dup_existing_idx]})
|
|
194
|
+
print(f" [dedup] semantic → reinforce '{existing_ids[dup_existing_idx]}'", file=sys.stderr)
|
|
195
|
+
continue
|
|
156
196
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
canonical_map[normalized] = _fact_id(entity, op.get("attribute", ""), op.get("value", ""))
|
|
197
|
+
# Intra-batch dedup (by value text)
|
|
198
|
+
if value in seen_values_set:
|
|
199
|
+
continue
|
|
200
|
+
|
|
201
|
+
result.append(op)
|
|
202
|
+
seen_fact_ids.add(fact_id)
|
|
203
|
+
seen_values_set.add(value)
|
|
165
204
|
|
|
166
205
|
return result
|
|
167
206
|
|
|
@@ -179,7 +218,14 @@ def _load_graph_facts(db_path: str, entities: list[str] | None = None, limit: in
|
|
|
179
218
|
if entities:
|
|
180
219
|
# Tag-based search: find facts whose tags match any of the keywords
|
|
181
220
|
# Normalize keywords to lowercase for tag matching
|
|
182
|
-
|
|
221
|
+
# Handle both old-style string entities and new-style dict entities
|
|
222
|
+
keywords = []
|
|
223
|
+
for e in entities:
|
|
224
|
+
if isinstance(e, dict):
|
|
225
|
+
keywords.append(e.get("name", "").lower().replace(" ", "-"))
|
|
226
|
+
else:
|
|
227
|
+
keywords.append(str(e).lower().replace(" ", "-"))
|
|
228
|
+
keywords = [k for k in keywords if k]
|
|
183
229
|
placeholders = ",".join(["?" for _ in keywords])
|
|
184
230
|
rows = store._conn.execute(
|
|
185
231
|
f"""SELECT entity_id, COUNT(*) as matches
|
|
@@ -221,8 +267,156 @@ def _load_graph_facts(db_path: str, entities: list[str] | None = None, limit: in
|
|
|
221
267
|
return []
|
|
222
268
|
|
|
223
269
|
|
|
224
|
-
def
|
|
225
|
-
"""
|
|
270
|
+
def _consolidate_entity_facts(db_path: str, min_facts: int = 3) -> int:
|
|
271
|
+
"""Merge multiple facts about the same entity into consolidated facts.
|
|
272
|
+
|
|
273
|
+
Pure code — no LLM. Concatenates fact values with "; " separator.
|
|
274
|
+
Runs at shutdown only (not incremental passes).
|
|
275
|
+
"""
|
|
276
|
+
try:
|
|
277
|
+
from triplestore import TripleStore
|
|
278
|
+
store = TripleStore(db_path)
|
|
279
|
+
|
|
280
|
+
# Group facts by entity name
|
|
281
|
+
entity_facts: dict[str, list[tuple[str, str]]] = {} # entity → [(fact_id, value)]
|
|
282
|
+
for r in store.entities_with_attr("entity"):
|
|
283
|
+
fact_id, entity_name = r[0], r[1]
|
|
284
|
+
if not fact_id.startswith("fact:") or isinstance(entity_name, list):
|
|
285
|
+
continue
|
|
286
|
+
attrs = store.entity(fact_id)
|
|
287
|
+
if attrs and "value" in attrs:
|
|
288
|
+
val = attrs["value"][0] if isinstance(attrs["value"], list) else str(attrs["value"])
|
|
289
|
+
entity_facts.setdefault(entity_name, []).append((fact_id, val))
|
|
290
|
+
|
|
291
|
+
consolidated = 0
|
|
292
|
+
for entity_name, facts in entity_facts.items():
|
|
293
|
+
if len(facts) < min_facts:
|
|
294
|
+
continue
|
|
295
|
+
|
|
296
|
+
# Check if a consolidated fact already exists
|
|
297
|
+
if any(";" in val and len(val) > 100 for _, val in facts):
|
|
298
|
+
continue # already consolidated
|
|
299
|
+
|
|
300
|
+
# Deduplicate values (same fact stated differently)
|
|
301
|
+
seen_values: list[str] = []
|
|
302
|
+
for _, val in facts:
|
|
303
|
+
# Skip if very similar to an already-seen value
|
|
304
|
+
if not any(len(set(val.lower().split()) & set(sv.lower().split())) / max(len(val.split()), 1) > 0.7 for sv in seen_values):
|
|
305
|
+
seen_values.append(val)
|
|
306
|
+
|
|
307
|
+
if len(seen_values) < 2:
|
|
308
|
+
continue # nothing to consolidate after dedup
|
|
309
|
+
|
|
310
|
+
merged_value = "; ".join(seen_values)
|
|
311
|
+
if len(merged_value) > 500:
|
|
312
|
+
merged_value = merged_value[:500] + "..."
|
|
313
|
+
|
|
314
|
+
# Create consolidated fact, retract originals
|
|
315
|
+
tx = store.begin_tx("consolidation")
|
|
316
|
+
new_eid = _fact_id(entity_name, "consolidated", merged_value)
|
|
317
|
+
store.assert_triple(tx, new_eid, "entity", entity_name)
|
|
318
|
+
store.assert_triple(tx, new_eid, "attribute", "consolidated")
|
|
319
|
+
store.assert_triple(tx, new_eid, "value", merged_value)
|
|
320
|
+
store.assert_triple(tx, new_eid, "confidence", "0.95")
|
|
321
|
+
store.assert_triple(tx, new_eid, "first_seen", _now_iso())
|
|
322
|
+
store.assert_triple(tx, new_eid, "reinforce_count", str(len(facts)))
|
|
323
|
+
for tag in _extract_tags(merged_value):
|
|
324
|
+
store.assert_triple(tx, new_eid, "tag", tag)
|
|
325
|
+
|
|
326
|
+
# Retract original individual facts
|
|
327
|
+
for old_eid, _ in facts:
|
|
328
|
+
for attr_name in list(store.entity(old_eid).keys()):
|
|
329
|
+
store.retract_triple(tx, old_eid, attr_name)
|
|
330
|
+
|
|
331
|
+
consolidated += 1
|
|
332
|
+
print(f" [consolidate] {entity_name}: {len(facts)} facts → 1 ({len(merged_value)} chars)", file=sys.stderr)
|
|
333
|
+
|
|
334
|
+
store.close()
|
|
335
|
+
return consolidated
|
|
336
|
+
except Exception as e:
|
|
337
|
+
print(f" [consolidate] failed: {e}", file=sys.stderr)
|
|
338
|
+
return 0
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def _now_iso() -> str:
|
|
342
|
+
from datetime import datetime, timezone
|
|
343
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _extract_entity_from_fact(fact_text: str, known_entities: list) -> str:
|
|
347
|
+
"""Extract the most relevant entity name from a fact sentence.
|
|
348
|
+
|
|
349
|
+
Matches against known entities from the distiller output.
|
|
350
|
+
Falls back to first capitalized multi-word phrase.
|
|
351
|
+
"""
|
|
352
|
+
fact_lower = fact_text.lower()
|
|
353
|
+
# Check which known entities appear in the fact text (longest match first)
|
|
354
|
+
candidates = []
|
|
355
|
+
for ent in known_entities:
|
|
356
|
+
ename = ent if isinstance(ent, str) else ent.get("name", "")
|
|
357
|
+
if ename and ename.lower().replace("-", " ") in fact_lower.replace("-", " "):
|
|
358
|
+
candidates.append(ename)
|
|
359
|
+
if candidates:
|
|
360
|
+
# Return the longest matching entity (most specific)
|
|
361
|
+
return _normalize_entity(max(candidates, key=len))
|
|
362
|
+
|
|
363
|
+
# Fallback: first capitalized multi-word phrase
|
|
364
|
+
import re as _re
|
|
365
|
+
match = _re.search(r"[A-Z][a-z]+(?: [A-Z][a-z]+)+", fact_text)
|
|
366
|
+
if match:
|
|
367
|
+
return _normalize_entity(match.group())
|
|
368
|
+
|
|
369
|
+
# Last resort: first significant word
|
|
370
|
+
words = [w for w in fact_text.split() if len(w) > 3 and w[0].isupper()]
|
|
371
|
+
if words:
|
|
372
|
+
return _normalize_entity(words[0])
|
|
373
|
+
|
|
374
|
+
return "general"
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _facts_to_graph_ops(digest: dict) -> list[dict]:
|
|
378
|
+
"""Convert distiller facts/entities/decisions directly to graph ops.
|
|
379
|
+
|
|
380
|
+
DETERMINISTIC — no LLM needed. The distiller already extracted structured
|
|
381
|
+
facts with entity names. This function mechanically converts them to
|
|
382
|
+
assert operations for the triplestore.
|
|
383
|
+
"""
|
|
384
|
+
ops = []
|
|
385
|
+
known_entities = digest.get("entities", [])
|
|
386
|
+
|
|
387
|
+
# Each fact becomes an assert op
|
|
388
|
+
for fact_text in digest.get("facts", []):
|
|
389
|
+
if not fact_text or len(fact_text) < 5:
|
|
390
|
+
continue
|
|
391
|
+
entity = _extract_entity_from_fact(fact_text, known_entities)
|
|
392
|
+
ops.append({
|
|
393
|
+
"op": "assert",
|
|
394
|
+
"entity": entity,
|
|
395
|
+
"attribute": "fact",
|
|
396
|
+
"value": fact_text,
|
|
397
|
+
"confidence": 0.9,
|
|
398
|
+
"domain": "",
|
|
399
|
+
})
|
|
400
|
+
|
|
401
|
+
# Each decision becomes an assert with lower confidence (time-bound)
|
|
402
|
+
for decision_text in digest.get("decisions", []):
|
|
403
|
+
if not decision_text or len(decision_text) < 5:
|
|
404
|
+
continue
|
|
405
|
+
entity = _extract_entity_from_fact(decision_text, known_entities)
|
|
406
|
+
ops.append({
|
|
407
|
+
"op": "assert",
|
|
408
|
+
"entity": entity,
|
|
409
|
+
"attribute": "decision",
|
|
410
|
+
"value": decision_text,
|
|
411
|
+
"confidence": 0.7,
|
|
412
|
+
"domain": "",
|
|
413
|
+
})
|
|
414
|
+
|
|
415
|
+
return ops
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_entities: list | None = None) -> dict:
|
|
419
|
+
"""Execute graph operations + build entity graph with ref edges."""
|
|
226
420
|
if not ops:
|
|
227
421
|
return {"asserted": 0, "reinforced": 0, "retracted": 0}
|
|
228
422
|
|
|
@@ -230,9 +424,18 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
|
|
|
230
424
|
from triplestore import TripleStore
|
|
231
425
|
store = TripleStore(db_path)
|
|
232
426
|
|
|
233
|
-
#
|
|
427
|
+
# Deduplicate via embedding similarity (Mem0 pattern)
|
|
234
428
|
existing_ids = [r[0] for r in store.entities_with_attr("entity")]
|
|
235
|
-
|
|
429
|
+
# Load existing fact values for semantic comparison
|
|
430
|
+
existing_facts_for_dedup = []
|
|
431
|
+
for eid in existing_ids:
|
|
432
|
+
attrs = store.entity(eid)
|
|
433
|
+
if attrs and "value" in attrs:
|
|
434
|
+
vals = attrs["value"]
|
|
435
|
+
val = vals[0] if isinstance(vals, list) and vals else str(vals) if vals else ""
|
|
436
|
+
if val:
|
|
437
|
+
existing_facts_for_dedup.append({"entity_id": eid, "value": val})
|
|
438
|
+
ops = _canonicalize_ops(ops, existing_ids, existing_facts_for_dedup)
|
|
236
439
|
|
|
237
440
|
stats = {"asserted": 0, "reinforced": 0, "retracted": 0}
|
|
238
441
|
|
|
@@ -322,10 +525,78 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
|
|
|
322
525
|
store.retract_triple(tx, entity_id, attr_name, val)
|
|
323
526
|
stats["retracted"] += 1
|
|
324
527
|
|
|
528
|
+
# --- Build entity graph layer (two-layer model) ---
|
|
529
|
+
if digest_entities and stats["asserted"] > 0:
|
|
530
|
+
try:
|
|
531
|
+
# Create entity:* nodes from digest entities
|
|
532
|
+
for ent in (digest_entities or []):
|
|
533
|
+
if isinstance(ent, dict):
|
|
534
|
+
ename = _normalize_entity(ent.get("name", ""))
|
|
535
|
+
etype = ent.get("type", "unknown")
|
|
536
|
+
else:
|
|
537
|
+
ename = _normalize_entity(str(ent))
|
|
538
|
+
etype = "unknown"
|
|
539
|
+
if not ename or len(ename) < 2:
|
|
540
|
+
continue
|
|
541
|
+
|
|
542
|
+
entity_node_id = f"entity:{ename}"
|
|
543
|
+
existing = store.entity(entity_node_id)
|
|
544
|
+
if not existing:
|
|
545
|
+
tx = store.begin_tx("entity_graph")
|
|
546
|
+
store.assert_triple(tx, entity_node_id, "name", ename)
|
|
547
|
+
store.assert_triple(tx, entity_node_id, "type", etype)
|
|
548
|
+
|
|
549
|
+
# Link facts to their entity nodes via "about" ref edges
|
|
550
|
+
for op_data in ops:
|
|
551
|
+
if op_data.get("op") != "assert":
|
|
552
|
+
continue
|
|
553
|
+
entity = op_data.get("entity", "")
|
|
554
|
+
value = op_data.get("value", "")
|
|
555
|
+
attribute = op_data.get("attribute", "")
|
|
556
|
+
fact_eid = _fact_id(entity, attribute, value)
|
|
557
|
+
entity_node_id = f"entity:{_normalize_entity(entity)}"
|
|
558
|
+
# Only link if entity node exists
|
|
559
|
+
if store.entity(entity_node_id):
|
|
560
|
+
tx = store.begin_tx("entity_graph")
|
|
561
|
+
store.assert_triple(tx, fact_eid, "about", entity_node_id, value_type="ref")
|
|
562
|
+
|
|
563
|
+
# Infer cross-entity refs from fact content
|
|
564
|
+
all_entity_nodes = {}
|
|
565
|
+
for r in store.entities_with_attr("name"):
|
|
566
|
+
if r[0].startswith("entity:"):
|
|
567
|
+
all_entity_nodes[r[1]] = r[0] # {name: entity_id}
|
|
568
|
+
|
|
569
|
+
ref_count = 0
|
|
570
|
+
for fact_eid_row in store.entities_with_attr("value"):
|
|
571
|
+
fact_eid = fact_eid_row[0]
|
|
572
|
+
if not fact_eid.startswith("fact:"):
|
|
573
|
+
continue
|
|
574
|
+
attrs = store.entity(fact_eid)
|
|
575
|
+
source_entity = (attrs.get("entity", [""])[0] if attrs.get("entity") else "").lower()
|
|
576
|
+
value_lower = (attrs["value"][0] if attrs.get("value") else "").lower()
|
|
577
|
+
|
|
578
|
+
for ename, enode_id in all_entity_nodes.items():
|
|
579
|
+
if ename == source_entity or len(ename) < 4:
|
|
580
|
+
continue
|
|
581
|
+
if ename in value_lower:
|
|
582
|
+
existing_refs = store.backrefs(enode_id, attribute="mentions")
|
|
583
|
+
if not any(r[0] == fact_eid for r in existing_refs):
|
|
584
|
+
tx = store.begin_tx("ref_inference")
|
|
585
|
+
store.assert_triple(tx, fact_eid, "mentions", enode_id, value_type="ref")
|
|
586
|
+
ref_count += 1
|
|
587
|
+
|
|
588
|
+
if ref_count:
|
|
589
|
+
stats["refs_created"] = ref_count
|
|
590
|
+
print(f" [graph] {len(all_entity_nodes)} entity nodes, {ref_count} ref edges", file=sys.stderr)
|
|
591
|
+
except Exception as e:
|
|
592
|
+
print(f" [graph] entity graph failed (non-fatal): {e}", file=sys.stderr)
|
|
593
|
+
|
|
325
594
|
store.close()
|
|
326
595
|
return stats
|
|
327
596
|
except Exception as e:
|
|
597
|
+
import traceback
|
|
328
598
|
print(f"[warn] Failed to execute graph ops: {e}", file=sys.stderr)
|
|
599
|
+
traceback.print_exc(file=sys.stderr)
|
|
329
600
|
return {"asserted": 0, "reinforced": 0, "retracted": 0, "error": str(e)}
|
|
330
601
|
|
|
331
602
|
|
|
@@ -506,39 +777,61 @@ def main() -> None:
|
|
|
506
777
|
facts_lines.append(f"- [{eid}] ({domain}, confidence={conf}) {val}")
|
|
507
778
|
facts_text = f"\n\n## Existing Graph Facts (for reference — reinforce or retract as needed)\n" + "\n".join(facts_lines)
|
|
508
779
|
|
|
509
|
-
|
|
510
|
-
|
|
780
|
+
# ── Step 1: DETERMINISTIC graph ops from distiller output (no LLM needed) ──
|
|
781
|
+
# The distiller already extracted structured facts — conversion is mechanical.
|
|
782
|
+
graph_ops = _facts_to_graph_ops(digest)
|
|
783
|
+
digest_ts = digest.get("ts", datetime.now(timezone.utc).isoformat())
|
|
511
784
|
|
|
512
|
-
|
|
513
|
-
|
|
785
|
+
# Dedup + execute
|
|
786
|
+
graph_stats = _execute_graph_ops(db_path, graph_ops, digest_ts, digest_entities=digest_entities)
|
|
514
787
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
user_prompt,
|
|
519
|
-
script="knowledge_integrator",
|
|
520
|
-
json_mode=True,
|
|
521
|
-
)
|
|
522
|
-
result = extract_json(raw)
|
|
523
|
-
except (ValueError, LLMError) as e:
|
|
524
|
-
print(f"LLM integration failed: {e}", file=sys.stderr)
|
|
525
|
-
output_json({"error": str(e)})
|
|
526
|
-
return
|
|
788
|
+
# NOTE: Consolidation (merging entity facts) and summaries both HURT retrieval
|
|
789
|
+
# at our scale (<200 facts). Individual facts are more retrievable than merged ones.
|
|
790
|
+
# Keep facts separate — dedup handles true duplicates, different facts stay distinct.
|
|
527
791
|
|
|
528
|
-
#
|
|
792
|
+
# ── Step 2: Automated playbook curation (tag overlap, no LLM) ──
|
|
529
793
|
archive_path = _archive_playbook(memory_dir)
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
794
|
+
active_tags = set()
|
|
795
|
+
for op in graph_ops:
|
|
796
|
+
active_tags.update(_extract_tags(op.get("value", "")))
|
|
797
|
+
|
|
798
|
+
playbook_lines = [l for l in body.splitlines() if l.strip() and not l.startswith("<!--")]
|
|
799
|
+
changes: dict[str, list[str]] = {"added": [], "pruned": [], "promoted": [], "reinforced": []}
|
|
800
|
+
|
|
801
|
+
# Reinforce playbook lines whose tags overlap with this session
|
|
802
|
+
updated_lines = []
|
|
803
|
+
for line in playbook_lines:
|
|
804
|
+
line_tags = set(_extract_tags(line))
|
|
805
|
+
if line_tags & active_tags:
|
|
806
|
+
# Increment seen count: "... (seen 3)" → "... (seen 4)"
|
|
807
|
+
import re as _re
|
|
808
|
+
seen_match = _re.search(r"\(seen (\d+)\)", line)
|
|
809
|
+
if seen_match:
|
|
810
|
+
old_count = int(seen_match.group(1))
|
|
811
|
+
line = line[:seen_match.start()] + f"(seen {old_count + 1})" + line[seen_match.end():]
|
|
812
|
+
changes["reinforced"].append(line.strip()[:60])
|
|
813
|
+
updated_lines.append(line)
|
|
814
|
+
else:
|
|
815
|
+
updated_lines.append(line)
|
|
816
|
+
|
|
817
|
+
# Add novel facts as new playbook lines (no LLM — just format as bullet points)
|
|
818
|
+
for fact in digest.get("facts", [])[:5]: # cap at 5 new lines per pass
|
|
819
|
+
fact_tags = set(_extract_tags(fact))
|
|
820
|
+
# Only add if no existing playbook line covers this
|
|
821
|
+
if not any(set(_extract_tags(l)) & fact_tags for l in playbook_lines if len(fact_tags) > 1):
|
|
822
|
+
new_line = f"- {fact} (seen 1)"
|
|
823
|
+
updated_lines.append(new_line)
|
|
824
|
+
changes["added"].append(fact[:60])
|
|
825
|
+
|
|
826
|
+
# Keep playbook under 50 lines
|
|
827
|
+
if len(updated_lines) > 50:
|
|
828
|
+
updated_lines = updated_lines[:50]
|
|
829
|
+
|
|
830
|
+
updated_body = "\n".join(updated_lines)
|
|
533
831
|
new_playbook = f"{header}\n\n{updated_body}\n\n{footer}".strip() + "\n"
|
|
534
832
|
playbook_path = Path(memory_dir) / "sinain-playbook.md"
|
|
535
833
|
playbook_path.write_text(new_playbook, encoding="utf-8")
|
|
536
834
|
|
|
537
|
-
# Execute graph operations
|
|
538
|
-
graph_ops = result.get("graphOps", [])
|
|
539
|
-
digest_ts = digest.get("ts", datetime.now(timezone.utc).isoformat())
|
|
540
|
-
graph_stats = _execute_graph_ops(db_path, graph_ops, digest_ts)
|
|
541
|
-
|
|
542
835
|
# Append digest to session-digests.jsonl
|
|
543
836
|
digests_path = Path(memory_dir) / "session-digests.jsonl"
|
|
544
837
|
with open(digests_path, "a", encoding="utf-8") as f:
|
|
@@ -548,7 +841,7 @@ def main() -> None:
|
|
|
548
841
|
log_entry = {
|
|
549
842
|
"ts": datetime.now(timezone.utc).isoformat(),
|
|
550
843
|
"_type": "integration",
|
|
551
|
-
"changes":
|
|
844
|
+
"changes": changes,
|
|
552
845
|
"graphStats": graph_stats,
|
|
553
846
|
"digestEntities": digest_entities,
|
|
554
847
|
"archivePath": archive_path,
|
|
@@ -563,7 +856,7 @@ def main() -> None:
|
|
|
563
856
|
|
|
564
857
|
output_json({
|
|
565
858
|
"status": "ok",
|
|
566
|
-
"changes":
|
|
859
|
+
"changes": changes,
|
|
567
860
|
"graphStats": graph_stats,
|
|
568
861
|
"playbookLines": len(new_playbook.splitlines()),
|
|
569
862
|
})
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"eval_reporter": { "model": "smart", "maxTokens": 1000 },
|
|
15
15
|
"triple_extractor": { "model": "fast", "maxTokens": 1500, "timeout": 30 },
|
|
16
16
|
"session_distiller": { "model": "smart", "maxTokens": 1500, "timeout": 30 },
|
|
17
|
-
"knowledge_integrator": { "model": "smart", "maxTokens":
|
|
17
|
+
"knowledge_integrator": { "model": "smart", "maxTokens": 4000, "timeout": 60 }
|
|
18
18
|
},
|
|
19
19
|
"defaults": { "model": "fast", "maxTokens": 1500 },
|
|
20
20
|
"triplestore": {
|
|
@@ -28,7 +28,7 @@ from common import (
|
|
|
28
28
|
|
|
29
29
|
SYSTEM_PROMPT = """\
|
|
30
30
|
You are a session distiller for a personal AI overlay system (sinain).
|
|
31
|
-
Your job: analyze a session transcript and extract
|
|
31
|
+
Your job: analyze a session transcript and extract ALL knowledge worth remembering.
|
|
32
32
|
|
|
33
33
|
The transcript contains feed items from sinain-core:
|
|
34
34
|
- audio: transcribed speech from the user's environment
|
|
@@ -37,24 +37,42 @@ The transcript contains feed items from sinain-core:
|
|
|
37
37
|
- system: system events and status messages
|
|
38
38
|
|
|
39
39
|
Extract:
|
|
40
|
-
1. whatHappened: 2-3 sentences summarizing what
|
|
41
|
-
2.
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
40
|
+
1. whatHappened: 2-3 sentences summarizing what occurred in this session
|
|
41
|
+
2. facts: up to 15 concrete factual claims. Each must be a self-contained sentence. \
|
|
42
|
+
IMPORTANT — spread across these dimensions (do not let one theme dominate):
|
|
43
|
+
- WHO: people mentioned, their roles, backgrounds, relationships to each other
|
|
44
|
+
- WHAT: specific claims, properties, descriptions of things discussed
|
|
45
|
+
- HOW MUCH: any numbers, quantities, dates, durations, counts stated
|
|
46
|
+
- WHAT CHANGED: decisions made, agreements reached, state changes
|
|
47
|
+
- WHAT'S NEXT: commitments, action items, plans, deadlines
|
|
48
|
+
If you have 5+ facts about one dimension and 0 about another that was discussed, \
|
|
49
|
+
you are missing something. Breadth over depth.
|
|
50
|
+
Good: "The CTO of Al-Futaim previously worked at Citibank for 17 years as Director of IT in Singapore"
|
|
51
|
+
Good: "Citibank has 2400 IntelliJ subscriptions and heavy TeamCity usage"
|
|
52
|
+
Good: "The meeting is 45 minutes, scheduled for Tuesday"
|
|
53
|
+
Bad: "client-understanding-key: True"
|
|
54
|
+
Bad: five variations of "Al-Futaim is moving to the cloud"
|
|
55
|
+
3. decisions: up to 5 decisions or agreements made (who decided what, with any deadline)
|
|
56
|
+
4. entities: named things discussed or interacted with — as objects with name \
|
|
57
|
+
(lowercase-hyphenated slug) and type (freeform — person, org, tool, file, concept, \
|
|
58
|
+
service, framework, error, whatever fits the context).
|
|
59
|
+
Examples: {"name": "citibank", "type": "org"}, {"name": "auth-module", "type": "file"}, \
|
|
60
|
+
{"name": "react-native", "type": "framework"}
|
|
61
|
+
5. patterns: up to 3 reusable techniques or workflows (if any — skip if none)
|
|
62
|
+
6. preferences: up to 3 user preferences or habits observed
|
|
63
|
+
|
|
64
|
+
If existing entities are provided, reference them by name to enable reinforcement.
|
|
65
|
+
Focus on CONCRETE, SPECIFIC knowledge. Skip vague observations.
|
|
66
|
+
If the session was idle or empty, say so briefly.
|
|
49
67
|
|
|
50
68
|
Respond with ONLY a JSON object:
|
|
51
69
|
{
|
|
52
70
|
"whatHappened": "string",
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
71
|
+
"facts": ["self-contained factual sentence", ...],
|
|
72
|
+
"decisions": ["decision sentence with who/what/when", ...],
|
|
73
|
+
"entities": [{"name": "citibank", "type": "org"}, {"name": "artom", "type": "person"}, ...],
|
|
74
|
+
"patterns": ["reusable technique or workflow", ...],
|
|
75
|
+
"preferences": ["user preference or habit", ...],
|
|
58
76
|
"isEmpty": false
|
|
59
77
|
}"""
|
|
60
78
|
|
|
@@ -95,6 +113,7 @@ def main() -> None:
|
|
|
95
113
|
parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
|
|
96
114
|
parser.add_argument("--transcript", required=True, help="JSON array of feed items")
|
|
97
115
|
parser.add_argument("--session-meta", default="{}", help="JSON session metadata")
|
|
116
|
+
parser.add_argument("--existing-entities", default="", help="Compact summary of existing knowledge graph entities")
|
|
98
117
|
args = parser.parse_args()
|
|
99
118
|
|
|
100
119
|
# Parse inputs
|
|
@@ -111,11 +130,11 @@ def main() -> None:
|
|
|
111
130
|
if not items or len(items) < 2:
|
|
112
131
|
output_json({
|
|
113
132
|
"whatHappened": "Empty or trivial session",
|
|
133
|
+
"facts": [],
|
|
134
|
+
"decisions": [],
|
|
135
|
+
"entities": [],
|
|
114
136
|
"patterns": [],
|
|
115
|
-
"antiPatterns": [],
|
|
116
137
|
"preferences": [],
|
|
117
|
-
"entities": [],
|
|
118
|
-
"toolInsights": [],
|
|
119
138
|
"isEmpty": True,
|
|
120
139
|
})
|
|
121
140
|
return
|
|
@@ -130,11 +149,16 @@ def main() -> None:
|
|
|
130
149
|
lines = [l for l in playbook.splitlines() if l.strip() and not l.startswith("<!--")]
|
|
131
150
|
playbook_summary = f"\n\n## Current Playbook (for reference — don't repeat known patterns)\n{chr(10).join(lines[:30])}"
|
|
132
151
|
|
|
152
|
+
# Include existing entities for retrieve-before-extract (Mem0 pattern)
|
|
153
|
+
existing_section = ""
|
|
154
|
+
if args.existing_entities and args.existing_entities.strip():
|
|
155
|
+
existing_section = f"\n\n## Existing Knowledge (reinforce or update these if the session confirms/changes them)\n{args.existing_entities}"
|
|
156
|
+
|
|
133
157
|
user_prompt = f"""## Session Transcript ({len(items)} items)
|
|
134
158
|
{transcript_text}
|
|
135
159
|
|
|
136
160
|
## Session Metadata
|
|
137
|
-
{json.dumps(meta, indent=2)}{playbook_summary}"""
|
|
161
|
+
{json.dumps(meta, indent=2)}{playbook_summary}{existing_section}"""
|
|
138
162
|
|
|
139
163
|
try:
|
|
140
164
|
raw = call_llm_with_fallback(
|