@geravant/sinain 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,519 @@
1
+ #!/usr/bin/env python3
2
+ """Knowledge Integrator — update playbook + knowledge graph from a SessionDigest.
3
+
4
+ Takes a session digest (from session_distiller.py), the current playbook, and
5
+ the knowledge graph, then produces:
6
+ 1. Updated playbook (working memory)
7
+ 2. Graph operations (long-term memory: assert/reinforce/retract facts)
8
+
9
+ Single LLM call, ~15s. Replaces: playbook_curator + feedback_analyzer +
10
+ triple_extractor + triple_ingest.
11
+
12
+ Usage:
13
+ python3 knowledge_integrator.py --memory-dir memory/ \
14
+ --digest '{"whatHappened":"...","patterns":[...]}' \
15
+ [--bootstrap] # one-time: seed graph from current playbook
16
+ """
17
+
18
+ import argparse
19
+ import hashlib
20
+ import json
21
+ import re
22
+ import shutil
23
+ import sys
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+
27
+ from common import (
28
+ LLMError,
29
+ call_llm_with_fallback,
30
+ extract_json,
31
+ output_json,
32
+ read_playbook,
33
+ )
34
+
35
+ SYSTEM_PROMPT = """\
36
+ You are a knowledge integrator for a personal AI overlay system (sinain).
37
+ You maintain TWO knowledge stores:
38
+
39
+ 1. PLAYBOOK (working memory, ~50 lines): actively curated patterns, anti-patterns,
40
+ and preferences. Injected into every agent prompt. Must be concise and current.
41
+
42
+ 2. KNOWLEDGE GRAPH (long-term memory): durable facts that survive playbook pruning.
43
+ Stored as entity-attribute-value triples. Facts can be reinforced (seen again),
44
+ retracted (contradicted or outdated), or newly asserted.
45
+
46
+ Given a session digest (what happened), the current playbook, and existing graph facts:
47
+
48
+ FOR THE PLAYBOOK:
49
+ - ADD patterns from the digest that are novel (not already in playbook)
50
+ - REINFORCE existing patterns that the session confirms (increment "seen" count)
51
+ - PRUNE patterns contradicted by session evidence
52
+ - PROMOTE frequently-reinforced patterns (seen 3+) to "established"
53
+ - Keep under 50 lines. Density over completeness.
54
+ - DO NOT modify header/footer comments (<!-- mining-index ... --> and <!-- effectiveness ... -->)
55
+ - Three Laws: (1) don't remove error-prevention patterns, (2) preserve high-scoring approaches, (3) then evolve
56
+
57
+ FOR THE KNOWLEDGE GRAPH:
58
+ - ASSERT new durable facts (error→fix mappings, domain knowledge, user expertise)
59
+ - REINFORCE existing facts confirmed by the session (list their entity_ids)
60
+ - RETRACT facts contradicted by session evidence (list their entity_ids)
61
+ - Each fact needs: entity (domain/tool/workflow), attribute (relationship type), value (the knowledge), confidence (0.0-1.0), domain (for module scoping)
62
+ - Entity naming: use lowercase-hyphenated slugs (e.g., "react-native", "metro-bundler")
63
+ - Only assert DURABLE facts — not ephemeral session details
64
+
65
+ If the session was empty/idle, return minimal changes.
66
+
67
+ Respond with ONLY a JSON object:
68
+ {
69
+ "updatedPlaybook": "full playbook body text (between header and footer comments)",
70
+ "changes": {
71
+ "added": ["pattern text", ...],
72
+ "pruned": ["pattern text", ...],
73
+ "promoted": ["pattern text", ...],
74
+ "reinforced": ["pattern text", ...]
75
+ },
76
+ "graphOps": [
77
+ {"op": "assert", "entity": "entity-slug", "attribute": "attr-name", "value": "fact text", "confidence": 0.8, "domain": "domain-name"},
78
+ {"op": "reinforce", "entityId": "fact:existing-slug"},
79
+ {"op": "retract", "entityId": "fact:existing-slug", "reason": "why"}
80
+ ]
81
+ }"""
82
+
83
+
84
# Common English words excluded from fact tags — too generic to help
# keyword-based graph lookup. "score" and "seen" are playbook scoring
# metadata markers, not domain terms.
_STOPWORDS = frozenset({
    "the", "and", "for", "when", "with", "that", "this", "from", "into",
    "after", "before", "during", "should", "would", "could", "been", "have",
    "will", "also", "then", "than", "not", "but", "are", "was", "were",
    "can", "may", "use", "run", "set", "get", "try", "all", "any", "new",
    "score", "seen",
})
91
+
92
+
93
+ def _extract_tags(value: str) -> list[str]:
94
+ """Extract searchable keyword tags from fact value text.
95
+
96
+ Returns up to 10 deduplicated lowercase tags suitable for AVET-indexed lookup.
97
+ """
98
+ # Lowercase words (including hyphenated compounds like "react-native")
99
+ words = re.findall(r"[a-z][a-z0-9-]+", value.lower())
100
+ tags = [w for w in words if len(w) > 2 and w not in _STOPWORDS]
101
+ # Detect compound terms from CamelCase or "Title Case" patterns
102
+ compounds = re.findall(r"[A-Z][a-z]+ [A-Z][a-z]+", value)
103
+ for c in compounds:
104
+ tags.append(c.lower().replace(" ", "-"))
105
+ # Numeric tokens that look meaningful (error codes, port numbers)
106
+ nums = re.findall(r"\b\d{3,5}\b", value)
107
+ tags.extend(nums)
108
+ # Deduplicate preserving order, cap at 10
109
+ return list(dict.fromkeys(tags))[:10]
110
+
111
+
112
+ def _fact_id(entity: str, attribute: str, value: str) -> str:
113
+ """Generate a deterministic fact entity ID from entity+attribute+value."""
114
+ content = f"{entity}:{attribute}:{value}"
115
+ h = hashlib.sha256(content.encode()).hexdigest()[:12]
116
+ slug = entity.replace(" ", "-").lower()[:30]
117
+ return f"fact:{slug}-{h}"
118
+
119
+
120
+ def _load_graph_facts(db_path: str, entities: list[str] | None = None, limit: int = 50) -> list[dict]:
121
+ """Load relevant facts from the knowledge graph for LLM context."""
122
+ if not Path(db_path).exists():
123
+ return []
124
+
125
+ try:
126
+ from triplestore import TripleStore
127
+ store = TripleStore(db_path)
128
+
129
+ # Get all non-retracted fact entities with their attributes
130
+ if entities:
131
+ # Tag-based search: find facts whose tags match any of the keywords
132
+ # Normalize keywords to lowercase for tag matching
133
+ keywords = [e.lower().replace(" ", "-") for e in entities]
134
+ placeholders = ",".join(["?" for _ in keywords])
135
+ rows = store._conn.execute(
136
+ f"""SELECT entity_id, COUNT(*) as matches
137
+ FROM triples
138
+ WHERE attribute = 'tag' AND NOT retracted
139
+ AND value IN ({placeholders})
140
+ GROUP BY entity_id
141
+ ORDER BY matches DESC
142
+ LIMIT ?""",
143
+ (*keywords, limit),
144
+ ).fetchall()
145
+ fact_ids = [r["entity_id"] for r in rows]
146
+ else:
147
+ # Top-N by confidence
148
+ rows = store._conn.execute(
149
+ """SELECT entity_id, CAST(value AS REAL) as conf
150
+ FROM triples
151
+ WHERE attribute = 'confidence' AND NOT retracted
152
+ AND entity_id LIKE 'fact:%'
153
+ ORDER BY conf DESC
154
+ LIMIT ?""",
155
+ (limit,),
156
+ ).fetchall()
157
+ fact_ids = [r["entity_id"] for r in rows]
158
+
159
+ facts = []
160
+ for fid in fact_ids:
161
+ attrs = store.entity(fid)
162
+ if attrs:
163
+ fact = {"entityId": fid}
164
+ for attr_name, values in attrs.items():
165
+ fact[attr_name] = values[0] if len(values) == 1 else values
166
+ facts.append(fact)
167
+
168
+ store.close()
169
+ return facts
170
+ except Exception as e:
171
+ print(f"[warn] Failed to load graph facts: {e}", file=sys.stderr)
172
+ return []
173
+
174
+
175
+ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str) -> dict:
176
+ """Execute graph operations (assert/reinforce/retract) on the knowledge graph."""
177
+ if not ops:
178
+ return {"asserted": 0, "reinforced": 0, "retracted": 0}
179
+
180
+ try:
181
+ from triplestore import TripleStore
182
+ store = TripleStore(db_path)
183
+ stats = {"asserted": 0, "reinforced": 0, "retracted": 0}
184
+
185
+ for op_data in ops:
186
+ op = op_data.get("op", "")
187
+
188
+ if op == "assert":
189
+ entity = op_data.get("entity", "")
190
+ attribute = op_data.get("attribute", "")
191
+ value = op_data.get("value", "")
192
+ confidence = op_data.get("confidence", 0.7)
193
+ domain = op_data.get("domain", "")
194
+
195
+ if not entity or not attribute or not value:
196
+ continue
197
+
198
+ entity_id = _fact_id(entity, attribute, value)
199
+ tx = store.begin_tx("knowledge_integrator", metadata=json.dumps({"digest_ts": digest_ts}))
200
+ store.assert_triple(tx, entity_id, "entity", entity)
201
+ store.assert_triple(tx, entity_id, "attribute", attribute)
202
+ store.assert_triple(tx, entity_id, "value", value)
203
+ store.assert_triple(tx, entity_id, "confidence", str(confidence))
204
+ store.assert_triple(tx, entity_id, "first_seen", digest_ts)
205
+ store.assert_triple(tx, entity_id, "last_reinforced", digest_ts)
206
+ store.assert_triple(tx, entity_id, "reinforce_count", "1")
207
+ if domain:
208
+ store.assert_triple(tx, entity_id, "domain", domain)
209
+ # Auto-tag for keyword-based discovery
210
+ for tag in _extract_tags(value):
211
+ store.assert_triple(tx, entity_id, "tag", tag)
212
+ stats["asserted"] += 1
213
+
214
+ elif op == "reinforce":
215
+ entity_id = op_data.get("entityId", "")
216
+ if not entity_id:
217
+ continue
218
+
219
+ # Read current confidence and reinforce count
220
+ attrs = store.entity(entity_id)
221
+ if not attrs:
222
+ continue
223
+
224
+ cur_conf = 0.5
225
+ cur_count = 0
226
+ if "confidence" in attrs:
227
+ try:
228
+ cur_conf = float(attrs["confidence"][0])
229
+ except (ValueError, IndexError):
230
+ pass
231
+ if "reinforce_count" in attrs:
232
+ try:
233
+ cur_count = int(attrs["reinforce_count"][0])
234
+ except (ValueError, IndexError):
235
+ pass
236
+
237
+ new_conf = min(1.0, cur_conf + 0.15)
238
+ new_count = cur_count + 1
239
+
240
+ tx = store.begin_tx("knowledge_integrator", metadata=json.dumps({
241
+ "op": "reinforce", "entity_id": entity_id, "digest_ts": digest_ts
242
+ }))
243
+ # Retract old values, assert new
244
+ store.retract_triple(tx, entity_id, "confidence", str(cur_conf))
245
+ store.assert_triple(tx, entity_id, "confidence", str(round(new_conf, 2)))
246
+ store.retract_triple(tx, entity_id, "reinforce_count", str(cur_count))
247
+ store.assert_triple(tx, entity_id, "reinforce_count", str(new_count))
248
+ # Retract old last_reinforced if present
249
+ old_reinforced = attrs.get("last_reinforced", [])
250
+ for val in old_reinforced:
251
+ store.retract_triple(tx, entity_id, "last_reinforced", val)
252
+ store.assert_triple(tx, entity_id, "last_reinforced", digest_ts)
253
+ stats["reinforced"] += 1
254
+
255
+ elif op == "retract":
256
+ entity_id = op_data.get("entityId", "")
257
+ reason = op_data.get("reason", "")
258
+ if not entity_id:
259
+ continue
260
+
261
+ tx = store.begin_tx("knowledge_integrator", metadata=json.dumps({
262
+ "op": "retract", "entity_id": entity_id, "reason": reason, "digest_ts": digest_ts
263
+ }))
264
+ # Retract all attributes of this entity
265
+ attrs = store.entity(entity_id)
266
+ for attr_name, values in attrs.items():
267
+ for val in values:
268
+ store.retract_triple(tx, entity_id, attr_name, val)
269
+ stats["retracted"] += 1
270
+
271
+ store.close()
272
+ return stats
273
+ except Exception as e:
274
+ print(f"[warn] Failed to execute graph ops: {e}", file=sys.stderr)
275
+ return {"asserted": 0, "reinforced": 0, "retracted": 0, "error": str(e)}
276
+
277
+
278
+ def _extract_header_footer(playbook: str) -> tuple[str, str, str]:
279
+ """Split playbook into (header, body, footer)."""
280
+ lines = playbook.splitlines()
281
+ header_lines: list[str] = []
282
+ footer_lines: list[str] = []
283
+ body_lines: list[str] = []
284
+
285
+ in_header = True
286
+ for line in lines:
287
+ stripped = line.strip()
288
+ if in_header and stripped.startswith("<!--"):
289
+ header_lines.append(line)
290
+ continue
291
+ in_header = False
292
+ if stripped.startswith("<!-- effectiveness"):
293
+ footer_lines.append(line)
294
+ else:
295
+ body_lines.append(line)
296
+
297
+ return "\n".join(header_lines), "\n".join(body_lines), "\n".join(footer_lines)
298
+
299
+
300
+ def _archive_playbook(memory_dir: str) -> str | None:
301
+ """Archive current playbook. Returns archive path or None."""
302
+ src = Path(memory_dir) / "sinain-playbook.md"
303
+ if not src.exists():
304
+ return None
305
+
306
+ archive_dir = Path(memory_dir) / "playbook-archive"
307
+ archive_dir.mkdir(parents=True, exist_ok=True)
308
+
309
+ ts = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
310
+ dest = archive_dir / f"sinain-playbook-{ts}.md"
311
+ shutil.copy2(src, dest)
312
+ return str(dest)
313
+
314
+
315
def _bootstrap_graph(memory_dir: str, db_path: str) -> dict:
    """One-time: seed the knowledge graph from current playbook patterns.

    Reads the playbook, takes every bullet line that carries scoring
    metadata ("score"/"seen"), heuristically assigns a domain, and asserts
    each as a fact via _execute_graph_ops.

    Returns:
        {"bootstrapped": <number of facts asserted>}.
    """
    playbook = read_playbook(memory_dir)
    if not playbook:
        return {"bootstrapped": 0}

    # Extract patterns from playbook: bullet lines carrying score/seen
    # metadata, i.e. patterns the curator has already evaluated.
    # (Fix: dropped a redundant local `import re` — re is imported at module top.)
    patterns = []
    for line in playbook.splitlines():
        line = line.strip()
        if line.startswith("- ") and ("score" in line or "seen" in line):
            patterns.append(line[2:])

    if not patterns:
        return {"bootstrapped": 0}

    # Keyword → domain mapping for the heuristic below.
    # (Fix: hoisted out of the loop — it was rebuilt on every iteration.)
    domain_keywords = {
        "react": "react-native", "metro": "react-native", "flutter": "flutter",
        "ocr": "vision", "audio": "audio", "hud": "sinain-hud",
        "docker": "infrastructure", "ssh": "infrastructure", "deploy": "infrastructure",
        "intellij": "intellij", "psi": "intellij", "claude": "ai-agents",
        "gemini": "ai-agents", "openrouter": "ai-agents", "escalation": "sinain-core",
    }

    # Generate assert ops for each pattern
    ops = []
    for pattern in patterns:
        # Reuse the playbook's own score as the fact confidence when present.
        confidence = 0.6
        score_match = re.search(r"score\s*[\d.]+", pattern)
        if score_match:
            try:
                confidence = float(re.search(r"[\d.]+", score_match.group()).group())
            except (ValueError, AttributeError):
                pass

        # Determine domain from pattern text (basic heuristic):
        # first matching keyword wins, default "general".
        domain = "general"
        lower = pattern.lower()
        for kw, dom in domain_keywords.items():
            if kw in lower:
                domain = dom
                break

        ops.append({
            "op": "assert",
            "entity": domain,
            "attribute": "pattern",
            "value": pattern[:200],  # cap fact size; playbook lines can run long
            "confidence": confidence,
            "domain": domain,
        })

    now = datetime.now(timezone.utc).isoformat()
    stats = _execute_graph_ops(db_path, ops, now)
    return {"bootstrapped": stats.get("asserted", 0)}
371
+
372
+
373
def main() -> None:
    """CLI entry point with three modes.

    --bootstrap: seed the knowledge graph from the current playbook.
    --retag: re-extract tags for all existing non-retracted facts.
    default: integrate a --digest — one LLM call that rewrites the
        playbook body and emits graph operations, then persist both.

    All results (including errors) are reported via output_json.
    """
    parser = argparse.ArgumentParser(description="Knowledge Integrator")
    parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
    parser.add_argument("--digest", default=None, help="SessionDigest JSON string")
    parser.add_argument("--bootstrap", action="store_true", help="One-time: seed graph from playbook")
    parser.add_argument("--retag", action="store_true", help="Re-extract tags for all existing facts")
    args = parser.parse_args()

    memory_dir = args.memory_dir
    db_path = str(Path(memory_dir) / "knowledge-graph.db")

    # Bootstrap mode: seed graph from current playbook
    if args.bootstrap:
        result = _bootstrap_graph(memory_dir, db_path)
        output_json(result)
        return

    # Retag mode: extract tags for all existing facts
    if args.retag:
        if not Path(db_path).exists():
            output_json({"error": "knowledge-graph.db not found"})
            return
        from triplestore import TripleStore
        store = TripleStore(db_path)
        # Get all fact entities that have a 'value' attribute
        rows = store._conn.execute(
            "SELECT DISTINCT entity_id FROM triples WHERE attribute = 'value' AND NOT retracted AND entity_id LIKE 'fact:%'"
        ).fetchall()
        tagged = 0
        for row in rows:
            fid = row["entity_id"]
            attrs = store.entity(fid)
            value_text = attrs.get("value", [""])[0] if attrs else ""
            existing_tags = set(attrs.get("tag", [])) if attrs else set()
            new_tags = _extract_tags(value_text)
            # Only assert tags the fact doesn't already carry.
            missing = [t for t in new_tags if t not in existing_tags]
            if missing:
                tx = store.begin_tx("retag", metadata=json.dumps({"entity_id": fid}))
                for tag in missing:
                    store.assert_triple(tx, fid, "tag", tag)
                tagged += 1
        store.close()
        output_json({"retagged": tagged, "total_facts": len(rows)})
        return

    # Normal mode: integrate session digest
    if not args.digest:
        print("--digest is required (unless --bootstrap or --retag)", file=sys.stderr)
        output_json({"error": "--digest required"})
        return

    try:
        digest = json.loads(args.digest)
    except json.JSONDecodeError as e:
        output_json({"error": f"Invalid digest JSON: {e}"})
        return

    # Skip if digest indicates empty session
    if digest.get("isEmpty", False):
        output_json({"skipped": True, "reason": "empty session"})
        return

    # Read current playbook
    playbook = read_playbook(memory_dir)
    header, body, footer = _extract_header_footer(playbook)

    # Load relevant graph facts for LLM context
    digest_entities = digest.get("entities", [])
    existing_facts = _load_graph_facts(db_path, entities=digest_entities if digest_entities else None)

    # Build user prompt (facts capped at 30 to bound prompt size)
    facts_text = ""
    if existing_facts:
        facts_lines = []
        for f in existing_facts[:30]:
            eid = f.get("entityId", "?")
            val = f.get("value", "")
            conf = f.get("confidence", "?")
            domain = f.get("domain", "?")
            facts_lines.append(f"- [{eid}] ({domain}, confidence={conf}) {val}")
        facts_text = f"\n\n## Existing Graph Facts (for reference — reinforce or retract as needed)\n" + "\n".join(facts_lines)

    user_prompt = f"""## Session Digest
{json.dumps(digest, indent=2, ensure_ascii=False)}

## Current Playbook Body
{body}{facts_text}"""

    try:
        raw = call_llm_with_fallback(
            SYSTEM_PROMPT,
            user_prompt,
            script="knowledge_integrator",
            json_mode=True,
        )
        result = extract_json(raw)
    except (ValueError, LLMError) as e:
        print(f"LLM integration failed: {e}", file=sys.stderr)
        output_json({"error": str(e)})
        return

    # Archive current playbook before mutation
    archive_path = _archive_playbook(memory_dir)

    # Write updated playbook; header/footer are preserved verbatim,
    # only the body comes from the LLM (falling back to the old body).
    updated_body = result.get("updatedPlaybook", body)
    new_playbook = f"{header}\n\n{updated_body}\n\n{footer}".strip() + "\n"
    playbook_path = Path(memory_dir) / "sinain-playbook.md"
    playbook_path.write_text(new_playbook, encoding="utf-8")

    # Execute graph operations
    graph_ops = result.get("graphOps", [])
    digest_ts = digest.get("ts", datetime.now(timezone.utc).isoformat())
    graph_stats = _execute_graph_ops(db_path, graph_ops, digest_ts)

    # Append digest to session-digests.jsonl
    digests_path = Path(memory_dir) / "session-digests.jsonl"
    with open(digests_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(digest, ensure_ascii=False) + "\n")

    # Write integration log (one JSONL file per UTC day)
    log_entry = {
        "ts": datetime.now(timezone.utc).isoformat(),
        "_type": "integration",
        "changes": result.get("changes", {}),
        "graphStats": graph_stats,
        "digestEntities": digest_entities,
        "archivePath": archive_path,
        "playbookLines": len(new_playbook.splitlines()),
    }
    log_dir = Path(memory_dir) / "playbook-logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    log_file = log_dir / f"{today}.jsonl"
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")

    output_json({
        "status": "ok",
        "changes": result.get("changes", {}),
        "graphStats": graph_stats,
        "playbookLines": len(new_playbook.splitlines()),
    })
516
+
517
+
518
# Script entry point.
if __name__ == "__main__":
    main()
@@ -12,7 +12,9 @@
12
12
  "module_manager": { "model": "fast", "maxTokens": 2000 },
13
13
  "tick_evaluator": { "model": "smart", "maxTokens": 200, "timeout": 30 },
14
14
  "eval_reporter": { "model": "smart", "maxTokens": 1000 },
15
- "triple_extractor": { "model": "fast", "maxTokens": 1500, "timeout": 30 }
15
+ "triple_extractor": { "model": "fast", "maxTokens": 1500, "timeout": 30 },
16
+ "session_distiller": { "model": "smart", "maxTokens": 1500, "timeout": 30 },
17
+ "knowledge_integrator": { "model": "smart", "maxTokens": 3000, "timeout": 60 }
16
18
  },
17
19
  "defaults": { "model": "fast", "maxTokens": 1500 },
18
20
  "triplestore": {
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env python3
2
+ """Session Distiller — condense session transcript into a SessionDigest.
3
+
4
+ Takes feed items + agent digests from sinain-core and produces a structured
5
+ digest of what happened, what patterns emerged, and what was learned.
6
+
7
+ Single LLM call, ~10s. Replaces: signal_analyzer + insight_synthesizer +
8
+ memory_miner for the purpose of knowledge extraction.
9
+
10
+ Usage:
11
+ python3 session_distiller.py --memory-dir memory/ \
12
+ --transcript '[ ... feed items ... ]' \
13
+ --session-meta '{"sessionKey":"...","durationMs":...}'
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ from common import (
22
+ LLMError,
23
+ call_llm_with_fallback,
24
+ extract_json,
25
+ output_json,
26
+ read_effective_playbook,
27
+ )
28
+
29
+ SYSTEM_PROMPT = """\
30
+ You are a session distiller for a personal AI overlay system (sinain).
31
+ Your job: analyze a session transcript and extract structured knowledge.
32
+
33
+ The transcript contains feed items from sinain-core:
34
+ - audio: transcribed speech from the user's environment
35
+ - agent: sinain's analysis digests and HUD messages
36
+ - openclaw: responses from the AI escalation system
37
+ - system: system events and status messages
38
+
39
+ Extract:
40
+ 1. whatHappened: 2-3 sentences summarizing what was accomplished in this session
41
+ 2. patterns: up to 5 reusable patterns discovered (things that worked, techniques used)
42
+ 3. antiPatterns: up to 3 things that failed and why
43
+ 4. preferences: up to 3 user preferences or workflow habits observed
44
+ 5. entities: key domains, tools, technologies, or topics worked with (for graph linking)
45
+ 6. toolInsights: tool usage insights (e.g., "grep before read reduces misses")
46
+
47
+ Focus on ACTIONABLE knowledge that would help a future agent in similar contexts.
48
+ Skip trivial observations. If the session was idle or empty, say so briefly.
49
+
50
+ Respond with ONLY a JSON object:
51
+ {
52
+ "whatHappened": "string",
53
+ "patterns": ["string", ...],
54
+ "antiPatterns": ["string", ...],
55
+ "preferences": ["string", ...],
56
+ "entities": ["string", ...],
57
+ "toolInsights": ["string", ...],
58
+ "isEmpty": false
59
+ }"""
60
+
61
+
62
+ def _truncate_transcript(items: list[dict], max_chars: int = 100_000) -> str:
63
+ """Format and truncate feed items to fit context window."""
64
+ lines: list[str] = []
65
+ total = 0
66
+ for item in items:
67
+ source = item.get("source", "?")
68
+ text = item.get("text", "")
69
+ ts = item.get("ts", "")
70
+
71
+ # Strip [PERIODIC] items — they're overlay refresh noise
72
+ if text.startswith("[PERIODIC]"):
73
+ continue
74
+
75
+ # Format timestamp as HH:MM:SS if numeric
76
+ ts_str = ""
77
+ if isinstance(ts, (int, float)) and ts > 0:
78
+ from datetime import datetime, timezone
79
+ ts_str = datetime.fromtimestamp(ts / 1000, tz=timezone.utc).strftime("%H:%M:%S")
80
+ elif isinstance(ts, str):
81
+ ts_str = ts[-8:] if len(ts) > 8 else ts
82
+
83
+ line = f"[{ts_str}] ({source}) {text}"
84
+ if total + len(line) > max_chars:
85
+ lines.append(f"... truncated ({len(items) - len(lines)} more items)")
86
+ break
87
+ lines.append(line)
88
+ total += len(line)
89
+
90
+ return "\n".join(lines)
91
+
92
+
93
def main() -> None:
    """CLI entry point: distill a session transcript into a SessionDigest.

    Parses the transcript JSON, short-circuits trivially empty sessions,
    then makes a single LLM call and emits the resulting digest (plus
    session metadata) via output_json. Errors are reported as JSON with
    isEmpty=true so downstream integration can skip cleanly.
    """
    parser = argparse.ArgumentParser(description="Session Distiller")
    parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
    parser.add_argument("--transcript", required=True, help="JSON array of feed items")
    parser.add_argument("--session-meta", default="{}", help="JSON session metadata")
    args = parser.parse_args()

    # Parse inputs
    try:
        items = json.loads(args.transcript)
    except json.JSONDecodeError as e:
        print(f"Invalid transcript JSON: {e}", file=sys.stderr)
        output_json({"error": f"Invalid transcript JSON: {e}", "isEmpty": True})
        return

    # NOTE(review): malformed --session-meta raises here uncaught, unlike
    # --transcript above — confirm whether callers guarantee valid JSON.
    meta = json.loads(args.session_meta) if args.session_meta else {}

    # Skip if transcript is trivially empty
    if not items or len(items) < 2:
        output_json({
            "whatHappened": "Empty or trivial session",
            "patterns": [],
            "antiPatterns": [],
            "preferences": [],
            "entities": [],
            "toolInsights": [],
            "isEmpty": True,
        })
        return

    # Format transcript
    transcript_text = _truncate_transcript(items)

    # Include current playbook for context (helps avoid re-discovering known patterns)
    playbook = read_effective_playbook(args.memory_dir)
    playbook_summary = ""
    if playbook:
        # Drop blank lines and HTML comments; cap at 30 lines to bound prompt size.
        lines = [l for l in playbook.splitlines() if l.strip() and not l.startswith("<!--")]
        playbook_summary = f"\n\n## Current Playbook (for reference — don't repeat known patterns)\n{chr(10).join(lines[:30])}"

    user_prompt = f"""## Session Transcript ({len(items)} items)
{transcript_text}

## Session Metadata
{json.dumps(meta, indent=2)}{playbook_summary}"""

    try:
        raw = call_llm_with_fallback(
            SYSTEM_PROMPT,
            user_prompt,
            script="session_distiller",
            json_mode=True,
        )
        result = extract_json(raw)
    except (ValueError, LLMError) as e:
        print(f"LLM distillation failed: {e}", file=sys.stderr)
        output_json({"error": str(e), "isEmpty": True})
        return

    # Add metadata so the integrator can stamp facts and logs.
    result["ts"] = meta.get("ts", "")
    result["sessionKey"] = meta.get("sessionKey", "")
    result["durationMs"] = meta.get("durationMs", 0)
    result["feedItemCount"] = len(items)

    output_json(result)
159
+
160
+
161
# Script entry point.
if __name__ == "__main__":
    main()