@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,738 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ KG Extraction V2 — 2-Pass Concurrent Hybrid via Ollama API
4
+ 8 batches x 2 passes = 16 concurrent Ollama calls per wave.
5
+
6
+ Pass A: Structured (all 14 types in one prompt)
7
+ Pass B: Native graph discovery (nodes/edges)
8
+ Both run concurrently per batch, 8 batches per wave.
9
+
10
+ Usage:
11
+ python3 kg-preflexor-v2.py # Full extraction
12
+ python3 kg-preflexor-v2.py --source telegram # Only telegram
13
+ python3 kg-preflexor-v2.py --stats # Graph stats
14
+ python3 kg-preflexor-v2.py --dry-run # No Neo4j writes
15
+ python3 kg-preflexor-v2.py --reset # Clear state
16
+ python3 kg-preflexor-v2.py --test-batch # Run 1 batch, show output
17
+ python3 kg-preflexor-v2.py --concurrency 16 # Custom concurrency
18
+
19
+ Environment variables:
20
+ PME_WORKSPACE — workspace root (default: $HOME/pentatonic)
21
+ PME_OLLAMA_URL — Ollama base URL (default: http://localhost:11434)
22
+ PME_OLLAMA_KG_MODEL — model for extraction (default: qwen3:8b)
23
+ PME_NEO4J_URI — Neo4j bolt URI (default: bolt://localhost:7687)
24
+ PME_NEO4J_PASSWORD — Neo4j password (overrides .secrets.json)
25
+ """
26
+
27
+ import argparse
28
+ import logging
29
+ import json
30
+ import os
31
+ import re
32
+ import time
33
+ import traceback
34
+ import urllib.request
35
+ from concurrent.futures import ThreadPoolExecutor, as_completed
36
+ from datetime import datetime, timezone
37
+ from pathlib import Path
38
+ from threading import Lock
39
+ from typing import Any, Optional
40
+
41
+ # -- Config --
42
# Root directory for every path below; overridable via PME_WORKSPACE.
WORKSPACE = Path(os.environ.get("PME_WORKSPACE", str(Path.home() / "pentatonic")))
# Optional JSON file consulted for Neo4j credentials when PME_NEO4J_PASSWORD is unset.
SECRETS_FILE = WORKSPACE / ".secrets.json"
# Persistent run state (per-source entries, totals, novel types, last run time).
STATE_FILE = WORKSPACE / "data" / "kg-preflexor-v2-state.json"
# Persistent queue file; by default it holds {"batches": []}.
REFINEMENT_FILE = WORKSPACE / "data" / "kg-refinement-queue.json"
# NOTE(review): presumably where log files are written -- not used in the code visible here.
LOG_DIR = WORKSPACE / "logs"

# One sub-directory of CHAT_ROOT per chat source.
CHAT_ROOT = WORKSPACE / "chats"
TG_DIR = CHAT_ROOT / "telegram"
WA_DIR = CHAT_ROOT / "whatsapp"
EMAIL_DIR = CHAT_ROOT / "email"
SLACK_DIR = CHAT_ROOT / "slack"
IMESSAGE_DIR = CHAT_ROOT / "imessage"

# Ollama endpoint and model; these are the defaults of OllamaClient.__init__.
OLLAMA_URL = os.environ.get("PME_OLLAMA_URL", "http://localhost:11434")
MODEL = os.environ.get("PME_OLLAMA_KG_MODEL", "qwen3:8b")
DEFAULT_BATCH_SIZE = 15
DEFAULT_CONCURRENCY = 8 # batches at once (x2 passes = 16 Ollama calls)

# Substrings (matched case-insensitively) counted by is_decision_dense();
# three or more distinct hits mark a text as decision-dense.
DECISION_KEYWORDS = [
    "decided", "decision", "let's go with", "switching to", "approved",
    "rejected", "committed", "promise", "deadline", "budget", "investment",
    "contract", "agreement", "strategy", "pivot", "cancelled", "postponed"
]
65
+
66
+ # -- Pass Definitions --
67
# System prompt for pass A: fixed-schema extraction into 14 named arrays.
# process_one_batch() sends it with a 768-token generation limit, and
# GraphWriter.ingest_structured() consumes the resulting JSON object.
PASS_A_SYSTEM = """Extract structured knowledge from chat messages. Output JSON with these arrays (empty array if nothing found):

- persons: [{"name": "str", "role": "str or null"}]
- projects: [{"name": "str", "status": "active|completed|paused|planned|abandoned or null"}]
- systems: [{"name": "str", "type": "service|cron|container|script|api|database or null"}]
- entities: [{"name": "str", "type": "company|tool|place|service|product|platform"}]
- decisions: [{"what": "str", "who": "str", "date": "YYYY-MM-DD or null", "reasoning": "str or null"}]
- commitments: [{"what": "str", "who": "str", "deadline": "YYYY-MM-DD or null", "status": "open|fulfilled|broken|cancelled"}]
- events: [{"name": "str", "date": "YYYY-MM-DD or null", "type": "meeting|deadline|incident|social|travel|appointment or null"}]
- transactions: [{"description": "str", "amount": "str or null", "date": "YYYY-MM-DD or null"}]
- incidents: [{"what_broke": "str", "date": "YYYY-MM-DD or null", "severity": "critical|high|medium|low"}]
- deadlines: [{"description": "str", "date": "YYYY-MM-DD or null", "status": "upcoming|met|missed|cancelled"}]
- topics: [{"name": "str", "category": "technical|personal|business|health|finance|social or null"}]
- lessons: [{"insight": "str", "source": "str or null", "date": "YYYY-MM-DD or null"}]
- preferences: [{"category": "food|tool|workflow|communication|schedule|other", "value": "str", "who": "str"}]
- routines: [{"name": "str", "frequency": "daily|weekly|monthly or null", "description": "str or null"}]

Rules:
- ONLY extract what is explicitly stated in the messages
- Do NOT invent or infer content not shown
- If nothing found for a category, use empty array"""

# System prompt for pass B: free-form graph discovery ("nodes" / "edges").
# process_one_batch() sends it with a 1024-token generation limit, and
# map_native() translates the resulting JSON object into Neo4j operations.
PASS_B_SYSTEM = """Analyse these chat messages and extract a knowledge graph. Return JSON with "nodes" and "edges" arrays.
Each node: {"id": "string", "type": "string"}
Each edge: {"source": "string", "relation": "string", "target": "string"}

Find ALL meaningful relationships -- especially:
- Implicit connections between people and projects
- Temporal sequences and causation
- Sentiment and attitude signals
- Technical dependencies
- Any patterns a rigid schema might miss

Rules:
- ONLY extract from the messages shown
- Do NOT invent content not present"""
103
+
104
+
105
+ # -- Ollama Client --
106
class OllamaClient:
    """Small client for the Ollama ``/api/chat`` endpoint.

    Every request is non-streaming and asks for ``format: "json"``. The usage
    counters (``total_tokens``, ``total_time``, ``total_calls``) are updated
    under ``self.lock`` so a single client can be shared between worker
    threads.
    """

    # Fenced-code patterns tried, in order, when the reply is not bare JSON.
    _FENCE_PATTERNS = (r'```json\s*\n?(.*?)\n?```', r'```\s*\n?(.*?)\n?```')

    def __init__(self, base_url=OLLAMA_URL, model=MODEL):
        self.base_url = base_url
        self.model = model
        self.lock = Lock()
        self.total_tokens = 0
        self.total_time = 0.0
        self.total_calls = 0

    def warmup(self) -> None:
        """Send one tiny request so the model is loaded, and report the outcome."""
        print(f" Warming up {self.model}...", end=" ", flush=True)
        t0 = time.time()
        ok = self._call("system", "Say OK", 8) is not None
        # Report a failed warm-up instead of unconditionally printing "done".
        status = "done" if ok else "FAILED"
        print(f"{status} ({time.time() - t0:.1f}s)", flush=True)

    def extract(self, system_prompt, user_prompt, max_tokens=768) -> Optional[dict]:
        """Run one extraction and return the parsed JSON object.

        Makes up to two attempts. A failed request, an unparseable reply, or a
        reply that is valid JSON but not an object all count as a failed
        attempt; callers use ``.get()`` / ``.items()`` on the result, so only
        dicts are returned.

        Returns:
            The parsed dict, or ``None`` if both attempts failed.
        """
        for _attempt in range(2):
            result = self._call(system_prompt, user_prompt, max_tokens)
            if result is None:
                continue
            with self.lock:
                self.total_tokens += result.get("tokens", 0)
                self.total_time += result.get("duration", 0)
                self.total_calls += 1
            data = self._parse_json(result["text"])
            if isinstance(data, dict):
                return data
        return None

    def _call(self, system_prompt, user_prompt, max_tokens):
        """POST one chat request.

        Returns:
            ``{"text", "duration", "tokens"}`` on success (duration in
            seconds), or ``None`` on any transport or decoding failure.
        """
        payload = json.dumps({
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "format": "json",
            "stream": False,
            "options": {"num_predict": max_tokens}
        }).encode()
        req = urllib.request.Request(
            f"{self.base_url}/api/chat", data=payload,
            headers={"Content-Type": "application/json"}, method="POST"
        )
        try:
            with urllib.request.urlopen(req, timeout=180) as resp:
                d = json.loads(resp.read())
            return {
                "text": (d.get("message") or {}).get("content") or "",
                # total_duration is divided by 1e9 to express it in seconds.
                "duration": (d.get("total_duration") or 0) / 1e9,
                "tokens": d.get("eval_count") or 0,
            }
        except Exception as exc:
            # Best-effort boundary: callers treat None as "retry or skip",
            # but the reason is logged rather than lost.
            logging.warning("Ollama call to %s failed: %s", self.base_url, exc)
            return None

    def _parse_json(self, text):
        """Parse JSON from a model reply.

        Tries, in order: the whole text, the contents of a fenced code block,
        and the span from the first ``{`` to the last ``}``.

        Returns:
            The decoded value, or ``None`` if nothing could be parsed.
        """
        if not isinstance(text, str):
            return None
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
        for pat in self._FENCE_PATTERNS:
            m = re.search(pat, text, re.DOTALL)
            if m:
                try:
                    return json.loads(m.group(1).strip())
                except json.JSONDecodeError:
                    pass
        s, e = text.find("{"), text.rfind("}")
        if s != -1 and e > s:
            try:
                return json.loads(text[s:e + 1])
            except json.JSONDecodeError:
                pass
        return None
180
+
181
+
182
+ # -- Schema Mapper --
183
# Relation name (lower-cased, spaces replaced by underscores) -> Neo4j
# relationship type. map_native() looks edges up here; relations that are
# missing are sanitised to upper-case and reported as novel edge types.
PRED_MAP = {
    "decides": "DECIDED", "decided": "DECIDED", "chose": "DECIDED",
    "builds": "WORKS_ON", "built": "WORKS_ON", "develops": "WORKS_ON",
    "uses": "USES", "used": "USES", "runs": "USES",
    "manages": "MANAGES", "owns": "OWNS",
    "creates": "CREATED", "created": "CREATED",
    "mentions": "DISCUSSED", "discusses": "DISCUSSED",
    "commits": "COMMITTED_TO", "committed": "COMMITTED_TO",
    "breaks": "BROKE", "broke": "BROKE", "crashed": "BROKE",
    "fixes": "FIXED", "fixed": "FIXED", "resolved": "FIXED",
    "causes": "CAUSED", "caused": "CAUSED", "causes_problem": "CAUSED",
    "depends_on": "DEPENDS_ON", "requires": "DEPENDS_ON",
    "replaces": "LED_TO", "leads_to": "LED_TO",
    "rejects": "DECIDED", "prefers": "PREFERS", "likes": "PREFERS",
    "switches_to": "LED_TO", "connects": "CONNECTS_TO",
    "avoids": "REJECTED", "cancels": "CANCELLED",
}

# Node type (lower-cased) -> Neo4j node label. map_native() falls back to the
# "Entity" label for types missing here and reports them as novel node types.
TYPE_MAP = {
    "person": "Person", "human": "Person", "user": "Person", "agent": "Person",
    "project": "Project", "feature": "Project", "task": "Project",
    "system": "System", "service": "System", "tool": "System", "script": "System",
    "database": "System", "cron": "System", "container": "System", "api": "System",
    "company": "Entity", "organisation": "Entity", "organization": "Entity",
    "place": "Entity", "platform": "Entity", "product": "Entity",
    "topic": "Topic", "subject": "Topic", "event": "Event",
    "meeting": "Event", "routine": "Routine", "decision": "Decision",
    "lesson": "Lesson", "preference": "Preference", "deadline": "Deadline",
    "commitment": "Commitment", "incident": "Incident",
    "transaction": "Transaction", "subscription": "Transaction",
    "version": "System", "schedule": "Routine", "date": "Event",
    "data": "System", "briefing": "Event",
}
216
+
217
+
218
def map_native(data) -> tuple:
    """Map native nodes/edges to Neo4j ops. Returns (ops, novel_types).

    ``data`` is the model's graph-discovery output and is treated as
    untrusted: null or non-list ``nodes``/``edges``, non-dict elements and
    null or non-string field values are tolerated instead of raising.

    Returns:
        ``ops`` is a list of ``("node", label, name)`` and
        ``("edge", rel_type, source, target)`` tuples. ``novel_types`` is a
        list of ``("node_type" | "edge_type", raw_name, context)`` tuples for
        names missing from ``TYPE_MAP`` / ``PRED_MAP``.
    """
    def text_of(value, default=""):
        # None (JSON null) counts as "missing"; anything else is stringified.
        return default if value is None else str(value).strip()

    def items_of(key):
        seq = data.get(key)
        return [item for item in seq if isinstance(item, dict)] if isinstance(seq, list) else []

    ops, novel = [], []
    if not isinstance(data, dict):
        return ops, novel

    for node in items_of("nodes"):
        nid = text_of(node.get("id"))
        if not nid:
            continue
        ntype = text_of(node.get("type", "entity"), "entity").lower()
        label = TYPE_MAP.get(ntype, "Entity")
        if ntype and ntype not in TYPE_MAP:
            novel.append(("node_type", ntype, nid))
        ops.append(("node", label, nid))

    for edge in items_of("edges"):
        src, tgt = text_of(edge.get("source")), text_of(edge.get("target"))
        if not src or not tgt:
            continue
        rel = text_of(edge.get("relation", "RELATES_TO"), "RELATES_TO").lower().replace(" ", "_")
        neo_rel = PRED_MAP.get(rel)
        if neo_rel is None:
            # The type is interpolated into Cypher unquoted, so restrict it to
            # [A-Z0-9_] and make sure it does not start with a digit.
            neo_rel = re.sub(r"[^A-Z0-9_]", "_", rel.upper()) or "RELATES_TO"
            if neo_rel[0].isdigit():
                neo_rel = "REL_" + neo_rel
        if rel and rel not in PRED_MAP:
            novel.append(("edge_type", rel, f"{src} -> {tgt}"))
        ops.append(("edge", neo_rel, src, tgt))
    return ops, novel
240
+
241
+
242
+ # -- Neo4j Writer --
243
class GraphWriter:
    """Writes extraction results to Neo4j and keeps running counters.

    With ``dry_run=True`` no driver is created and every query is skipped;
    the counters are still updated as if the writes had succeeded. Counter
    updates happen under ``self.lock`` so one writer can be shared between
    worker threads.
    """

    # Item field that identifies a node of each structured category. It is the
    # property used in that category's MERGE, so an item without it is skipped.
    _PRIMARY_FIELD = {
        "persons": "name",
        "projects": "name",
        "systems": "name",
        "entities": "name",
        "decisions": "what",
        "commitments": "what",
        "events": "name",
        "transactions": "description",
        "incidents": "what_broke",
        "deadlines": "description",
        "topics": "name",
        "lessons": "insight",
        "preferences": "value",
        "routines": "name",
    }

    # category -> (item field naming a person or None, relationship type, node label)
    _LINK_MAP = {
        "decisions": ("who", "MADE_DECISION", "Decision"),
        "commitments": ("who", "HAS_COMMITMENT", "Commitment"),
        "events": (None, "PARTICIPATED_IN", "Event"),
        "transactions": (None, "MADE_TRANSACTION", "Transaction"),
        "incidents": (None, "EXPERIENCED", "Incident"),
        "deadlines": (None, "HAS_DEADLINE", "Deadline"),
        "lessons": (None, "LEARNED", "Lesson"),
        "preferences": ("who", "HAS_PREFERENCE", "Preference"),
        "routines": (None, "FOLLOWS_ROUTINE", "Routine"),
    }

    # category -> (MERGE query, builder(item, primary_value, source_chat) -> params)
    _NODE_QUERIES = {
        "persons": (
            "MERGE (n:Person {name: $name}) SET n.role = $role",
            lambda p, v, src: {"name": v, "role": p.get("role", "")},
        ),
        "projects": (
            "MERGE (n:Project {name: $name}) SET n.status = $s, n.updated_at = datetime()",
            lambda p, v, src: {"name": v, "s": p.get("status", "active")},
        ),
        "systems": (
            "MERGE (n:System {name: $name}) SET n.type = $t",
            lambda p, v, src: {"name": v, "t": p.get("type", "")},
        ),
        "entities": (
            "MERGE (n:Entity {name: $name}) SET n.type = $t",
            lambda p, v, src: {"name": v, "t": p.get("type", "")},
        ),
        "decisions": (
            "MERGE (n:Decision {what: $w}) SET n.who=$who, n.date=$d, n.reasoning=$r, n.source_chat=$src",
            lambda p, v, src: {"w": v, "who": p.get("who", ""), "d": p.get("date", ""),
                               "r": p.get("reasoning", ""), "src": src},
        ),
        "commitments": (
            "MERGE (n:Commitment {what: $w}) SET n.who=$who, n.deadline=$d, n.status=$s, n.source_chat=$src",
            lambda p, v, src: {"w": v, "who": p.get("who", ""), "d": p.get("deadline", ""),
                               "s": p.get("status", "open"), "src": src},
        ),
        "events": (
            "MERGE (n:Event {name: $name}) SET n.date=$d, n.type=$t, n.source_chat=$src",
            lambda p, v, src: {"name": v, "d": p.get("date", ""), "t": p.get("type", ""), "src": src},
        ),
        "transactions": (
            "MERGE (n:Transaction {description: $d}) SET n.amount=$a, n.date=$dt, n.source_chat=$src",
            lambda p, v, src: {"d": v, "a": p.get("amount", ""), "dt": p.get("date", ""), "src": src},
        ),
        "incidents": (
            "MERGE (n:Incident {what_broke: $w}) SET n.date=$d, n.severity=$s, n.source_chat=$src",
            lambda p, v, src: {"w": v, "d": p.get("date", ""), "s": p.get("severity", "medium"), "src": src},
        ),
        "deadlines": (
            "MERGE (n:Deadline {description: $d}) SET n.date=$dt, n.status=$s, n.source_chat=$src",
            lambda p, v, src: {"d": v, "dt": p.get("date", ""), "s": p.get("status", "upcoming"), "src": src},
        ),
        "topics": (
            "MERGE (n:Topic {name: $name}) SET n.category=$c, n.source_chat=$src",
            lambda p, v, src: {"name": v, "c": p.get("category", ""), "src": src},
        ),
        "lessons": (
            "MERGE (n:Lesson {insight: $i}) SET n.source=$s, n.date=$d, n.source_chat=$src",
            lambda p, v, src: {"i": v, "s": p.get("source", ""), "d": p.get("date", ""), "src": src},
        ),
        "preferences": (
            "MERGE (n:Preference {category: $c, value: $v}) SET n.who=$w, n.source_chat=$src",
            lambda p, v, src: {"c": p.get("category", "other"), "v": v, "w": p.get("who", ""), "src": src},
        ),
        "routines": (
            "MERGE (n:Routine {name: $name}) SET n.frequency=$f, n.description=$d, n.source_chat=$src",
            lambda p, v, src: {"name": v, "f": p.get("frequency", ""), "d": p.get("description", ""), "src": src},
        ),
    }

    def __init__(self, uri, user, password, dry_run=False):
        self.dry_run = dry_run
        self.driver = None
        self.lock = Lock()
        self.nodes_written = 0
        self.edges_written = 0
        self.novel_types = []
        if not dry_run:
            # Imported lazily so dry runs work without the neo4j package.
            from neo4j import GraphDatabase
            self.driver = GraphDatabase.driver(uri, auth=(user, password))
            self._indexes()

    def close(self) -> None:
        """Close the Neo4j driver, if one was opened."""
        if self.driver:
            self.driver.close()

    def _indexes(self):
        """Create the lookup indexes; failures are logged and ignored."""
        idxs = [
            "CREATE INDEX IF NOT EXISTS FOR (p:Person) ON (p.name)",
            "CREATE INDEX IF NOT EXISTS FOR (p:Project) ON (p.name)",
            "CREATE INDEX IF NOT EXISTS FOR (e:Entity) ON (e.name)",
            "CREATE INDEX IF NOT EXISTS FOR (s:System) ON (s.name)",
            "CREATE INDEX IF NOT EXISTS FOR (t:Topic) ON (t.name)",
            "CREATE INDEX IF NOT EXISTS FOR (d:Decision) ON (d.what)",
            "CREATE INDEX IF NOT EXISTS FOR (i:Incident) ON (i.what_broke)",
            "CREATE INDEX IF NOT EXISTS FOR (l:Lesson) ON (l.insight)",
            "CREATE INDEX IF NOT EXISTS FOR (c:Commitment) ON (c.what)",
            "CREATE INDEX IF NOT EXISTS FOR (e:Event) ON (e.name)",
            "CREATE INDEX IF NOT EXISTS FOR (r:Routine) ON (r.name)",
            "CREATE INDEX IF NOT EXISTS FOR (d:Deadline) ON (d.description)",
        ]
        with self.driver.session() as s:
            for idx in idxs:
                try:
                    s.run(idx)
                except Exception as e:
                    logging.debug("Suppressed: %s", e)

    def _run(self, query, **params):
        """Run one query in its own session (no-op in dry-run mode).

        ``None`` parameter values are replaced by ``""`` before sending.
        """
        if self.dry_run:
            return
        clean = {k: (v if v is not None else "") for k, v in params.items()}
        with self.driver.session() as s:
            s.run(query, **clean)

    def _link(self, key, item, primary, source_chat):
        """Attach a freshly merged node to its person, else to its chat source."""
        link = self._LINK_MAP.get(key)
        if link is None:
            return
        who_field, rel_type, label = link
        field = self._PRIMARY_FIELD[key]
        who = item.get(who_field, "") if who_field else None
        if who and str(who).strip():
            # MATCH, not MERGE: the link is only made to an existing Person.
            self._run(f"""
                MATCH (p:Person {{name: $who}})
                MATCH (n:{label} {{{field}: $pval}})
                MERGE (p)-[r:{rel_type}]->(n)
                SET r.updated_at = datetime()
                """, who=str(who).strip(), pval=primary)
        elif source_chat:
            self._run(f"""
                MERGE (src:Entity {{name: $src, type: 'chat_source'}})
                WITH src
                MATCH (n:{label} {{{field}: $pval}})
                MERGE (src)-[r:EXTRACTED_FROM]->(n)
                SET r.updated_at = datetime()
                """, src=source_chat, pval=primary)

    def ingest_structured(self, data, source_chat=None) -> int:
        """Merge the structured (pass A) extraction into the graph.

        Each category is keyed by its own primary field (see
        ``_PRIMARY_FIELD``); items whose primary field is missing or blank
        are skipped, as are null categories and non-dict items. Writing is
        best-effort: a failing item is logged and the rest continue.

        Args:
            data: dict of category name -> list of item dicts.
            source_chat: identifier stored on nodes as ``source_chat`` and
                used for ``EXTRACTED_FROM`` links.

        Returns:
            Number of items whose node MERGE succeeded (also added to
            ``nodes_written``).
        """
        if not data or not isinstance(data, dict):
            return 0
        src = source_chat or ""
        count = 0
        for key, (query, build_params) in self._NODE_QUERIES.items():
            items = data.get(key)
            if not isinstance(items, list):
                continue
            field = self._PRIMARY_FIELD[key]
            for item in items:
                if not isinstance(item, dict):
                    continue
                raw = item.get(field)
                primary = "" if raw is None else str(raw).strip()
                if not primary:
                    continue
                try:
                    self._run(query, **build_params(item, primary, src))
                    count += 1
                    self._link(key, item, primary, source_chat)
                except Exception as e:
                    logging.warning("Failed to write %s item %r: %s", key, primary, e)
        with self.lock:
            self.nodes_written += count
        return count

    def ingest_native(self, data) -> int:
        """Merge the native (pass B) nodes and edges into the graph.

        Best-effort: a failing operation is logged and skipped, and only
        operations that succeeded are added to the counters.

        Returns:
            Number of node and edge operations that succeeded.
        """
        if not data:
            return 0
        ops, novel = map_native(data)
        nodes = edges = 0
        for op in ops:
            try:
                if op[0] == "node":
                    self._run(f"MERGE (n:{op[1]} {{name: $name}})", name=op[2])
                    nodes += 1
                elif op[0] == "edge":
                    self._run(f"""
                        MATCH (a {{name: $src}}) MATCH (b {{name: $tgt}})
                        MERGE (a)-[r:{op[1]}]->(b) SET r.updated_at = datetime()
                        """, src=op[2], tgt=op[3])
                    edges += 1
            except Exception as e:
                logging.warning("Failed to write native op %r: %s", op, e)
        with self.lock:
            self.nodes_written += nodes
            self.edges_written += edges
            self.novel_types.extend(novel)
        return nodes + edges
391
+
392
+
393
+ # -- Message Loading --
394
def load_messages(chat_dir, offset=0) -> list:
    """Load every message from the ``*.jsonl`` files in ``chat_dir``.

    Files are read in sorted name order, one JSON value per line. Blank lines
    and lines that are not valid JSON are skipped. Files are decoded as UTF-8
    regardless of the platform default; undecodable bytes are replaced rather
    than aborting the load.

    Args:
        chat_dir: ``Path`` of the directory to scan (not recursive).
        offset: number of leading messages to drop from the combined list.

    Returns:
        The decoded messages from position ``offset`` onwards.
    """
    msgs = []
    for path in sorted(chat_dir.glob("*.jsonl")):
        with open(path, encoding="utf-8", errors="replace") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    msgs.append(json.loads(line))
                except json.JSONDecodeError:
                    continue
    return msgs[offset:]
407
+
408
+
409
def format_batch(messages) -> str:
    """Render messages as ``[timestamp] sender: body`` lines for a prompt.

    Field lookup falls back across alternative key names: ``timestamp`` /
    ``date`` / ``t``, ``sender`` / ``from`` / ``author`` (default
    ``"Unknown"``) and ``body`` / ``text`` / ``message``. Messages with an
    empty or whitespace-only body are skipped, and bodies longer than 800
    characters are truncated. Records that are not dicts are skipped and
    non-string bodies are converted with ``str()`` instead of raising.

    Returns:
        The rendered lines joined by newlines (``""`` if nothing remains).
    """
    lines = []
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        ts = msg.get("timestamp", msg.get("date", msg.get("t", "")))
        sender = msg.get("sender", msg.get("from", msg.get("author", "Unknown")))
        body = msg.get("body", msg.get("text", msg.get("message", "")))
        if not body:
            continue
        if not isinstance(body, str):
            body = str(body)
        if not body.strip():
            continue
        if len(body) > 800:
            body = body[:800] + "...[truncated]"
        lines.append(f"[{ts}] {sender}: {body}")
    return "\n".join(lines)
421
+
422
+
423
def is_decision_dense(text) -> bool:
    """Return True if ``text`` contains at least three distinct decision keywords.

    Matching is a case-insensitive substring test against
    ``DECISION_KEYWORDS``; each keyword counts at most once.
    """
    lowered = text.lower()  # lower-case once, not once per keyword
    return sum(1 for kw in DECISION_KEYWORDS if kw in lowered) >= 3
425
+
426
+
427
+ # -- State --
428
def load_state() -> dict:
    """Load the persisted run state from ``STATE_FILE``.

    A missing, unreadable or malformed file yields a fresh default state
    (with a warning, since that restarts processing from scratch). Keys
    missing from a loaded state are filled in from the defaults.

    Returns:
        A dict with at least the keys ``sources``, ``last_run``,
        ``total_batches``, ``total_items`` and ``novel_types``.
    """
    defaults = {"sources": {}, "last_run": None, "total_batches": 0,
                "total_items": 0, "novel_types": []}
    if not STATE_FILE.exists():
        return defaults
    try:
        with open(STATE_FILE, encoding="utf-8") as fh:
            loaded = json.load(fh)
    except (OSError, ValueError) as e:
        logging.warning("Could not read state file %s; starting fresh: %s", STATE_FILE, e)
        return defaults
    if not isinstance(loaded, dict):
        logging.warning("State file %s does not hold a JSON object; starting fresh", STATE_FILE)
        return defaults
    return {**defaults, **loaded}
437
+
438
def save_state(state) -> None:
    """Stamp ``state["last_run"]`` with the current UTC time and persist it.

    The JSON is written to a temporary sibling file and then moved over
    ``STATE_FILE``, so an interrupted run cannot leave a half-written state
    file behind. Values that are not JSON-serialisable are stringified.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    tmp = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    with open(tmp, "w", encoding="utf-8") as fh:
        json.dump(state, fh, indent=2, default=str)
    tmp.replace(STATE_FILE)
442
+
443
def load_refinement_queue() -> dict:
    """Load the refinement queue from ``REFINEMENT_FILE``.

    A missing, unreadable or malformed file yields an empty queue (with a
    warning when the file exists but cannot be used).

    Returns:
        A dict that always contains a ``batches`` key.
    """
    if not REFINEMENT_FILE.exists():
        return {"batches": []}
    try:
        with open(REFINEMENT_FILE, encoding="utf-8") as fh:
            loaded = json.load(fh)
    except (OSError, ValueError) as e:
        logging.warning("Could not read refinement queue %s; starting empty: %s", REFINEMENT_FILE, e)
        return {"batches": []}
    if not isinstance(loaded, dict):
        logging.warning("Refinement queue %s does not hold a JSON object; starting empty", REFINEMENT_FILE)
        return {"batches": []}
    loaded.setdefault("batches", [])
    return loaded
451
+
452
def save_refinement_queue(q) -> None:
    """Persist the refinement queue ``q`` to ``REFINEMENT_FILE``.

    The JSON is written to a temporary sibling file and then moved into
    place, so an interrupted write cannot leave a truncated queue file.
    """
    REFINEMENT_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp = REFINEMENT_FILE.with_name(REFINEMENT_FILE.name + ".tmp")
    with open(tmp, "w", encoding="utf-8") as fh:
        json.dump(q, fh, indent=2)
    tmp.replace(REFINEMENT_FILE)
455
+
456
+
457
+ # -- Secrets --
458
def get_neo4j_config() -> dict:
    """Resolve the Neo4j connection settings.

    Lookup order for the password:
      1. ``PME_NEO4J_PASSWORD`` environment variable;
      2. ``neo4j.password`` in ``SECRETS_FILE`` (that object's ``uri`` and
         ``user`` are honoured too);
      3. top-level ``neo4j_password`` in ``SECRETS_FILE``;
      4. the literal default ``"password"`` (a warning is logged).

    The URI defaults to ``PME_NEO4J_URI`` or ``bolt://localhost:7687`` and
    the user to ``neo4j``.

    Returns:
        A dict with the keys ``uri``, ``user`` and ``password``.
    """
    uri = os.environ.get("PME_NEO4J_URI", "bolt://localhost:7687")
    pw = os.environ.get("PME_NEO4J_PASSWORD", "")
    if pw:
        return {"uri": uri, "user": "neo4j", "password": pw}

    if SECRETS_FILE.exists():
        cfg = None
        try:
            with open(SECRETS_FILE, encoding="utf-8") as fh:
                cfg = json.load(fh)
        except (OSError, ValueError) as e:
            # Never log the file's contents -- only that it could not be used.
            logging.warning("Could not read secrets file %s: %s", SECRETS_FILE, e)
        if isinstance(cfg, dict):
            neo4j = cfg.get("neo4j", {})
            if isinstance(neo4j, dict) and neo4j.get("password"):
                return {"uri": neo4j.get("uri", uri),
                        "user": neo4j.get("user", "neo4j"), "password": neo4j["password"]}
            pw = cfg.get("neo4j_password", "")
            if pw:
                return {"uri": uri, "user": "neo4j", "password": pw}

    logging.warning("No Neo4j password configured; falling back to the built-in default")
    return {"uri": uri, "user": "neo4j", "password": "password"}
476
+
477
+
478
+ # -- Single Batch Processing --
479
def process_one_batch(client, writer, batch_text, batch_id, verbose=False, source_chat=None) -> tuple:
    """Process a single batch with 2 concurrent passes.

    Pass A (structured) and pass B (native graph) are sent to the model at
    the same time; their results are then written through ``writer``. A
    result that is not a JSON object is treated as a failed pass.

    Args:
        client: object with ``extract(system_prompt, user_prompt, max_tokens)``.
        writer: object with ``ingest_structured`` and ``ingest_native``.
        batch_text: formatted messages sent as the user prompt of both passes.
        batch_id: accepted for the caller's bookkeeping; not used here.
        verbose: accepted for interface compatibility; not used here.
        source_chat: forwarded to ``writer.ingest_structured``.

    Returns:
        ``(structured_count, native_count, score, results)`` where ``score``
        is 0-100 (up to 50 for non-empty structured categories at 7 points
        each, 25 for any nodes, 25 for any edges) and ``results`` is
        ``{"structured": ..., "native": ...}`` with ``None`` for failed passes.
    """
    with ThreadPoolExecutor(max_workers=2) as executor:
        fa = executor.submit(client.extract, PASS_A_SYSTEM, batch_text, 768)
        fb = executor.submit(client.extract, PASS_B_SYSTEM, batch_text, 1024)
        structured = fa.result()
        native = fb.result()

    # Everything below relies on dict methods, so discard anything else.
    if not isinstance(structured, dict):
        structured = None
    if not isinstance(native, dict):
        native = None
    results = {"structured": structured, "native": native}

    s_count = writer.ingest_structured(structured, source_chat=source_chat) if structured else 0
    n_count = writer.ingest_native(native) if native else 0

    score = 0
    if structured:
        filled = sum(1 for v in structured.values() if isinstance(v, list) and v)
        score += min(filled * 7, 50)
    if native:
        # Truthiness also covers a JSON null, where len() would raise.
        if native.get("nodes"):
            score += 25
        if native.get("edges"):
            score += 25

    return s_count, n_count, score, results
505
+
506
+
507
+ # -- Main Processing --
508
def process_source(source_type, chat_dir, client, writer, state, batch_size,
                   concurrency, test_mode=False, verbose=False):
    """Extract and ingest every chat directory found under one source.

    For each sub-directory of ``chat_dir`` the messages after the saved
    offset are loaded, cut into batches of ``batch_size`` messages and
    processed in waves of ``concurrency`` batches via ``process_one_batch``.
    After every wave the per-chat offset, the global counters and the
    refinement queue are persisted.

    Args:
        source_type: Source name; used in log lines and as the state-key prefix.
        chat_dir: Path-like directory containing one sub-directory per chat.
        client: Extraction client forwarded to ``process_one_batch``.
        writer: Graph writer forwarded to ``process_one_batch``; its
            ``novel_types`` list is drained into ``state`` after each wave.
        state: Mutable pipeline state dict; must contain a ``"sources"`` dict.
        batch_size: Messages per batch (values below 1 are treated as 1).
        concurrency: Batches per wave (values below 1 are treated as 1).
        test_mode: When true, also process chats with fewer than 5 messages,
            print the raw extraction of the first batch and return after the
            first wave.
        verbose: Forwarded to ``process_one_batch``; also prints the error
            message of a failed batch.

    Returns:
        None.
    """
    if not chat_dir.exists():
        print(f" No {source_type} directory found")
        return

    # range() rejects a zero step and yields nothing for a negative one.
    batch_size = max(1, batch_size)
    concurrency = max(1, concurrency)

    chat_dirs = [d for d in chat_dir.iterdir() if d.is_dir()]
    print(f" Found {len(chat_dirs)} {source_type} chats")
    refinement_queue = load_refinement_queue()

    for cdir in sorted(chat_dirs):
        chat_id = cdir.name
        state_key = f"{source_type}:{chat_id}"
        chat_state = state["sources"].get(state_key, {"offset": 0, "processed": 0})
        offset = chat_state.get("offset", 0)
        # Running total for this chat. The saved snapshot is not refreshed
        # inside the loop, so the count must be carried in a local.
        processed = chat_state.get("processed", 0)

        messages = load_messages(cdir, offset)
        if not messages or (len(messages) < 5 and not test_mode):
            continue

        total = len(messages)
        num_batches = (total + batch_size - 1) // batch_size
        print(f"\n {state_key}: {total} msgs from offset {offset} ({num_batches} batches)")

        # Batches whose formatted text is blank are dropped up front.
        batches = []
        for i in range(0, total, batch_size):
            batch = messages[i:i + batch_size]
            text = format_batch(batch)
            if text.strip():
                batches.append((i, batch, text))

        wave_total = (len(batches) + concurrency - 1) // concurrency
        for wave_num, wave_start in enumerate(range(0, len(batches), concurrency), start=1):
            wave = batches[wave_start:wave_start + concurrency]
            print(f" Wave {wave_num}/{wave_total} ({len(wave)} batches)...", end=" ", flush=True)

            t0 = time.time()
            wave_items = 0
            wave_results = {}  # batch_offset -> (s_count, n_count, score, text)
            wave_raw = {}      # batch_offset -> raw extraction results (successes only)

            with ThreadPoolExecutor(max_workers=concurrency) as executor:
                futures = {}
                for idx, (batch_offset, batch, text) in enumerate(wave):
                    f = executor.submit(process_one_batch, client, writer, text, idx,
                                        verbose, source_chat=state_key)
                    futures[f] = (batch_offset, text)

                for f in as_completed(futures):
                    batch_offset, text = futures[f]
                    try:
                        s_count, n_count, score, results = f.result()
                    except Exception as e:
                        # Best effort: a failed batch scores 0, which routes it
                        # into the refinement queue below.
                        print(f"X", end="", flush=True)
                        if verbose:
                            print(f" [{state_key}@{offset + batch_offset}: {e}]", end="", flush=True)
                        wave_results[batch_offset] = (0, 0, 0, text)
                    else:
                        wave_items += s_count + n_count
                        wave_results[batch_offset] = (s_count, n_count, score, text)
                        wave_raw[batch_offset] = results

            elapsed = time.time() - t0

            # Walk the wave in submission order so the saved offset only moves forward.
            for batch_offset, batch, text in wave:
                s_count, n_count, score, _ = wave_results.get(batch_offset, (0, 0, 0, text))

                dense = is_decision_dense(text)
                if dense or score < 40:
                    refinement_queue["batches"].append({
                        "source": state_key, "offset": offset + batch_offset,
                        "size": len(batch), "score": score,
                        "decision_dense": dense,
                        "timestamp": datetime.now(timezone.utc).isoformat()
                    })

                processed += len(batch)
                state["sources"][state_key] = {
                    "offset": offset + batch_offset + len(batch),
                    "processed": processed
                }

            state["total_batches"] = state.get("total_batches", 0) + len(wave)
            state["total_items"] = state.get("total_items", 0) + wave_items
            save_state(state)
            save_refinement_queue(refinement_queue)

            print(f"OK {wave_items} items, {elapsed:.1f}s ({elapsed/len(wave):.1f}s/batch)", flush=True)

            if writer.novel_types:
                # Show the last 5, persist the last 20, then reset the writer's list.
                for nt in writer.novel_types[-5:]:
                    print(f" NEW {nt[0]}: {nt[1]} (from: {nt[2]})")
                state.setdefault("novel_types", []).extend([
                    {"type": t, "value": v, "example": e}
                    for t, v, e in writer.novel_types[-20:]
                ])
                writer.novel_types = []

            if test_mode:
                print(f"\n Test mode -- showing first batch detail:")
                first_offset = wave[0][0]
                s, n, score, text = wave_results[first_offset]
                print(f" Structured: {s} items | Native: {n} items | Score: {score}")
                # Reuse the extraction the wave already produced; running the
                # batch again would write the same items to the graph twice.
                detail = wave_raw.get(first_offset)
                if detail is None:
                    # The first batch failed inside the wave: retry once so the
                    # underlying error reaches the caller.
                    detail = process_one_batch(client, writer, wave[0][2], 0, True,
                                               source_chat=state_key)[3]
                if detail.get("structured"):
                    print(f"\n === Structured ===")
                    for k, v in detail["structured"].items():
                        if isinstance(v, list) and v:
                            print(f" {k}: {json.dumps(v, indent=2)[:500]}")
                if detail.get("native"):
                    print(f"\n === Native ===")
                    print(json.dumps(detail["native"], indent=2)[:1000])
                return
616
+
617
+
618
def show_stats(neo4j_config) -> None:
    """Print node/relationship counts from Neo4j plus saved pipeline totals.

    Args:
        neo4j_config: Mapping with ``"uri"``, ``"user"`` and ``"password"``
            keys used to open the driver.

    Returns:
        None. Everything is written to stdout.
    """
    from neo4j import GraphDatabase
    driver = GraphDatabase.driver(neo4j_config["uri"],
                                  auth=(neo4j_config["user"], neo4j_config["password"]))
    # try/finally so the driver is released even when a query raises.
    try:
        with driver.session() as s:
            total = s.run("MATCH (n) RETURN count(n) as c").single()["c"]
            rels = s.run("MATCH ()-[r]->() RETURN count(r) as c").single()["c"]
            print(f"\nKnowledge Graph Statistics")
            print(f"{'='*50}")
            print(f"Total nodes: {total}")
            print(f"Total relationships: {rels}")
            labels = s.run("MATCH (n) RETURN DISTINCT labels(n)[0] as l, count(n) as c ORDER BY c DESC").data()
            print(f"\nBy type:")
            for r in labels:
                print(f" {r['l']}: {r['c']}")
            rel_types = s.run("MATCH ()-[r]->() RETURN type(r) as t, count(r) as c ORDER BY c DESC LIMIT 15").data()
            if rel_types:
                print(f"\nRelationships:")
                for r in rel_types:
                    print(f" {r['t']}: {r['c']}")
    finally:
        driver.close()

    # The remaining figures come from the saved state and refinement queue, not the graph.
    state = load_state()
    print(f"\nPipeline: {state.get('total_batches',0)} batches, {state.get('total_items',0)} items")
    novel = state.get("novel_types", [])
    if novel:
        print(f"Novel types: {len(novel)}")
    rq = load_refinement_queue()
    if rq["batches"]:
        print(f"Refinement queue: {len(rq['batches'])} batches")
647
+
648
+
649
def main() -> None:
    """Command-line entry point for the two-pass knowledge-graph pipeline.

    Parses the CLI flags, then either prints statistics (``--stats``) or
    processes each requested source with ``process_source``. State is saved
    and the writer closed even when processing is interrupted or fails. After
    a non-dry run that wrote at least one node, ``graph-reasoner.py
    precompute-degrees`` is run from this file's directory.
    """
    parser = argparse.ArgumentParser(description="KG V2 — 2-Pass Concurrent Hybrid")
    parser.add_argument("--source",
                        help="comma-separated sources: telegram,whatsapp,email,slack,imessage (default: all)")
    parser.add_argument("--stats", action="store_true")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--reset", action="store_true")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY)
    parser.add_argument("--test-batch", action="store_true")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    neo4j_config = get_neo4j_config()

    if args.stats:
        show_stats(neo4j_config)
        return

    # Both values are used as range() steps and pool sizes downstream.
    if args.batch_size < 1:
        parser.error("--batch-size must be >= 1")
    if args.concurrency < 1:
        parser.error("--concurrency must be >= 1")

    if args.reset:
        if STATE_FILE.exists():
            STATE_FILE.unlink()
        print("State cleared")

    state = load_state()
    client = OllamaClient(OLLAMA_URL, MODEL)
    client.warmup()
    writer = GraphWriter(neo4j_config["uri"], neo4j_config["user"],
                         neo4j_config["password"], dry_run=args.dry_run)

    all_sources = {
        "telegram": ("TG", TG_DIR),
        "whatsapp": ("WA", WA_DIR),
        "email": ("EM", EMAIL_DIR),
        "slack": ("SL", SLACK_DIR),
        "imessage": ("IM", IMESSAGE_DIR),
    }
    if args.source:
        # Strip whitespace and drop empty entries so "telegram, whatsapp"
        # resolves both names and a stray comma does not map to CHAT_ROOT itself.
        sources = [name.strip() for name in args.source.split(",") if name.strip()]
    else:
        sources = list(all_sources)
    total_calls = args.concurrency * 2  # two extraction passes per batch

    print(f"\nKG V2 — 2-Pass Concurrent Hybrid via Ollama")
    print(f"{'='*60}")
    print(f"Model: {MODEL} | Batch: {args.batch_size} msgs | Concurrency: {args.concurrency} batches ({total_calls} calls)")
    print(f"Sources: {', '.join(sources)} | Dry run: {args.dry_run} | Cost: $0.00")
    print(f"{'='*60}")

    try:
        for src in sources:
            # Unknown names fall back to a directory of that name under CHAT_ROOT.
            icon, d = all_sources.get(src, ("??", CHAT_ROOT / src))
            print(f"\n[{icon}] {src.title()}...")
            process_source(src, d, client, writer, state, args.batch_size,
                           args.concurrency, test_mode=args.test_batch, verbose=args.verbose)
    except KeyboardInterrupt:
        print("\nInterrupted -- state saved")
    except Exception as e:
        print(f"\nError: {e}")
        traceback.print_exc()
    finally:
        # Close the writer even if persisting the state fails.
        try:
            save_state(state)
        finally:
            writer.close()
        avg = client.total_time / max(client.total_calls, 1)
        tps = client.total_tokens / max(client.total_time, 0.1)
        print(f"\n{'='*60}")
        print(f"Summary")
        print(f" Batches: {state.get('total_batches', 0)}")
        print(f" Ollama calls: {client.total_calls} ({avg:.1f}s avg, {tps:.0f} tok/s)")
        print(f" Neo4j: {writer.nodes_written} nodes, {writer.edges_written} edges")
        print(f" Items total: {state.get('total_items', 0)}")
        print(f" Cost: $0.00")
        rq = load_refinement_queue()
        if rq["batches"]:
            print(f" Refinement queue: {len(rq['batches'])} batches")

    if not args.dry_run and writer.nodes_written > 0:
        print("\nRefreshing node degrees for bridge inference...")
        try:
            import subprocess as _sp
            import sys as _sys
            # Run the helper with the interpreter executing this script, so it
            # sees the same environment; fall back to python3 if unknown.
            _r = _sp.run(
                [_sys.executable or "python3",
                 str(Path(__file__).parent / "graph-reasoner.py"), "precompute-degrees"],
                capture_output=True, text=True, timeout=60
            )
            if _r.returncode == 0:
                print(" Degrees refreshed")
            else:
                print(f" Degree refresh failed: {_r.stderr[:200]}")
        except Exception as _e:
            print(f" Degree refresh skipped: {_e}")
736
+
737
# Script entry point: main() runs only when this file is executed directly, not when it is imported.
+ if __name__ == "__main__":
737
+ main()