@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,738 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
KG Extraction V2 — 2-Pass Concurrent Hybrid via Ollama API
|
|
4
|
+
8 batches x 2 passes = 16 concurrent Ollama calls per wave.
|
|
5
|
+
|
|
6
|
+
Pass A: Structured (all 14 types in one prompt)
|
|
7
|
+
Pass B: Native graph discovery (nodes/edges)
|
|
8
|
+
Both run concurrently per batch, 8 batches per wave.
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
python3 kg-preflexor-v2.py # Full extraction
|
|
12
|
+
python3 kg-preflexor-v2.py --source telegram # Only telegram
|
|
13
|
+
python3 kg-preflexor-v2.py --stats # Graph stats
|
|
14
|
+
python3 kg-preflexor-v2.py --dry-run # No Neo4j writes
|
|
15
|
+
python3 kg-preflexor-v2.py --reset # Clear state
|
|
16
|
+
python3 kg-preflexor-v2.py --test-batch # Run 1 batch, show output
|
|
17
|
+
python3 kg-preflexor-v2.py --concurrency 16 # Custom concurrency
|
|
18
|
+
|
|
19
|
+
Environment variables:
|
|
20
|
+
PME_WORKSPACE — workspace root (default: $HOME/pentatonic)
|
|
21
|
+
PME_OLLAMA_URL — Ollama base URL (default: http://localhost:11434)
|
|
22
|
+
PME_OLLAMA_KG_MODEL — model for extraction (default: qwen3:8b)
|
|
23
|
+
PME_NEO4J_URI — Neo4j bolt URI (default: bolt://localhost:7687)
|
|
24
|
+
PME_NEO4J_PASSWORD — Neo4j password (overrides .secrets.json)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import logging
|
|
29
|
+
import json
|
|
30
|
+
import os
|
|
31
|
+
import re
|
|
32
|
+
import time
|
|
33
|
+
import traceback
|
|
34
|
+
import urllib.request
|
|
35
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
36
|
+
from datetime import datetime, timezone
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
from threading import Lock
|
|
39
|
+
from typing import Any, Optional
|
|
40
|
+
|
|
41
|
+
# -- Config --
|
|
42
|
+
# Root of the workspace; every path below is derived from it.
WORKSPACE = Path(os.getenv("PME_WORKSPACE", str(Path.home().joinpath("pentatonic"))))
SECRETS_FILE = WORKSPACE.joinpath(".secrets.json")
STATE_FILE = WORKSPACE.joinpath("data", "kg-preflexor-v2-state.json")
REFINEMENT_FILE = WORKSPACE.joinpath("data", "kg-refinement-queue.json")
LOG_DIR = WORKSPACE.joinpath("logs")

# One sub-directory of chat exports per source.
CHAT_ROOT = WORKSPACE.joinpath("chats")
TG_DIR = CHAT_ROOT.joinpath("telegram")
WA_DIR = CHAT_ROOT.joinpath("whatsapp")
EMAIL_DIR = CHAT_ROOT.joinpath("email")
SLACK_DIR = CHAT_ROOT.joinpath("slack")
IMESSAGE_DIR = CHAT_ROOT.joinpath("imessage")

# Ollama endpoint and extraction model, both overridable via the environment.
OLLAMA_URL = os.getenv("PME_OLLAMA_URL", "http://localhost:11434")
MODEL = os.getenv("PME_OLLAMA_KG_MODEL", "qwen3:8b")
DEFAULT_BATCH_SIZE = 15
DEFAULT_CONCURRENCY = 8  # batches at once (x2 passes = 16 Ollama calls)

# Lower-case phrases counted by is_decision_dense().
DECISION_KEYWORDS = [
    "decided",
    "decision",
    "let's go with",
    "switching to",
    "approved",
    "rejected",
    "committed",
    "promise",
    "deadline",
    "budget",
    "investment",
    "contract",
    "agreement",
    "strategy",
    "pivot",
    "cancelled",
    "postponed",
]
|
|
65
|
+
|
|
66
|
+
# -- Pass Definitions --
|
|
67
|
+
# System prompt for pass A: asks the model for one JSON object holding a fixed
# set of typed arrays. The array names here are the category keys that
# GraphWriter.ingest_structured() iterates over.
PASS_A_SYSTEM = """Extract structured knowledge from chat messages. Output JSON with these arrays (empty array if nothing found):

- persons: [{"name": "str", "role": "str or null"}]
- projects: [{"name": "str", "status": "active|completed|paused|planned|abandoned or null"}]
- systems: [{"name": "str", "type": "service|cron|container|script|api|database or null"}]
- entities: [{"name": "str", "type": "company|tool|place|service|product|platform"}]
- decisions: [{"what": "str", "who": "str", "date": "YYYY-MM-DD or null", "reasoning": "str or null"}]
- commitments: [{"what": "str", "who": "str", "deadline": "YYYY-MM-DD or null", "status": "open|fulfilled|broken|cancelled"}]
- events: [{"name": "str", "date": "YYYY-MM-DD or null", "type": "meeting|deadline|incident|social|travel|appointment or null"}]
- transactions: [{"description": "str", "amount": "str or null", "date": "YYYY-MM-DD or null"}]
- incidents: [{"what_broke": "str", "date": "YYYY-MM-DD or null", "severity": "critical|high|medium|low"}]
- deadlines: [{"description": "str", "date": "YYYY-MM-DD or null", "status": "upcoming|met|missed|cancelled"}]
- topics: [{"name": "str", "category": "technical|personal|business|health|finance|social or null"}]
- lessons: [{"insight": "str", "source": "str or null", "date": "YYYY-MM-DD or null"}]
- preferences: [{"category": "food|tool|workflow|communication|schedule|other", "value": "str", "who": "str"}]
- routines: [{"name": "str", "frequency": "daily|weekly|monthly or null", "description": "str or null"}]

Rules:
- ONLY extract what is explicitly stated in the messages
- Do NOT invent or infer content not shown
- If nothing found for a category, use empty array"""

# System prompt for pass B: asks for a free-form graph as "nodes" and "edges"
# arrays, which map_native() later translates into Neo4j labels and
# relationship types.
PASS_B_SYSTEM = """Analyse these chat messages and extract a knowledge graph. Return JSON with "nodes" and "edges" arrays.
Each node: {"id": "string", "type": "string"}
Each edge: {"source": "string", "relation": "string", "target": "string"}

Find ALL meaningful relationships -- especially:
- Implicit connections between people and projects
- Temporal sequences and causation
- Sentiment and attitude signals
- Technical dependencies
- Any patterns a rigid schema might miss

Rules:
- ONLY extract from the messages shown
- Do NOT invent content not present"""
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# -- Ollama Client --
|
|
106
|
+
class OllamaClient:
    """Small client for Ollama's ``/api/chat`` endpoint that returns parsed JSON.

    Requests are sent non-streaming with ``format: "json"``. The usage
    counters (``total_tokens``, ``total_time``, ``total_calls``) are updated
    under ``self.lock``.
    """

    def __init__(self, base_url=None, model=None):
        """Create a client.

        Args:
            base_url: Ollama base URL; ``None`` selects the module-level
                ``OLLAMA_URL``.
            model: model name; ``None`` selects the module-level ``MODEL``.
        """
        # Defaults are resolved at call time rather than at class definition.
        self.base_url = OLLAMA_URL if base_url is None else base_url
        self.model = MODEL if model is None else model
        self.lock = Lock()
        self.total_tokens = 0
        self.total_time = 0.0
        self.total_calls = 0

    def warmup(self) -> None:
        """Send one tiny request and print how long it took."""
        print(f" Warming up {self.model}...", end=" ", flush=True)
        t0 = time.time()
        self._call("system", "Say OK", 8)
        print(f"done ({time.time() - t0:.1f}s)", flush=True)

    def extract(self, system_prompt, user_prompt, max_tokens=768) -> Optional[dict]:
        """Run one extraction and return the parsed JSON object.

        The request is attempted at most twice. A failed request, unparseable
        text, or JSON that is not an object all count as a failed attempt.

        Returns:
            The parsed dict, or ``None`` when both attempts fail.
        """
        for _attempt in range(2):
            result = self._call(system_prompt, user_prompt, max_tokens)
            if result is None:
                continue
            with self.lock:
                self.total_tokens += result.get("tokens", 0)
                self.total_time += result.get("duration", 0)
                self.total_calls += 1
            data = self._parse_json(result["text"])
            # Callers use dict methods on the result, so a bare list or
            # scalar is treated as a failed attempt instead of being returned.
            if isinstance(data, dict):
                return data
        return None

    def _call(self, system_prompt, user_prompt, max_tokens):
        """POST one chat request.

        Returns:
            ``{"text", "duration", "tokens"}`` on success, ``None`` on any
            failure (the failure is logged at debug level).
        """
        payload = json.dumps({
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "format": "json",
            "stream": False,
            "options": {"num_predict": max_tokens},
        }).encode()
        req = urllib.request.Request(
            f"{self.base_url}/api/chat",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=180) as resp:
                d = json.loads(resp.read())
                return {
                    "text": d.get("message", {}).get("content", ""),
                    # total_duration is divided by 1e9 before being summed into total_time.
                    "duration": d.get("total_duration", 0) / 1e9,
                    "tokens": d.get("eval_count", 0),
                }
        except Exception as e:
            # Best-effort by design: extract() retries once and then gives up.
            logging.debug(f"Suppressed: {e}")
            return None

    def _parse_json(self, text):
        """Parse JSON out of model text.

        Tries, in order: the whole text, the contents of a fenced code block,
        and the span from the first ``{`` to the last ``}``.

        Returns:
            The decoded value, or ``None`` if nothing parses or ``text`` is
            not a string.
        """
        if not isinstance(text, str):
            return None
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
        for pat in [r'```json\s*\n?(.*?)\n?```', r'```\s*\n?(.*?)\n?```']:
            m = re.search(pat, text, re.DOTALL)
            if m:
                try:
                    return json.loads(m.group(1).strip())
                except json.JSONDecodeError:
                    pass
        s, e = text.find("{"), text.rfind("}")
        if s != -1 and e > s:
            try:
                return json.loads(text[s:e + 1])
            except json.JSONDecodeError:
                pass
        return None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# -- Schema Mapper --
|
|
183
|
+
# Lower-cased model predicate -> Neo4j relationship type.
PRED_MAP = {
    "decides": "DECIDED", "decided": "DECIDED", "chose": "DECIDED",
    "builds": "WORKS_ON", "built": "WORKS_ON", "develops": "WORKS_ON",
    "uses": "USES", "used": "USES", "runs": "USES",
    "manages": "MANAGES", "owns": "OWNS",
    "creates": "CREATED", "created": "CREATED",
    "mentions": "DISCUSSED", "discusses": "DISCUSSED",
    "commits": "COMMITTED_TO", "committed": "COMMITTED_TO",
    "breaks": "BROKE", "broke": "BROKE", "crashed": "BROKE",
    "fixes": "FIXED", "fixed": "FIXED", "resolved": "FIXED",
    "causes": "CAUSED", "caused": "CAUSED", "causes_problem": "CAUSED",
    "depends_on": "DEPENDS_ON", "requires": "DEPENDS_ON",
    "replaces": "LED_TO", "leads_to": "LED_TO",
    "rejects": "DECIDED", "prefers": "PREFERS", "likes": "PREFERS",
    "switches_to": "LED_TO", "connects": "CONNECTS_TO",
    "avoids": "REJECTED", "cancels": "CANCELLED",
}

# Lower-cased model node type -> Neo4j label.
TYPE_MAP = {
    "person": "Person", "human": "Person", "user": "Person", "agent": "Person",
    "project": "Project", "feature": "Project", "task": "Project",
    "system": "System", "service": "System", "tool": "System", "script": "System",
    "database": "System", "cron": "System", "container": "System", "api": "System",
    "company": "Entity", "organisation": "Entity", "organization": "Entity",
    "place": "Entity", "platform": "Entity", "product": "Entity",
    "topic": "Topic", "subject": "Topic", "event": "Event",
    "meeting": "Event", "routine": "Routine", "decision": "Decision",
    "lesson": "Lesson", "preference": "Preference", "deadline": "Deadline",
    "commitment": "Commitment", "incident": "Incident",
    "transaction": "Transaction", "subscription": "Transaction",
    "version": "System", "schedule": "Routine", "date": "Event",
    "data": "System", "briefing": "Event",
}


def _kg_text(value) -> str:
    """Return ``value`` as a stripped string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def map_native(data) -> tuple:
    """Map native nodes/edges to Neo4j ops. Returns (ops, novel_types).

    ``ops`` holds ``("node", label, name)`` and
    ``("edge", rel_type, source, target)`` tuples, nodes first. ``novel``
    holds ``("node_type" | "edge_type", raw_value, context)`` for every type
    or relation that is not in ``TYPE_MAP`` / ``PRED_MAP``.

    Model output is untrusted: ``null`` or non-string fields, ``null`` arrays
    and non-dict entries are tolerated instead of raising.
    """
    ops, novel = [], []
    if not isinstance(data, dict):
        return ops, novel

    for node in data.get("nodes") or []:
        if not isinstance(node, dict):
            continue
        nid = _kg_text(node.get("id"))
        if not nid:
            continue
        raw_type = node.get("type", "entity")
        # An explicit null is treated like a missing key.
        ntype = ("entity" if raw_type is None else str(raw_type)).lower()
        label = TYPE_MAP.get(ntype, "Entity")
        if ntype and ntype not in TYPE_MAP:
            novel.append(("node_type", ntype, nid))
        ops.append(("node", label, nid))

    for edge in data.get("edges") or []:
        if not isinstance(edge, dict):
            continue
        src, tgt = _kg_text(edge.get("source")), _kg_text(edge.get("target"))
        if not src or not tgt:
            continue
        raw_rel = edge.get("relation", "RELATES_TO")
        rel = ("RELATES_TO" if raw_rel is None else str(raw_rel)).lower().replace(" ", "_")
        # Unknown relations are reduced to [A-Z0-9_] because the result is
        # interpolated into Cypher as a relationship type.
        neo_rel = PRED_MAP.get(rel, re.sub(r"[^A-Z0-9_]", "_", rel.upper()) or "RELATES_TO")
        if neo_rel[0].isdigit():
            # An unquoted Cypher name cannot start with a digit.
            neo_rel = f"REL_{neo_rel}"
        if rel and rel not in PRED_MAP:
            novel.append(("edge_type", rel, f"{src} -> {tgt}"))
        ops.append(("edge", neo_rel, src, tgt))

    return ops, novel
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# -- Neo4j Writer --
|
|
243
|
+
class GraphWriter:
    """Writes extraction results to Neo4j, or only counts them in dry-run mode.

    ``nodes_written``, ``edges_written`` and ``novel_types`` are updated
    under ``self.lock``.
    """

    def __init__(self, uri, user, password, dry_run=False):
        """Open the driver and create indexes, unless ``dry_run`` is set."""
        self.dry_run = dry_run
        self.driver = None
        self.lock = Lock()
        self.nodes_written = 0
        self.edges_written = 0
        self.novel_types = []
        if not dry_run:
            # Imported here so the neo4j package is only needed for real writes.
            from neo4j import GraphDatabase
            self.driver = GraphDatabase.driver(uri, auth=(user, password))
            self._indexes()

    def close(self) -> None:
        """Close the driver if one was opened."""
        if self.driver:
            self.driver.close()

    def _indexes(self):
        """Create the lookup indexes; a failing statement is logged and skipped."""
        idxs = [
            "CREATE INDEX IF NOT EXISTS FOR (p:Person) ON (p.name)",
            "CREATE INDEX IF NOT EXISTS FOR (p:Project) ON (p.name)",
            "CREATE INDEX IF NOT EXISTS FOR (e:Entity) ON (e.name)",
            "CREATE INDEX IF NOT EXISTS FOR (s:System) ON (s.name)",
            "CREATE INDEX IF NOT EXISTS FOR (t:Topic) ON (t.name)",
            "CREATE INDEX IF NOT EXISTS FOR (d:Decision) ON (d.what)",
            "CREATE INDEX IF NOT EXISTS FOR (i:Incident) ON (i.what_broke)",
            "CREATE INDEX IF NOT EXISTS FOR (l:Lesson) ON (l.insight)",
            "CREATE INDEX IF NOT EXISTS FOR (c:Commitment) ON (c.what)",
            "CREATE INDEX IF NOT EXISTS FOR (e:Event) ON (e.name)",
            "CREATE INDEX IF NOT EXISTS FOR (r:Routine) ON (r.name)",
            "CREATE INDEX IF NOT EXISTS FOR (d:Deadline) ON (d.description)",
        ]
        with self.driver.session() as s:
            for idx in idxs:
                try:
                    s.run(idx)
                except Exception as e:
                    logging.debug(f"Suppressed: {e}")

    def _run(self, query, **params):
        """Run one query in its own session; does nothing in dry-run mode.

        ``None`` parameter values are replaced with ``""`` before sending.
        """
        if self.dry_run:
            return
        clean = {k: (v if v is not None else "") for k, v in params.items()}
        with self.driver.session() as s:
            s.run(query, **clean)

    def ingest_structured(self, data, source_chat=None) -> int:
        """Merge the typed arrays from pass A into the graph.

        Args:
            data: dict of category name -> list of item dicts.
            source_chat: optional source label, stored on most node types
                and used for ``EXTRACTED_FROM`` links.

        Returns:
            Number of items whose node MERGE ran without raising.
        """
        if not data or not isinstance(data, dict):
            return 0
        count = 0
        src = source_chat or ""

        # Category -> property that identifies the node (its MERGE key).
        # Looking the key up per category is what lets incidents through:
        # their key is "what_broke", which a generic name/what/... chain misses.
        primary_fields = {
            "persons": "name",
            "projects": "name",
            "systems": "name",
            "entities": "name",
            "decisions": "what",
            "commitments": "what",
            "events": "name",
            "transactions": "description",
            "incidents": "what_broke",
            "deadlines": "description",
            "topics": "name",
            "lessons": "insight",
            "preferences": "value",
            "routines": "name",
        }

        # Category -> (field naming the owner or None, relationship type, label).
        link_map = {
            "decisions": ("who", "MADE_DECISION", "Decision"),
            "commitments": ("who", "HAS_COMMITMENT", "Commitment"),
            "events": (None, "PARTICIPATED_IN", "Event"),
            "transactions": (None, "MADE_TRANSACTION", "Transaction"),
            "incidents": (None, "EXPERIENCED", "Incident"),
            "deadlines": (None, "HAS_DEADLINE", "Deadline"),
            "lessons": (None, "LEARNED", "Lesson"),
            "preferences": ("who", "HAS_PREFERENCE", "Preference"),
            "routines": (None, "FOLLOWS_ROUTINE", "Routine"),
        }

        # Each handler receives the item and its stripped primary value.
        handlers = {
            "persons": lambda p, pk: self._run(
                "MERGE (n:Person {name: $name}) SET n.role = $role",
                name=pk, role=p.get("role", "")),
            "projects": lambda p, pk: self._run(
                "MERGE (n:Project {name: $name}) SET n.status = $s, n.updated_at = datetime()",
                name=pk, s=p.get("status", "active")),
            "systems": lambda p, pk: self._run(
                "MERGE (n:System {name: $name}) SET n.type = $t",
                name=pk, t=p.get("type", "")),
            "entities": lambda p, pk: self._run(
                "MERGE (n:Entity {name: $name}) SET n.type = $t",
                name=pk, t=p.get("type", "")),
            "decisions": lambda p, pk: self._run(
                "MERGE (n:Decision {what: $w}) SET n.who=$who, n.date=$d, n.reasoning=$r, n.source_chat=$src",
                w=pk, who=p.get("who", ""), d=p.get("date", ""), r=p.get("reasoning", ""), src=src),
            "commitments": lambda p, pk: self._run(
                "MERGE (n:Commitment {what: $w}) SET n.who=$who, n.deadline=$d, n.status=$s, n.source_chat=$src",
                w=pk, who=p.get("who", ""), d=p.get("deadline", ""), s=p.get("status", "open"), src=src),
            "events": lambda p, pk: self._run(
                "MERGE (n:Event {name: $name}) SET n.date=$d, n.type=$t, n.source_chat=$src",
                name=pk, d=p.get("date", ""), t=p.get("type", ""), src=src),
            "transactions": lambda p, pk: self._run(
                "MERGE (n:Transaction {description: $d}) SET n.amount=$a, n.date=$dt, n.source_chat=$src",
                d=pk, a=p.get("amount", ""), dt=p.get("date", ""), src=src),
            "incidents": lambda p, pk: self._run(
                "MERGE (n:Incident {what_broke: $w}) SET n.date=$d, n.severity=$s, n.source_chat=$src",
                w=pk, d=p.get("date", ""), s=p.get("severity", "medium"), src=src),
            "deadlines": lambda p, pk: self._run(
                "MERGE (n:Deadline {description: $d}) SET n.date=$dt, n.status=$s, n.source_chat=$src",
                d=pk, dt=p.get("date", ""), s=p.get("status", "upcoming"), src=src),
            "topics": lambda p, pk: self._run(
                "MERGE (n:Topic {name: $name}) SET n.category=$c, n.source_chat=$src",
                name=pk, c=p.get("category", ""), src=src),
            "lessons": lambda p, pk: self._run(
                "MERGE (n:Lesson {insight: $i}) SET n.source=$s, n.date=$d, n.source_chat=$src",
                i=pk, s=p.get("source", ""), d=p.get("date", ""), src=src),
            "preferences": lambda p, pk: self._run(
                "MERGE (n:Preference {category: $c, value: $v}) SET n.who=$w, n.source_chat=$src",
                c=p.get("category", "other"), v=pk, w=p.get("who", ""), src=src),
            "routines": lambda p, pk: self._run(
                "MERGE (n:Routine {name: $name}) SET n.frequency=$f, n.description=$d, n.source_chat=$src",
                name=pk, f=p.get("frequency", ""), d=p.get("description", ""), src=src),
        }

        for key, handler in handlers.items():
            primary_field = primary_fields[key]
            # "or []" tolerates a category the model emitted as null.
            for item in data.get(key) or []:
                if not isinstance(item, dict):
                    continue
                primary = item.get(primary_field)
                if not primary or not str(primary).strip():
                    continue
                primary_val = str(primary).strip()
                try:
                    handler(item, primary_val)
                    count += 1
                    if key in link_map:
                        who_field, rel_type, label = link_map[key]
                        who = item.get(who_field, "") if who_field else None
                        if who and str(who).strip():
                            # Link the owner to the node; MATCH means the link
                            # is only made when that Person already exists.
                            self._run(f"""
                                MATCH (p:Person {{name: $who}})
                                MATCH (n:{label} {{{primary_field}: $pval}})
                                MERGE (p)-[r:{rel_type}]->(n)
                                SET r.updated_at = datetime()
                            """, who=str(who).strip(), pval=primary_val)
                        elif source_chat:
                            self._run(f"""
                                MERGE (src:Entity {{name: $src, type: 'chat_source'}})
                                WITH src
                                MATCH (n:{label} {{{primary_field}: $pval}})
                                MERGE (src)-[r:EXTRACTED_FROM]->(n)
                                SET r.updated_at = datetime()
                            """, src=source_chat, pval=primary_val)
                except Exception as e:
                    # Best-effort: one bad item must not stop the batch.
                    logging.debug(f"Suppressed: {e}")

        with self.lock:
            self.nodes_written += count
        return count

    def ingest_native(self, data) -> int:
        """Merge the free-form nodes/edges from pass B into the graph.

        Returns:
            Number of operations that ran without raising. The
            ``nodes_written`` / ``edges_written`` counters grow by the same
            successful operations only.
        """
        if not data:
            return 0
        ops, novel = map_native(data)
        nodes_ok = 0
        edges_ok = 0
        for op in ops:
            try:
                # op[1] (label / relationship type) is interpolated into the
                # query text, so it must be the value produced by map_native().
                if op[0] == "node":
                    self._run(f"MERGE (n:{op[1]} {{name: $name}})", name=op[2])
                    nodes_ok += 1
                elif op[0] == "edge":
                    self._run(f"""
                        MATCH (a {{name: $src}}) MATCH (b {{name: $tgt}})
                        MERGE (a)-[r:{op[1]}]->(b) SET r.updated_at = datetime()
                    """, src=op[2], tgt=op[3])
                    edges_ok += 1
            except Exception as e:
                logging.debug(f"Suppressed: {e}")
        with self.lock:
            self.nodes_written += nodes_ok
            self.edges_written += edges_ok
            self.novel_types.extend(novel)
        return nodes_ok + edges_ok
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# -- Message Loading --
|
|
394
|
+
def load_messages(chat_dir, offset=0) -> list:
    """Load every message from the ``*.jsonl`` files in ``chat_dir``.

    Files are read in sorted name order. Blank lines and lines that are not
    valid JSON are skipped.

    Args:
        chat_dir: ``Path`` of the directory to scan.
        offset: number of leading messages to drop from the combined list.

    Returns:
        The decoded messages from position ``offset`` onward.
    """
    msgs = []
    for path in sorted(chat_dir.glob("*.jsonl")):
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform default encoding. Undecodable bytes are replaced rather
        # than aborting the load, matching how bad JSON lines are skipped.
        with open(path, encoding="utf-8", errors="replace") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    msgs.append(json.loads(line))
                except json.JSONDecodeError:
                    continue
    return msgs[offset:]
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def format_batch(messages) -> str:
    """Render messages as ``[timestamp] sender: body`` lines joined by newlines.

    For each field the first key present is used: ``timestamp``/``date``/``t``,
    ``sender``/``from``/``author`` (default ``"Unknown"``) and
    ``body``/``text``/``message``. Messages with an empty or whitespace-only
    body are skipped, and bodies longer than 800 characters are cut and
    marked ``...[truncated]``.
    """
    lines = []
    for msg in messages:
        ts = msg.get("timestamp", msg.get("date", msg.get("t", "")))
        sender = msg.get("sender", msg.get("from", msg.get("author", "Unknown")))
        body = msg.get("body", msg.get("text", msg.get("message", "")))
        if not body:
            continue
        if not isinstance(body, str):
            # A structured body (e.g. a list) is stringified so the string
            # operations below cannot raise.
            body = str(body)
        if not body.strip():
            continue
        if len(body) > 800:
            body = body[:800] + "...[truncated]"
        lines.append(f"[{ts}] {sender}: {body}")
    return "\n".join(lines)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def is_decision_dense(text, keywords=None, threshold=3) -> bool:
    """Report whether ``text`` contains at least ``threshold`` distinct keywords.

    Matching is a case-insensitive substring test, so keywords are expected
    in lower case.

    Args:
        text: text to scan.
        keywords: phrases to look for; ``None`` selects ``DECISION_KEYWORDS``.
        threshold: minimum number of distinct keywords that must occur.
    """
    if keywords is None:
        keywords = DECISION_KEYWORDS
    lowered = text.lower()  # lower-case once, not once per keyword
    return sum(1 for kw in keywords if kw in lowered) >= threshold
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
# -- State --
|
|
428
|
+
def load_state() -> Any:
    """Return the saved run state, or a fresh empty state.

    A missing, unreadable or malformed ``STATE_FILE`` falls back to the
    empty state; the error is logged at debug level.
    """
    if STATE_FILE.exists():
        try:
            # The with-block closes the handle even when decoding fails.
            with open(STATE_FILE, encoding="utf-8") as fh:
                return json.load(fh)
        except Exception as e:
            logging.debug(f"Suppressed: {e}")
    return {"sources": {}, "last_run": None, "total_batches": 0,
            "total_items": 0, "novel_types": []}
|
|
437
|
+
|
|
438
|
+
def save_state(state) -> None:
    """Stamp ``state["last_run"]`` with the current UTC time and write it to ``STATE_FILE``.

    The parent directory is created if needed. Values JSON cannot encode
    are written via ``str()``.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    # The with-block flushes and closes the file before returning.
    with open(STATE_FILE, "w", encoding="utf-8") as fh:
        json.dump(state, fh, indent=2, default=str)
|
|
442
|
+
|
|
443
|
+
def load_refinement_queue() -> Any:
    """Return the saved refinement queue, or ``{"batches": []}``.

    A missing, unreadable or malformed ``REFINEMENT_FILE`` falls back to the
    empty queue; the error is logged at debug level.
    """
    if REFINEMENT_FILE.exists():
        try:
            # The with-block closes the handle even when decoding fails.
            with open(REFINEMENT_FILE, encoding="utf-8") as fh:
                return json.load(fh)
        except Exception as e:
            logging.debug(f"Suppressed: {e}")
    return {"batches": []}
|
|
451
|
+
|
|
452
|
+
def save_refinement_queue(q) -> None:
    """Write the refinement queue *q* to REFINEMENT_FILE as indented JSON.

    The parent directory is created if needed.
    """
    REFINEMENT_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Close (and therefore flush) the file deterministically instead of
    # relying on garbage collection of an anonymous handle.
    with open(REFINEMENT_FILE, "w") as fh:
        json.dump(q, fh, indent=2)
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
# -- Secrets --
|
|
458
|
+
def get_neo4j_config() -> dict:
    """Resolve the Neo4j connection settings.

    Lookup order:
      1. ``PME_NEO4J_PASSWORD`` (with ``PME_NEO4J_URI`` or the local default
         URI) from the environment.
      2. SECRETS_FILE: either a nested ``"neo4j"`` object with ``password``
         (and optional ``uri``/``user``), or a flat ``"neo4j_password"`` key.
      3. A built-in default with user ``neo4j`` and password ``password``.

    Returns:
        Dict with the keys ``uri``, ``user`` and ``password``.
    """
    uri = os.environ.get("PME_NEO4J_URI", "bolt://localhost:7687")
    pw = os.environ.get("PME_NEO4J_PASSWORD", "")
    if pw:
        return {"uri": uri, "user": "neo4j", "password": pw}
    if SECRETS_FILE.exists():
        try:
            # Context manager so the secrets file handle is always closed.
            with open(SECRETS_FILE) as fh:
                secrets = json.load(fh)
            neo4j = secrets.get("neo4j", {})
            if isinstance(neo4j, dict) and neo4j.get("password"):
                return {"uri": neo4j.get("uri", uri),
                        "user": neo4j.get("user", "neo4j"), "password": neo4j["password"]}
            pw = secrets.get("neo4j_password", "")
            if pw:
                return {"uri": uri, "user": "neo4j", "password": pw}
        except Exception as e:
            # Still best effort, but leave a trace so a broken secrets file
            # is not silently replaced by the default credentials.
            logging.debug(f"Suppressed: {e}")
    return {"uri": uri, "user": "neo4j", "password": "password"}
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
# -- Single Batch Processing --
|
|
479
|
+
def process_one_batch(client, writer, batch_text, batch_id, verbose=False, source_chat=None) -> tuple:
    """Run both extraction passes on one batch concurrently and ingest the output.

    Args:
        client: Object exposing ``extract(system_prompt, text, limit)``; the
            third argument is 768 for pass A and 1024 for pass B (presumably
            a token budget -- confirm against the client).
        writer: Object exposing ``ingest_structured`` and ``ingest_native``.
        batch_text: Formatted text of the batch.
        batch_id: Accepted for the caller's bookkeeping; not used here.
        verbose: Accepted for interface compatibility; not used here.
        source_chat: Forwarded to ``writer.ingest_structured``.

    Returns:
        ``(structured_count, native_count, score, results)`` where *results*
        maps ``"structured"`` and ``"native"`` to the raw pass outputs and
        *score* is a 0-100 quality heuristic.
    """
    with ThreadPoolExecutor(max_workers=2) as executor:
        fa = executor.submit(client.extract, PASS_A_SYSTEM, batch_text, 768)
        fb = executor.submit(client.extract, PASS_B_SYSTEM, batch_text, 1024)
        structured = fa.result()
        native = fb.result()
    results = {"structured": structured, "native": native}

    # Empty or failed passes are not handed to the writer.
    s_count = writer.ingest_structured(structured, source_chat=source_chat) if structured else 0
    n_count = writer.ingest_native(native) if native else 0

    score = 0
    if structured:
        # 7 points per non-empty list field, capped at 50.
        filled = sum(1 for v in structured.values() if isinstance(v, list) and v)
        score += min(filled * 7, 50)
    if native:
        # ``or []`` covers keys that are present with a null value, which
        # would otherwise make ``len`` raise.
        if len(native.get("nodes") or []) > 0:
            score += 25
        if len(native.get("edges") or []) > 0:
            score += 25

    return s_count, n_count, score, results
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
# -- Main Processing --
|
|
508
|
+
def process_source(source_type, chat_dir, client, writer, state, batch_size,
                   concurrency, test_mode=False, verbose=False):
    """Process every chat directory of one source in concurrent waves.

    For each sub-directory of *chat_dir*, messages after the stored offset
    are loaded, split into batches of *batch_size* and processed
    *concurrency* batches at a time. After each wave the per-chat offset,
    the global counters and the refinement queue are persisted. Batches that
    are decision-dense or score below 40 are appended to the refinement
    queue.

    Args:
        source_type: Source name, used in state keys and log output.
        chat_dir: Directory containing one sub-directory per chat.
        client: Extraction client passed through to ``process_one_batch``.
        writer: Graph writer passed through to ``process_one_batch``; its
            ``novel_types`` list is drained after every wave.
        state: Pipeline state dict, updated in place.
        batch_size: Messages per batch.
        concurrency: Batches processed in parallel per wave.
        test_mode: Process short chats too and stop after the first wave,
            printing the first batch's extraction output.
        verbose: Forwarded to ``process_one_batch``.
    """
    if not chat_dir.exists():
        print(f" No {source_type} directory found")
        return

    chat_dirs = [d for d in chat_dir.iterdir() if d.is_dir()]
    print(f" Found {len(chat_dirs)} {source_type} chats")
    refinement_queue = load_refinement_queue()

    for cdir in sorted(chat_dirs):
        chat_id = cdir.name
        state_key = f"{source_type}:{chat_id}"
        chat_state = state["sources"].get(state_key, {"offset": 0, "processed": 0})
        offset = chat_state.get("offset", 0)
        # Running total for this chat. Deriving it from the stale chat_state
        # on every batch would only ever count the most recent batch.
        processed = chat_state.get("processed", 0)

        messages = load_messages(cdir, offset)
        # Outside test mode, wait until at least five new messages exist.
        if not messages or (len(messages) < 5 and not test_mode):
            continue

        total = len(messages)
        num_batches = (total + batch_size - 1) // batch_size
        print(f"\n {state_key}: {total} msgs from offset {offset} ({num_batches} batches)")

        # (offset within `messages`, raw messages, formatted text); batches
        # whose formatted text is empty are dropped.
        batches = []
        for i in range(0, total, batch_size):
            batch = messages[i:i + batch_size]
            text = format_batch(batch)
            if text.strip():
                batches.append((i, batch, text))

        wave_total = (len(batches) + concurrency - 1) // concurrency
        for wave_num, wave_start in enumerate(range(0, len(batches), concurrency), start=1):
            wave = batches[wave_start:wave_start + concurrency]
            print(f" Wave {wave_num}/{wave_total} ({len(wave)} batches)...", end=" ", flush=True)

            t0 = time.time()
            wave_items = 0
            wave_results = {}
            # Raw extraction output per batch offset, kept so test mode can
            # display it without processing the batch a second time.
            wave_outputs = {}

            with ThreadPoolExecutor(max_workers=concurrency) as executor:
                futures = {}
                for idx, (batch_offset, batch, text) in enumerate(wave):
                    f = executor.submit(process_one_batch, client, writer, text, idx, verbose, source_chat=state_key)
                    futures[f] = (batch_offset, batch, text)

                for f in as_completed(futures):
                    batch_offset, batch, text = futures[f]
                    try:
                        s_count, n_count, score, results = f.result()
                        wave_items += s_count + n_count
                        wave_results[batch_offset] = (s_count, n_count, score, text)
                        wave_outputs[batch_offset] = results
                    except Exception as e:
                        # A failed batch is recorded with score 0, which
                        # routes it into the refinement queue below.
                        print("X", end="", flush=True)
                        logging.debug(f"Suppressed: {e}")
                        wave_results[batch_offset] = (0, 0, 0, text)

            elapsed = time.time() - t0

            for batch_offset, batch, text in wave:
                s_count, n_count, score, _ = wave_results.get(batch_offset, (0, 0, 0, text))

                dense = is_decision_dense(text)
                if dense or score < 40:
                    refinement_queue["batches"].append({
                        "source": state_key, "offset": offset + batch_offset,
                        "size": len(batch), "score": score,
                        "decision_dense": dense,
                        "timestamp": datetime.now(timezone.utc).isoformat()
                    })

                processed += len(batch)
                state["sources"][state_key] = {
                    "offset": offset + batch_offset + len(batch),
                    "processed": processed
                }

            state["total_batches"] = state.get("total_batches", 0) + len(wave)
            state["total_items"] = state.get("total_items", 0) + wave_items
            save_state(state)
            save_refinement_queue(refinement_queue)

            print(f"OK {wave_items} items, {elapsed:.1f}s ({elapsed/len(wave):.1f}s/batch)", flush=True)

            if writer.novel_types:
                for nt in writer.novel_types[-5:]:
                    print(f" NEW {nt[0]}: {nt[1]} (from: {nt[2]})")
                state.setdefault("novel_types", []).extend([
                    {"type": t, "value": v, "example": e}
                    for t, v, e in writer.novel_types[-20:]
                ])
                writer.novel_types = []

            if test_mode:
                print("\n Test mode -- showing first batch detail:")
                first_offset = wave[0][0]
                s, n, score, _ = wave_results[first_offset]
                print(f" Structured: {s} items | Native: {n} items | Score: {score}")
                detail = wave_outputs.get(first_offset)
                if detail is None:
                    # The first batch failed inside the wave; run it again so
                    # the underlying error surfaces to the caller.
                    detail = process_one_batch(client, writer, wave[0][2], 0, True, source_chat=state_key)[3]
                _show_test_batch_detail(detail)
                return


def _show_test_batch_detail(results) -> None:
    """Print the truncated structured and native extraction output of one batch."""
    if results.get("structured"):
        print("\n === Structured ===")
        for k, v in results["structured"].items():
            if isinstance(v, list) and v:
                print(f" {k}: {json.dumps(v, indent=2)[:500]}")
    if results.get("native"):
        print("\n === Native ===")
        print(json.dumps(results["native"], indent=2)[:1000])
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def show_stats(neo4j_config) -> None:
    """Print node, relationship and pipeline statistics.

    Args:
        neo4j_config: Dict with ``uri``, ``user`` and ``password`` used to
            open the Neo4j driver.
    """
    # Imported lazily so the rest of the script does not need the driver.
    from neo4j import GraphDatabase
    driver = GraphDatabase.driver(neo4j_config["uri"],
                                  auth=(neo4j_config["user"], neo4j_config["password"]))
    try:
        with driver.session() as s:
            total = s.run("MATCH (n) RETURN count(n) as c").single()["c"]
            rels = s.run("MATCH ()-[r]->() RETURN count(r) as c").single()["c"]
            print("\nKnowledge Graph Statistics")
            print("=" * 50)
            print(f"Total nodes: {total}")
            print(f"Total relationships: {rels}")
            labels = s.run("MATCH (n) RETURN DISTINCT labels(n)[0] as l, count(n) as c ORDER BY c DESC").data()
            print("\nBy type:")
            for r in labels:
                print(f" {r['l']}: {r['c']}")
            rel_types = s.run("MATCH ()-[r]->() RETURN type(r) as t, count(r) as c ORDER BY c DESC LIMIT 15").data()
            if rel_types:
                print("\nRelationships:")
                for r in rel_types:
                    print(f" {r['t']}: {r['c']}")
        state = load_state()
        print(f"\nPipeline: {state.get('total_batches',0)} batches, {state.get('total_items',0)} items")
        novel = state.get("novel_types", [])
        if novel:
            print(f"Novel types: {len(novel)}")
        rq = load_refinement_queue()
        if rq["batches"]:
            print(f"Refinement queue: {len(rq['batches'])} batches")
    finally:
        # Release the driver even when a query or a state read fails.
        driver.close()
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
def main() -> None:
    """Command-line entry point for the knowledge-graph ingestion pipeline.

    ``--stats`` prints statistics and exits. Otherwise the selected sources
    are processed, state is saved and the writer closed even on error or
    Ctrl-C, a summary is printed, and -- when nodes were actually written --
    the sibling ``graph-reasoner.py`` script is asked to refresh node
    degrees.
    """
    parser = argparse.ArgumentParser(description="KG V2 — 2-Pass Concurrent Hybrid")
    parser.add_argument("--source",
                        help="comma-separated sources: telegram,whatsapp,email,slack,imessage (default: all)")
    parser.add_argument("--stats", action="store_true")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--reset", action="store_true")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY)
    parser.add_argument("--test-batch", action="store_true")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    neo4j_config = get_neo4j_config()

    if args.stats:
        show_stats(neo4j_config)
        return

    if args.reset:
        if STATE_FILE.exists():
            STATE_FILE.unlink()
        print("State cleared")

    state = load_state()
    client = OllamaClient(OLLAMA_URL, MODEL)
    client.warmup()
    writer = GraphWriter(neo4j_config["uri"], neo4j_config["user"],
                         neo4j_config["password"], dry_run=args.dry_run)

    ALL_SOURCES = {
        "telegram": ("TG", TG_DIR),
        "whatsapp": ("WA", WA_DIR),
        "email": ("EM", EMAIL_DIR),
        "slack": ("SL", SLACK_DIR),
        "imessage": ("IM", IMESSAGE_DIR),
    }
    # Strip whitespace and drop empty entries so "--source a, b" works and a
    # stray comma does not resolve to CHAT_ROOT itself.
    requested = [s.strip() for s in args.source.split(",") if s.strip()] if args.source else []
    sources = requested or list(ALL_SOURCES.keys())
    # Every batch issues two extraction calls (one per pass).
    total_calls = args.concurrency * 2

    print("\nKG V2 — 2-Pass Concurrent Hybrid via Ollama")
    print("=" * 60)
    print(f"Model: {MODEL} | Batch: {args.batch_size} msgs | Concurrency: {args.concurrency} batches ({total_calls} calls)")
    print(f"Sources: {', '.join(sources)} | Dry run: {args.dry_run} | Cost: $0.00")
    print("=" * 60)

    try:
        for src in sources:
            # Unknown source names fall back to a directory of that name
            # under CHAT_ROOT.
            icon, d = ALL_SOURCES.get(src, ("??", CHAT_ROOT / src))
            print(f"\n[{icon}] {src.title()}...")
            process_source(src, d, client, writer, state, args.batch_size,
                           args.concurrency, test_mode=args.test_batch, verbose=args.verbose)
    except KeyboardInterrupt:
        print("\nInterrupted -- state saved")
    except Exception as e:
        print(f"\nError: {e}")
        traceback.print_exc()
    finally:
        save_state(state)
        writer.close()

    # Guard the divisions for runs that made no calls.
    avg = client.total_time / max(client.total_calls, 1)
    tps = client.total_tokens / max(client.total_time, 0.1)
    print(f"\n{'='*60}")
    print("Summary")
    print(f" Batches: {state.get('total_batches', 0)}")
    print(f" Ollama calls: {client.total_calls} ({avg:.1f}s avg, {tps:.0f} tok/s)")
    print(f" Neo4j: {writer.nodes_written} nodes, {writer.edges_written} edges")
    print(f" Items total: {state.get('total_items', 0)}")
    print(" Cost: $0.00")
    rq = load_refinement_queue()
    if rq["batches"]:
        print(f" Refinement queue: {len(rq['batches'])} batches")

    if not args.dry_run and writer.nodes_written > 0:
        print("\nRefreshing node degrees for bridge inference...")
        try:
            import subprocess as _sp
            import sys as _sys
            # Use the running interpreter so the child sees the same
            # environment and packages; a bare "python3" may resolve to a
            # different installation or not exist at all.
            _r = _sp.run(
                [_sys.executable, str(Path(__file__).parent / "graph-reasoner.py"), "precompute-degrees"],
                capture_output=True, text=True, timeout=60
            )
            if _r.returncode == 0:
                print(" Degrees refreshed")
            else:
                print(f" Degree refresh failed: {_r.stderr[:200]}")
        except Exception as _e:
            # The refresh is optional post-processing; never fail the run.
            print(f" Degree refresh skipped: {_e}")
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|