ltcai 0.1.4 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -0
- package/docs/OPERATIONS.md +149 -0
- package/knowledge_graph.py +815 -0
- package/ltcai_cli.py +45 -1
- package/package.json +15 -3
- package/requirements.txt +1 -0
- package/server.py +805 -44
- package/skills/SKILL_TEMPLATE.md +57 -0
- package/skills/code_review/SKILL.md +76 -0
- package/skills/data_analysis/SKILL.md +79 -0
- package/skills/file_edit/SKILL.md +68 -0
- package/skills/web_search/SKILL.md +74 -0
- package/static/account.html +14 -2
- package/static/admin.html +225 -6
- package/static/chat.html +644 -140
- package/static/graph.html +612 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/manifest.json +35 -0
- package/static/sw.js +51 -0
- package/telegram_bot.py +631 -217
- package/tests/__init__.py +0 -0
- package/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/tests/integration/__init__.py +0 -0
- package/tests/integration/test_api.py +94 -0
- package/tests/unit/__init__.py +0 -0
- package/tests/unit/__pycache__/__init__.cpython-314.pyc +0 -0
- package/tests/unit/__pycache__/test_security.cpython-314-pytest-9.0.3.pyc +0 -0
- package/tests/unit/__pycache__/test_tools.cpython-314-pytest-9.0.3.pyc +0 -0
- package/tests/unit/test_security.py +125 -0
- package/tests/unit/test_tools.py +127 -0
- package/tools.py +169 -13
|
@@ -0,0 +1,815 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SQLite knowledge graph for Lattice AI workspace memory.
|
|
3
|
+
|
|
4
|
+
The graph keeps raw event JSON, normalized node metadata, and edges in one
|
|
5
|
+
portable database so it can later migrate to Neo4j/Postgres without changing
|
|
6
|
+
the ingestion contract.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
13
|
+
import shutil
|
|
14
|
+
import sqlite3
|
|
15
|
+
import zipfile
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, List, Optional
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Version of the SQLite schema; _init_db stores it in graph_meta under "schema_version".
GRAPH_SCHEMA_VERSION = 1
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _now() -> str:
|
|
25
|
+
return datetime.now().isoformat()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _json(data: Optional[Dict[str, Any]]) -> str:
|
|
29
|
+
return json.dumps(data or {}, ensure_ascii=False, sort_keys=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _safe_loads(raw: Optional[str]) -> Dict[str, Any]:
|
|
33
|
+
"""Tolerantly parse a metadata_json column — returns {} on corrupt rows."""
|
|
34
|
+
if not raw:
|
|
35
|
+
return {}
|
|
36
|
+
try:
|
|
37
|
+
value = json.loads(raw)
|
|
38
|
+
return value if isinstance(value, dict) else {}
|
|
39
|
+
except (json.JSONDecodeError, TypeError) as e:
|
|
40
|
+
logging.warning("knowledge_graph: corrupt metadata_json (%s) — using empty dict", e)
|
|
41
|
+
return {}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _slug(text: str, max_len: int = 96) -> str:
|
|
45
|
+
value = re.sub(r"\s+", " ", str(text or "")).strip().lower()
|
|
46
|
+
value = re.sub(r"[^0-9a-zA-Z가-힣._:@/-]+", "-", value).strip("-")
|
|
47
|
+
return (value or "untitled")[:max_len]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _sha256_bytes(data: bytes) -> str:
|
|
51
|
+
return hashlib.sha256(data).hexdigest()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _sha256_text(text: str) -> str:
|
|
55
|
+
return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _clean_text(text: str) -> str:
|
|
59
|
+
return re.sub(r"\s+", " ", str(text or "")).strip()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
|
|
63
|
+
cleaned = str(text or "").strip()
|
|
64
|
+
if not cleaned:
|
|
65
|
+
return []
|
|
66
|
+
chunks: List[str] = []
|
|
67
|
+
start = 0
|
|
68
|
+
while start < len(cleaned):
|
|
69
|
+
end = min(len(cleaned), start + size)
|
|
70
|
+
chunks.append(cleaned[start:end])
|
|
71
|
+
if end >= len(cleaned):
|
|
72
|
+
break
|
|
73
|
+
start = max(0, end - overlap)
|
|
74
|
+
return chunks
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _topic_candidates(text: str, limit: int = 8) -> List[str]:
|
|
78
|
+
text = str(text or "")
|
|
79
|
+
candidates: Dict[str, int] = {}
|
|
80
|
+
patterns = [
|
|
81
|
+
r"[A-Za-z][A-Za-z0-9_\-./]{2,}",
|
|
82
|
+
r"[가-힣][가-힣A-Za-z0-9_\-]{1,}",
|
|
83
|
+
]
|
|
84
|
+
stop = {
|
|
85
|
+
"the", "and", "for", "with", "this", "that", "from", "into",
|
|
86
|
+
"사용자", "내용", "파일", "채팅", "답변", "입니다", "그리고", "처럼",
|
|
87
|
+
}
|
|
88
|
+
for pattern in patterns:
|
|
89
|
+
for match in re.findall(pattern, text):
|
|
90
|
+
key = match.strip("._-/").lower()
|
|
91
|
+
if (len(key) < 3 and not re.search(r"[가-힣]", key)) or key in stop or key.isdigit():
|
|
92
|
+
continue
|
|
93
|
+
candidates[key] = candidates.get(key, 0) + 1
|
|
94
|
+
return [
|
|
95
|
+
k for k, v in sorted(candidates.items(), key=lambda item: (-item[1], item[0]))
|
|
96
|
+
if (re.search(r"[가-힣]", k) and len(k) >= 2) or (len(k) >= 4 and (v >= 2 or len(k) >= 6))
|
|
97
|
+
][:limit]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _semantic_items(text: str) -> List[Dict[str, str]]:
    """Extract Decision and Task candidates from *text* with keyword heuristics.

    Each line of at least six cleaned characters is checked against a decision
    pattern and a task pattern; a line may produce both items. At most twelve
    items are returned, in document order.
    """
    decision_re = re.compile(r"(결정|확정|하기로|decided|decision)")
    task_re = re.compile(r"(todo|해야|하자|진행|구현|수정|확인|next|task|\[ \])")
    matchers = (("Decision", decision_re), ("Task", task_re))

    found: List[Dict[str, str]] = []
    for raw_line in str(text or "").splitlines():
        line = _clean_text(raw_line)
        if len(line) < 6:
            continue
        lowered = line.lower()
        for label, matcher in matchers:
            if matcher.search(lowered):
                found.append({"type": label, "title": line[:120], "summary": line[:500]})
    return found[:12]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class KnowledgeGraphStore:
|
|
116
|
+
def __init__(self, db_path: Path, blob_dir: Path):
    """Remember the storage locations, create missing directories and set up the schema.

    Args:
        db_path: Location of the SQLite database file.
        blob_dir: Directory that receives copies of ingested files.
    """
    self.db_path = Path(db_path)
    self.blob_dir = Path(blob_dir)
    for directory in (self.db_path.parent, self.blob_dir):
        directory.mkdir(parents=True, exist_ok=True)
    self._init_db()
|
|
122
|
+
|
|
123
|
+
def _connect(self) -> sqlite3.Connection:
|
|
124
|
+
conn = sqlite3.connect(str(self.db_path))
|
|
125
|
+
conn.row_factory = sqlite3.Row
|
|
126
|
+
conn.execute("PRAGMA journal_mode=WAL")
|
|
127
|
+
conn.execute("PRAGMA foreign_keys=ON")
|
|
128
|
+
return conn
|
|
129
|
+
|
|
130
|
+
def _init_db(self) -> None:
    """Create the tables and indexes if they are missing and record the schema version.

    Tables: ``graph_meta`` (key/value), ``nodes``, ``edges`` (unique per
    from/to/type, cascading on node deletion) and ``chunks`` (cascading on
    source node deletion).
    """
    from contextlib import closing

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; closing() makes sure the connection itself is released as well.
    with closing(self._connect()) as conn, conn:
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS graph_meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS nodes (
                id TEXT PRIMARY KEY,
                type TEXT NOT NULL,
                title TEXT NOT NULL,
                summary TEXT,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                raw_json TEXT NOT NULL CHECK (json_valid(raw_json)),
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL
            );
            CREATE TABLE IF NOT EXISTS edges (
                id TEXT PRIMARY KEY,
                from_node TEXT NOT NULL,
                to_node TEXT NOT NULL,
                type TEXT NOT NULL,
                weight REAL NOT NULL DEFAULT 1.0,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL,
                UNIQUE(from_node, to_node, type),
                FOREIGN KEY(from_node) REFERENCES nodes(id) ON DELETE CASCADE,
                FOREIGN KEY(to_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE TABLE IF NOT EXISTS chunks (
                id TEXT PRIMARY KEY,
                source_node TEXT NOT NULL,
                text TEXT NOT NULL,
                metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                created_at TEXT NOT NULL,
                FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
            );
            CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
            CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
            CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
            CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_node);
            """
        )
        # NOTE(review): this overwrites any previously stored version on every
        # start-up; a future migration would need to read it before this point.
        conn.execute(
            "INSERT OR REPLACE INTO graph_meta(key, value) VALUES (?, ?)",
            ("schema_version", str(GRAPH_SCHEMA_VERSION)),
        )
|
|
178
|
+
|
|
179
|
+
def _upsert_node(
    self,
    conn: sqlite3.Connection,
    node_id: str,
    node_type: str,
    title: str,
    summary: str = "",
    metadata: Optional[Dict[str, Any]] = None,
    raw: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert a node, or refresh its title, summary, JSON payloads and ``updated_at``.

    On an id conflict the stored ``type`` and ``created_at`` are left untouched.
    The title is cut to 240 characters and the summary to 1000.

    Returns:
        *node_id*, unchanged, for convenient chaining.
    """
    stamp = _now()
    values = (
        node_id,
        node_type,
        title[:240],
        summary[:1000],
        _json(metadata),
        _json(raw),
        stamp,  # created_at
        stamp,  # updated_at
    )
    statement = """
        INSERT INTO nodes(id, type, title, summary, metadata_json, raw_json, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            title=excluded.title,
            summary=excluded.summary,
            metadata_json=excluded.metadata_json,
            raw_json=excluded.raw_json,
            updated_at=excluded.updated_at
        """
    conn.execute(statement, values)
    return node_id
|
|
204
|
+
|
|
205
|
+
def _upsert_edge(
    self,
    conn: sqlite3.Connection,
    from_node: str,
    to_node: str,
    edge_type: str,
    weight: float = 1.0,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert an edge, or update the existing one for the same (from, to, type).

    On conflict the larger of the stored and new weight is kept and the
    metadata is replaced.

    Returns:
        The edge id, derived from a hash of the endpoints and the edge type.
    """
    fingerprint = _sha256_text(f"{from_node}|{edge_type}|{to_node}")
    edge_id = f"edge:{fingerprint[:24]}"
    statement = """
        INSERT INTO edges(id, from_node, to_node, type, weight, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(from_node, to_node, type) DO UPDATE SET
            weight=max(edges.weight, excluded.weight),
            metadata_json=excluded.metadata_json
        """
    values = (edge_id, from_node, to_node, edge_type, float(weight), _json(metadata), _now())
    conn.execute(statement, values)
    return edge_id
|
|
226
|
+
|
|
227
|
+
def ingest_message(
    self,
    role: str,
    content: str,
    *,
    user_email: Optional[str] = None,
    user_nickname: Optional[str] = None,
    source: Optional[str] = None,
    conversation_id: Optional[str] = None,
    raw: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Store one chat message and everything derived from it in one transaction.

    Writes a Conversation node, a Message node (AIResponse when *role* is
    ``"assistant"``), an optional Person node, Chunk nodes and rows, Topic
    nodes and Decision/Task nodes, together with the edges linking them.

    Args:
        role: Speaker role; only ``"assistant"`` changes the node type.
        content: Message text.
        user_email: Author e-mail, preferred as the Person key.
        user_nickname: Author display name, used as key when no e-mail is given.
        source: Free-form origin label stored in metadata.
        conversation_id: Conversation key; ``"default"`` is used when missing.
        raw: Original event payload; the node metadata is stored when omitted.

    Returns:
        ``{"node_id": ..., "type": ...}`` for the message node.
    """
    from contextlib import closing

    content = str(content or "")
    cleaned = _clean_text(content)
    digest = _sha256_text("|".join([role or "", content, conversation_id or "", user_email or ""]))[:24]
    node_type = "AIResponse" if role == "assistant" else "Message"
    node_id = f"{node_type.lower()}:{digest}"
    conv_id = f"conversation:{_slug(conversation_id or 'default')}"
    metadata = {
        "role": role,
        "source": source,
        "conversation_id": conversation_id,
        "user_email": user_email,
        "user_nickname": user_nickname,
        "chars": len(content),
    }
    # Empty content falls back to the role; str() keeps a missing role from
    # reaching the title slice in _upsert_node as None.
    title = cleaned[:80] or str(role or node_type)

    # The sqlite3 connection context manager only commits/rolls back, so
    # closing() is needed to actually release the connection afterwards.
    with closing(self._connect()) as conn, conn:
        self._upsert_node(
            conn,
            conv_id,
            "Conversation",
            conversation_id or "Default conversation",
            metadata={"source": source},
        )
        self._upsert_node(
            conn,
            node_id,
            node_type,
            title,
            summary=cleaned[:500],
            metadata=metadata,
            raw=raw or metadata,
        )
        self._upsert_edge(conn, conv_id, node_id, "contains", metadata={"source": source})

        if user_email or user_nickname:
            person_id = f"person:{_slug(user_email or user_nickname)}"
            self._upsert_node(
                conn,
                person_id,
                "Person",
                user_nickname or user_email,
                metadata={"email": user_email},
            )
            self._upsert_edge(conn, person_id, node_id, "authored", metadata={"role": role})

        for index, chunk in enumerate(_chunks(content)):
            chunk_id = f"chunk:{_sha256_text(f'{node_id}:{index}:{chunk}')[:24]}"
            chunk_meta = {"index": index, "source_node": node_id}
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{node_type} chunk {index + 1}",
                summary=chunk[:500],
                metadata=chunk_meta,
            )
            conn.execute(
                """
                INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at)
                VALUES (?, ?, ?, ?, ?)
                """,
                (chunk_id, node_id, chunk, _json(chunk_meta), _now()),
            )
            self._upsert_edge(conn, node_id, chunk_id, "has_chunk")

        for topic in _topic_candidates(content):
            topic_id = f"topic:{_slug(topic)}"
            self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
            self._upsert_edge(conn, node_id, topic_id, "mentions", weight=0.5)

        for item in _semantic_items(content):
            semantic_type = item["type"]
            semantic_title = item["title"]
            semantic_id = (
                f"{semantic_type.lower()}:"
                f"{_sha256_text(f'{node_id}:{semantic_type}:{semantic_title}')[:24]}"
            )
            self._upsert_node(
                conn,
                semantic_id,
                semantic_type,
                semantic_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": node_id},
                raw=item,
            )
            self._upsert_edge(conn, node_id, semantic_id, "implies", weight=0.8)
            if node_type == "AIResponse":
                self._upsert_edge(conn, semantic_id, node_id, "based_on", weight=0.6)

    return {"node_id": node_id, "type": node_type}
|
|
300
|
+
|
|
301
|
+
def ingest_document(
    self,
    path: Path,
    *,
    original_filename: Optional[str] = None,
    mime_type: Optional[str] = None,
    uploader: Optional[str] = None,
    conversation_id: Optional[str] = None,
    extracted: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Store a file in the blob directory and index it in the graph.

    The file is addressed by the SHA-256 of its bytes. A File node is written
    together with structure nodes (slides, pages, sheets, images), optional
    Person and Conversation links, Chunk nodes and rows, Topic nodes and
    Decision/Task nodes.

    Args:
        path: File on disk to ingest.
        original_filename: Display name; defaults to ``path.name``.
        mime_type: MIME type stored in metadata.
        uploader: Identifier of the uploading user; used as Person key and e-mail.
        conversation_id: Conversation the upload belongs to, if any.
        extracted: Extraction result; its ``"content"`` (or ``"preview"``)
            entry supplies the text to chunk, and all keys except
            ``"content"`` are kept in metadata.

    Returns:
        ``{"node_id": ..., "sha256": ..., "metadata": ...}`` for the File node.
    """
    from contextlib import closing

    path = Path(path)
    data = path.read_bytes()
    digest = _sha256_bytes(data)
    ext = path.suffix.lower()
    filename = original_filename or path.name
    blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
    blob_path.parent.mkdir(parents=True, exist_ok=True)
    if not blob_path.exists():
        # Write the exact bytes that were hashed instead of re-reading the
        # source path, so the blob always matches its digest-based name.
        blob_path.write_bytes(data)

    extracted = extracted or {}
    doc_meta = self._document_structure(path, ext)
    text = str(extracted.get("content") or extracted.get("preview") or "")
    file_id = f"file:{digest[:24]}"
    metadata = {
        "filename": filename,
        "ext": ext,
        "mime_type": mime_type,
        "bytes": len(data),
        "sha256": digest,
        "blob_path": str(blob_path),
        "uploader": uploader,
        "conversation_id": conversation_id,
        # The full text lives in the chunks table; keep only the lighter fields here.
        "extracted": {k: v for k, v in extracted.items() if k != "content"},
        "structure": doc_meta,
    }

    # The sqlite3 connection context manager only commits/rolls back, so
    # closing() is needed to actually release the connection afterwards.
    with closing(self._connect()) as conn, conn:
        self._upsert_node(
            conn,
            file_id,
            "File",
            filename,
            summary=(text or filename)[:500],
            metadata=metadata,
            raw=metadata,
        )
        self._ingest_structure_nodes(conn, file_id, filename, doc_meta)

        if uploader:
            person_id = f"person:{_slug(uploader)}"
            self._upsert_node(conn, person_id, "Person", uploader, metadata={"email": uploader})
            self._upsert_edge(conn, person_id, file_id, "uploaded")

        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Conversation", conversation_id)
            self._upsert_edge(conn, conv_id, file_id, "contains")

        for index, chunk in enumerate(_chunks(text)):
            chunk_id = f"chunk:{_sha256_text(f'{file_id}:{index}:{chunk}')[:24]}"
            chunk_meta = {"index": index, "source_node": file_id}
            self._upsert_node(
                conn,
                chunk_id,
                "Chunk",
                f"{filename} chunk {index + 1}",
                summary=chunk[:500],
                metadata=chunk_meta,
            )
            conn.execute(
                "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) VALUES (?, ?, ?, ?, ?)",
                (chunk_id, file_id, chunk, _json(chunk_meta), _now()),
            )
            self._upsert_edge(conn, file_id, chunk_id, "has_chunk")

        for topic in _topic_candidates(f"{filename}\n{text}"):
            topic_id = f"topic:{_slug(topic)}"
            self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
            self._upsert_edge(conn, file_id, topic_id, "discusses", weight=0.7)

        for item in _semantic_items(text):
            semantic_type = item["type"]
            semantic_title = item["title"]
            semantic_id = (
                f"{semantic_type.lower()}:"
                f"{_sha256_text(f'{file_id}:{semantic_type}:{semantic_title}')[:24]}"
            )
            self._upsert_node(
                conn,
                semantic_id,
                semantic_type,
                semantic_title,
                summary=item["summary"],
                metadata={"auto_extracted": True, "source_node": file_id, "filename": filename},
                raw=item,
            )
            self._upsert_edge(conn, file_id, semantic_id, "contains_signal", weight=0.8)

    return {"node_id": file_id, "sha256": digest, "metadata": metadata}
|
|
374
|
+
|
|
375
|
+
def ingest_event(
    self,
    event_type: str,
    title: str,
    *,
    user_email: Optional[str] = None,
    user_nickname: Optional[str] = None,
    source: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Record an event node and link it to its conversation and, if known, its user.

    The node type is *event_type* itself (``"Event"`` when empty). The event id
    hashes the whole payload including the current timestamp.

    Args:
        event_type: Node type for the event.
        title: Human-readable title; defaults to the event type.
        user_email: Triggering user's e-mail, preferred as the Person key.
        user_nickname: Triggering user's display name.
        source: Free-form origin label.
        conversation_id: Conversation key; ``"default"`` is used when missing.
        metadata: Extra event data stored inside the payload.

    Returns:
        ``{"node_id": ..., "type": ...}`` for the event node.
    """
    from contextlib import closing

    event_type = str(event_type or "Event")
    title = str(title or event_type)
    payload = {
        "event_type": event_type,
        "title": title,
        "user_email": user_email,
        "user_nickname": user_nickname,
        "source": source,
        "conversation_id": conversation_id,
        "metadata": metadata or {},
        "timestamp": _now(),
    }
    event_id = f"event:{_sha256_text(_json(payload))[:24]}"
    conv_id = f"conversation:{_slug(conversation_id or 'default')}"

    # The sqlite3 connection context manager only commits/rolls back, so
    # closing() is needed to actually release the connection afterwards.
    with closing(self._connect()) as conn, conn:
        self._upsert_node(conn, event_id, event_type, title, summary=title, metadata=payload, raw=payload)
        self._upsert_node(
            conn,
            conv_id,
            "Conversation",
            conversation_id or "Default conversation",
            metadata={"source": source},
        )
        self._upsert_edge(conn, conv_id, event_id, "has_event", metadata={"source": source})

        if user_email or user_nickname:
            person_id = f"person:{_slug(user_email or user_nickname)}"
            self._upsert_node(
                conn,
                person_id,
                "Person",
                user_nickname or user_email,
                metadata={"email": user_email},
            )
            self._upsert_edge(conn, person_id, event_id, "triggered", metadata={"event_type": event_type})

    return {"node_id": event_id, "type": event_type}
|
|
410
|
+
|
|
411
|
+
def _ingest_structure_nodes(
    self,
    conn: sqlite3.Connection,
    file_id: str,
    filename: str,
    structure: Dict[str, Any],
) -> None:
    """Create Slide, Page, Sheet and Image nodes for a file and link them to it.

    Args:
        conn: Open connection; the caller owns the transaction.
        file_id: Id of the File node that owns the structure.
        filename: Display name used in node titles.
        structure: Output of ``_document_structure``; missing sections are skipped.
    """

    def link_topics(owner_id: str, text: str) -> None:
        # Attach up to four auto-extracted topics to a slide or page.
        for topic in _topic_candidates(text, limit=4):
            topic_id = f"topic:{_slug(topic)}"
            self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
            self._upsert_edge(conn, owner_id, topic_id, "discusses", weight=0.6)

    for slide in structure.get("slides") or []:
        number = slide.get("index")
        slide_id = f"slide:{_sha256_text(f'{file_id}:slide:{number}')[:24]}"
        texts = slide.get("texts") or []
        self._upsert_node(
            conn,
            slide_id,
            "Slide",
            f"{filename} slide {number}",
            summary="\n".join(texts)[:800],
            metadata=slide,
        )
        self._upsert_edge(conn, file_id, slide_id, "has_slide")
        for text in texts:
            link_topics(slide_id, text)

    for page in structure.get("pages") or []:
        number = page.get("index")
        page_id = f"page:{_sha256_text(f'{file_id}:page:{number}')[:24]}"
        preview = page.get("preview") or ""
        self._upsert_node(conn, page_id, "Page", f"{filename} page {number}", summary=preview, metadata=page)
        self._upsert_edge(conn, file_id, page_id, "has_page")
        link_topics(page_id, preview)

    for sheet in structure.get("sheets") or []:
        sheet_title = sheet.get("title")
        sheet_id = f"sheet:{_sha256_text(f'{file_id}:sheet:{sheet_title}')[:24]}"
        self._upsert_node(conn, sheet_id, "Sheet", f"{filename} / {sheet_title}", metadata=sheet)
        self._upsert_edge(conn, file_id, sheet_id, "has_sheet")

    for image in structure.get("images") or []:
        image_key = image.get("sha256")
        if not image_key:
            # NOTE(review): this fallback hashes only the image's own fields,
            # not file_id, so look-alike entries from different files would
            # share one Image node — confirm whether that is intended.
            image_key = _sha256_text(json.dumps(image, ensure_ascii=False, sort_keys=True))
        image_id = f"image:{str(image_key)[:24]}"

        title_parts = [filename, "image"]
        page_number = image.get("page")
        if page_number:
            title_parts.append(f"page {page_number}")
        image_name = image.get("name")
        if image_name:
            # Keep only the last path component of archive-style names.
            title_parts.append(str(image_name).split("/")[-1])

        self._upsert_node(conn, image_id, "Image", " / ".join(title_parts), metadata=image)
        self._upsert_edge(conn, file_id, image_id, "contains_image")
|
|
458
|
+
|
|
459
|
+
def _document_structure(self, path: Path, ext: str) -> Dict[str, Any]:
|
|
460
|
+
try:
|
|
461
|
+
if ext == ".pptx":
|
|
462
|
+
return self._pptx_structure(path)
|
|
463
|
+
if ext == ".pdf":
|
|
464
|
+
return self._pdf_structure(path)
|
|
465
|
+
if ext == ".docx":
|
|
466
|
+
return self._docx_structure(path)
|
|
467
|
+
if ext == ".xlsx":
|
|
468
|
+
return self._xlsx_structure(path)
|
|
469
|
+
except Exception as exc:
|
|
470
|
+
return {"error": str(exc)}
|
|
471
|
+
return {}
|
|
472
|
+
|
|
473
|
+
def _pptx_structure(self, path: Path) -> Dict[str, Any]:
|
|
474
|
+
result: Dict[str, Any] = {"slides": [], "images": []}
|
|
475
|
+
try:
|
|
476
|
+
from PIL import Image
|
|
477
|
+
from pptx import Presentation
|
|
478
|
+
prs = Presentation(str(path))
|
|
479
|
+
for slide_index, slide in enumerate(prs.slides, start=1):
|
|
480
|
+
slide_info = {"index": slide_index, "shapes": [], "texts": []}
|
|
481
|
+
for shape_index, shape in enumerate(slide.shapes, start=1):
|
|
482
|
+
shape_info = {
|
|
483
|
+
"index": shape_index,
|
|
484
|
+
"name": getattr(shape, "name", ""),
|
|
485
|
+
"shape_type": str(getattr(shape, "shape_type", "")),
|
|
486
|
+
"bbox": {
|
|
487
|
+
"left": int(getattr(shape, "left", 0) or 0),
|
|
488
|
+
"top": int(getattr(shape, "top", 0) or 0),
|
|
489
|
+
"width": int(getattr(shape, "width", 0) or 0),
|
|
490
|
+
"height": int(getattr(shape, "height", 0) or 0),
|
|
491
|
+
},
|
|
492
|
+
}
|
|
493
|
+
if getattr(shape, "has_text_frame", False):
|
|
494
|
+
text = shape.text_frame.text.strip()
|
|
495
|
+
if text:
|
|
496
|
+
shape_info["text"] = text[:1000]
|
|
497
|
+
slide_info["texts"].append(text)
|
|
498
|
+
slide_info["shapes"].append(shape_info)
|
|
499
|
+
result["slides"].append(slide_info)
|
|
500
|
+
with zipfile.ZipFile(path) as zf:
|
|
501
|
+
for name in zf.namelist():
|
|
502
|
+
if not name.startswith("ppt/media/"):
|
|
503
|
+
continue
|
|
504
|
+
data = zf.read(name)
|
|
505
|
+
image_info: Dict[str, Any] = {
|
|
506
|
+
"name": name,
|
|
507
|
+
"bytes": len(data),
|
|
508
|
+
"sha256": _sha256_bytes(data),
|
|
509
|
+
}
|
|
510
|
+
try:
|
|
511
|
+
from io import BytesIO
|
|
512
|
+
with Image.open(BytesIO(data)) as img:
|
|
513
|
+
image_info.update({"width": img.width, "height": img.height, "format": img.format})
|
|
514
|
+
except Exception:
|
|
515
|
+
pass
|
|
516
|
+
result["images"].append(image_info)
|
|
517
|
+
except Exception as exc:
|
|
518
|
+
result["error"] = str(exc)
|
|
519
|
+
return result
|
|
520
|
+
|
|
521
|
+
def _pdf_structure(self, path: Path) -> Dict[str, Any]:
|
|
522
|
+
result: Dict[str, Any] = {"pages": [], "images": []}
|
|
523
|
+
try:
|
|
524
|
+
import pdfplumber
|
|
525
|
+
with pdfplumber.open(str(path)) as pdf:
|
|
526
|
+
metadata = dict(pdf.metadata or {})
|
|
527
|
+
result["metadata"] = {str(k): str(v) for k, v in metadata.items()}
|
|
528
|
+
for page_index, page in enumerate(pdf.pages, start=1):
|
|
529
|
+
text = page.extract_text() or ""
|
|
530
|
+
page_info = {
|
|
531
|
+
"index": page_index,
|
|
532
|
+
"width": float(page.width or 0),
|
|
533
|
+
"height": float(page.height or 0),
|
|
534
|
+
"chars": len(text),
|
|
535
|
+
"preview": _clean_text(text)[:500],
|
|
536
|
+
"image_count": len(page.images or []),
|
|
537
|
+
}
|
|
538
|
+
result["pages"].append(page_info)
|
|
539
|
+
for image_index, image in enumerate(page.images or [], start=1):
|
|
540
|
+
result["images"].append({
|
|
541
|
+
"page": page_index,
|
|
542
|
+
"index": image_index,
|
|
543
|
+
"name": image.get("name"),
|
|
544
|
+
"width": image.get("width"),
|
|
545
|
+
"height": image.get("height"),
|
|
546
|
+
"bbox": {
|
|
547
|
+
"x0": image.get("x0"),
|
|
548
|
+
"top": image.get("top"),
|
|
549
|
+
"x1": image.get("x1"),
|
|
550
|
+
"bottom": image.get("bottom"),
|
|
551
|
+
},
|
|
552
|
+
})
|
|
553
|
+
except Exception as exc:
|
|
554
|
+
result["error"] = str(exc)
|
|
555
|
+
return result
|
|
556
|
+
|
|
557
|
+
def _docx_structure(self, path: Path) -> Dict[str, Any]:
    """Summarize a .docx file: non-empty paragraph count, headings and table count.

    Returns:
        ``{"paragraphs": int, "headings": [...], "tables": int}`` where each
        heading holds its style name and up to 240 characters of text; at most
        80 headings are kept.

    Raises:
        Whatever python-docx raises for a missing or unreadable file.
    """
    from docx import Document

    doc = Document(str(path))
    headings = []
    paragraphs = 0
    for p in doc.paragraphs:
        text = p.text.strip()
        if not text:
            continue
        paragraphs += 1
        # A paragraph style may be missing or unnamed; treat that as "not a
        # heading" instead of failing the whole document on None.lower().
        style = getattr(p.style, "name", "") or ""
        if style.lower().startswith("heading"):
            headings.append({"style": style, "text": text[:240]})
    return {"paragraphs": paragraphs, "headings": headings[:80], "tables": len(doc.tables)}
|
|
571
|
+
|
|
572
|
+
def _xlsx_structure(self, path: Path) -> Dict[str, Any]:
    """List the worksheets of an .xlsx file with their reported dimensions.

    Returns:
        ``{"sheets": [{"title", "max_row", "max_column"}, ...]}``.

    Raises:
        Whatever openpyxl raises for a missing or unreadable file.
    """
    from openpyxl import load_workbook

    wb = load_workbook(str(path), read_only=True, data_only=True)
    try:
        sheets = [
            {"title": ws.title, "max_row": ws.max_row, "max_column": ws.max_column}
            for ws in wb.worksheets
        ]
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        wb.close()
    return {"sheets": sheets}
|
|
579
|
+
|
|
580
|
+
def graph(self, limit: int = 300) -> Dict[str, Any]:
    """Return the most recently updated nodes and the edges between them.

    Args:
        limit: Maximum number of nodes, clamped to 1..2000. Chunk nodes are
            excluded.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. Edges are taken from the newest
        ``limit * 3`` edge rows and kept only when both endpoints are among
        the returned nodes.
    """
    from contextlib import closing

    limit = max(1, min(int(limit or 300), 2000))
    # closing() releases the connection; sqlite3's own context manager would
    # only end the transaction and leave the connection open.
    with closing(self._connect()) as conn:
        nodes = [
            {
                "id": row["id"],
                "type": row["type"],
                "title": row["title"],
                "summary": row["summary"],
                "metadata": _safe_loads(row["metadata_json"]),
            }
            for row in conn.execute(
                "SELECT id, type, title, summary, metadata_json FROM nodes WHERE type != 'Chunk' ORDER BY updated_at DESC LIMIT ?",
                (limit,),
            )
        ]
        node_ids = {node["id"] for node in nodes}
        # NOTE(review): the edge LIMIT is applied before the endpoint filter,
        # so recent edges to excluded nodes (e.g. has_chunk) use up the budget.
        edges = [
            {
                "id": row["id"],
                "from": row["from_node"],
                "to": row["to_node"],
                "type": row["type"],
                "weight": row["weight"],
                "metadata": _safe_loads(row["metadata_json"]),
            }
            for row in conn.execute(
                "SELECT id, from_node, to_node, type, weight, metadata_json FROM edges ORDER BY created_at DESC LIMIT ?",
                (limit * 3,),
            )
            if row["from_node"] in node_ids and row["to_node"] in node_ids
        ]
    return {"nodes": nodes, "edges": edges}
|
|
613
|
+
|
|
614
|
+
def search(self, query: str, limit: int = 30) -> Dict[str, Any]:
    """Find nodes whose title, summary or metadata contain *query* or its key terms.

    The whole query is matched first. If that yields fewer than *limit* rows,
    nodes matching any of the query's top topic terms are added. Results are
    ranked by number of matching terms, then by node type (Decision, Task,
    File, Page and Slide first), then by recency.

    Args:
        query: Free text; matched literally, so ``%`` and ``_`` are not wildcards.
        limit: Maximum number of matches, clamped to 1..100.

    Returns:
        ``{"query": ..., "matches": [{"id", "type", "title", "summary", "metadata"}, ...]}``.
    """
    from contextlib import closing

    def like_pattern(term: str) -> str:
        # Escape LIKE metacharacters so user text is matched literally.
        escaped = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    query = str(query or "").strip()
    limit = max(1, min(int(limit or 30), 100))
    # One extraction serves both the SQL fallback (top 8) and scoring (top 12).
    terms = _topic_candidates(query, limit=12)
    sql_terms = terms[:8]

    select = "SELECT id, type, title, summary, metadata_json, updated_at FROM nodes"
    field_match = (
        "(title LIKE ? ESCAPE '\\' OR summary LIKE ? ESCAPE '\\'"
        " OR metadata_json LIKE ? ESCAPE '\\')"
    )

    # closing() releases the connection; sqlite3's own context manager would
    # only end the transaction and leave the connection open.
    with closing(self._connect()) as conn:
        rows = []
        if query:
            q = like_pattern(query)
            rows = conn.execute(
                f"{select} WHERE {field_match} ORDER BY updated_at DESC LIMIT ?",
                (q, q, q, limit),
            ).fetchall()

        if len(rows) < limit and sql_terms:
            params: List[str] = []
            for term in sql_terms:
                params.extend([like_pattern(term)] * 3)
            where = " OR ".join([field_match] * len(sql_terms))
            extra = conn.execute(
                f"{select} WHERE {where} ORDER BY updated_at DESC LIMIT ?",
                (*params, limit * 3),
            ).fetchall()
            # Whole-query hits keep their position; term hits are appended once.
            by_id = {row["id"]: row for row in rows}
            for row in extra:
                by_id.setdefault(row["id"], row)
            rows = list(by_id.values())

    terms_for_score = set(terms)
    boosted_types = {"Decision", "Task", "File", "Page", "Slide"}

    def score(row: sqlite3.Row) -> tuple:
        haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
        hits = sum(1 for term in terms_for_score if term.lower() in haystack)
        type_boost = 1 if row["type"] in boosted_types else 0
        return (hits, type_boost, row["updated_at"] or "")

    ranked = sorted(rows, key=score, reverse=True)[:limit]
    return {
        "query": query,
        "matches": [
            {
                "id": row["id"],
                "type": row["type"],
                "title": row["title"],
                "summary": row["summary"],
                "metadata": _safe_loads(row["metadata_json"]),
            }
            for row in ranked
        ],
    }
|
|
676
|
+
|
|
677
|
+
def context_for_query(self, query: str, limit: int = 6) -> str:
    """Render graph nodes related to *query* as a bullet list for chat prompts.

    The query is stringified and stripped; a blank query yields ``""``.
    Matches come from ``self.search``. When that returns nothing, each
    candidate produced by ``_topic_candidates`` (at most four are requested)
    is looked up with a ``LIKE`` scan over node titles and metadata, taking
    the three most recently updated rows per candidate and de-duplicating
    by node id.

    Args:
        query: Free-text user query.
        limit: Maximum number of bullet lines to emit.

    Returns:
        One ``- [type] title | source=... | summary`` line per match joined
        with newlines, or an empty string when nothing matched.
    """
    text = str(query or "").strip()
    if not text:
        return ""

    def _bullet(item) -> str:
        # Prefer a human-meaningful origin; fall back to the node id.
        meta = item.get("metadata") or {}
        origin = (
            meta.get("filename")
            or meta.get("conversation_id")
            or meta.get("source")
            or item["id"]
        )
        # Summaries are cleaned and capped at 700 characters per bullet.
        blurb = _clean_text(item.get("summary") or "")[:700]
        return f"- [{item['type']}] {item['title']} | source={origin} | {blurb}"

    found = self.search(text, limit).get("matches", [])
    if not found:
        keywords = _topic_candidates(text, limit=4)
        if keywords:
            fetched = []
            with self._connect() as conn:
                for keyword in keywords:
                    pattern = f"%{keyword}%"
                    cursor = conn.execute(
                        """
                        SELECT id, type, title, summary, metadata_json
                        FROM nodes
                        WHERE title LIKE ? OR metadata_json LIKE ?
                        ORDER BY updated_at DESC
                        LIMIT 3
                        """,
                        (pattern, pattern),
                    )
                    fetched.extend(cursor.fetchall())

            # Insertion-ordered dict keeps the first row seen for each id.
            unique = {}
            for record in fetched:
                node_id = record["id"]
                if node_id in unique:
                    continue
                unique[node_id] = {
                    "id": node_id,
                    "type": record["type"],
                    "title": record["title"],
                    "summary": record["summary"],
                    "metadata": _safe_loads(record["metadata_json"]),
                }
                if len(unique) >= limit:
                    break
            found = list(unique.values())

    return "\n".join(_bullet(item) for item in found[:limit])
|
|
721
|
+
|
|
722
|
+
def neighbors(self, node_id: str) -> Dict[str, Any]:
    """Return the direct (1-hop) neighbourhood of a node.

    Args:
        node_id: Identifier of the node whose surroundings are requested.

    Returns:
        A dict with three keys:

        * ``node_id`` - the id that was passed in.
        * ``neighbors`` - one ``id``/``type``/``title``/``summary``/
          ``metadata`` dict per adjacent node that has a row in ``nodes``;
          the node itself is never listed.
        * ``edges`` - one ``from``/``to``/``type``/``weight`` dict for every
          edge that starts or ends at the node (self-loops included).
    """
    with self._connect() as conn:
        edges = [
            {
                "from": row["from_node"],
                "to": row["to_node"],
                "type": row["type"],
                "weight": row["weight"],
            }
            for row in conn.execute(
                "SELECT from_node, to_node, type, weight FROM edges WHERE from_node=? OR to_node=?",
                (node_id, node_id),
            )
        ]

        nodes = []
        if edges:
            # Resolve the neighbour ids inside SQLite rather than binding one
            # "?" per id: SQLite caps bound parameters per statement
            # (SQLITE_MAX_VARIABLE_NUMBER, 999 by default before 3.32.0), so
            # an "IN (?,?,...)" list fails with "too many SQL variables" for
            # heavily connected nodes.
            cursor = conn.execute(
                """
                SELECT id, type, title, summary, metadata_json
                FROM nodes
                WHERE id <> ?
                  AND id IN (
                      SELECT to_node FROM edges WHERE from_node=?
                      UNION
                      SELECT from_node FROM edges WHERE to_node=?
                  )
                """,
                (node_id, node_id, node_id),
            )
            nodes = [
                {
                    "id": row["id"],
                    "type": row["type"],
                    "title": row["title"],
                    "summary": row["summary"],
                    "metadata": _safe_loads(row["metadata_json"]),
                }
                for row in cursor
            ]
    return {"node_id": node_id, "neighbors": nodes, "edges": edges}
|
|
753
|
+
|
|
754
|
+
def delete_conversation(self, conversation_id: str) -> Dict[str, Any]:
    """Delete a conversation node together with the nodes hanging off it.

    The conversation node id is ``conversation:<slug>``. Removal covers:

    1. every node the conversation points to through a ``contains`` edge,
    2. every node those nodes point to through ``has_chunk``, ``implies``,
       ``contains_signal``, ``has_page``, ``has_slide``, ``has_sheet`` or
       ``contains_image`` edges (one level only),
    3. the conversation node itself,
    4. any ``Topic`` node that no edge references afterwards.

    Only ``nodes`` rows are deleted explicitly here.
    NOTE(review): presumably the ``edges`` rows disappear through a
    foreign-key cascade defined in the schema - confirm, otherwise step 4
    never sees the topics as orphaned.

    Args:
        conversation_id: External conversation identifier; stringified and
            stripped before use.

    Returns:
        ``{"status": "skipped", "removed_nodes": 0}`` for a blank id,
        otherwise ``{"status": "ok", "conversation_id": ..., "removed_nodes":
        n}`` where *n* is the number of ids targeted in steps 1-3 (orphaned
        topics are not counted).
    """
    conversation_id = str(conversation_id or "").strip()
    if conversation_id == "":
        return {"status": "skipped", "removed_nodes": 0}

    root_id = f"conversation:{_slug(conversation_id)}"
    with self._connect() as conn:
        contained = conn.execute(
            "SELECT to_node FROM edges WHERE from_node=? AND type='contains'",
            (root_id,),
        ).fetchall()
        children = [record["to_node"] for record in contained]

        doomed = {root_id, *children}
        for child_id in children:
            derived = conn.execute(
                """
                SELECT to_node FROM edges
                WHERE from_node=? AND type IN ('has_chunk', 'implies', 'contains_signal', 'has_page', 'has_slide', 'has_sheet', 'contains_image')
                """,
                (child_id,),
            )
            doomed.update(record["to_node"] for record in derived)

        for target in doomed:
            conn.execute("DELETE FROM nodes WHERE id=?", (target,))

        # Sweep topics that are no longer the endpoint of any edge.
        conn.execute(
            """
            DELETE FROM nodes
            WHERE type='Topic'
            AND id NOT IN (SELECT to_node FROM edges)
            AND id NOT IN (SELECT from_node FROM edges)
            """
        )

    return {
        "status": "ok",
        "conversation_id": conversation_id,
        "removed_nodes": len(doomed),
    }
|
|
789
|
+
|
|
790
|
+
def clear_all(self) -> Dict[str, Any]:
    """Wipe the whole graph: all rows plus the blob directory contents.

    Row counts of ``nodes``, ``edges`` and ``chunks`` are captured first,
    then the three tables are emptied (``chunks``, then ``edges``, then
    ``nodes``). Finally ``self.blob_dir`` is removed if present, with
    removal errors ignored, and recreated empty.

    Returns:
        ``{"status": "ok", "removed": {"nodes": n, "edges": n, "chunks": n}}``
        holding the row counts as they were before deletion.
    """
    removed: Dict[str, Any] = {}
    with self._connect() as conn:
        for table in ("nodes", "edges", "chunks"):
            tally = conn.execute(f"SELECT COUNT(*) AS c FROM {table}").fetchone()
            removed[table] = tally["c"]
        for table in ("chunks", "edges", "nodes"):
            conn.execute(f"DELETE FROM {table}")

    blobs = self.blob_dir
    if blobs.exists():
        shutil.rmtree(blobs, ignore_errors=True)
    blobs.mkdir(parents=True, exist_ok=True)

    return {"status": "ok", "removed": removed}
|
|
804
|
+
|
|
805
|
+
def stats(self) -> Dict[str, Any]:
    """Summarise the graph as per-type row counts.

    Returns:
        A dict with ``db_path`` (the database path as a string), ``nodes``
        (node type -> row count) and ``edges`` (edge type -> row count).
    """

    def _count_by_type(conn, table: str) -> Dict[str, Any]:
        # Both tables expose a ``type`` column, so one grouped query fits.
        cursor = conn.execute(f"SELECT type, COUNT(*) AS count FROM {table} GROUP BY type")
        return {record["type"]: record["count"] for record in cursor}

    with self._connect() as conn:
        per_node_type = _count_by_type(conn, "nodes")
        per_edge_type = _count_by_type(conn, "edges")

    return {
        "db_path": str(self.db_path),
        "nodes": per_node_type,
        "edges": per_edge_type,
    }
|