ltcai 0.1.3 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,802 @@
1
+ """
2
+ SQLite knowledge graph for Lattice AI workspace memory.
3
+
4
+ The graph keeps raw event JSON, normalized node metadata, and edges in one
5
+ portable database so it can later migrate to Neo4j/Postgres without changing
6
+ the ingestion contract.
7
+ """
8
+
9
+ import hashlib
10
+ import json
11
+ import re
12
+ import shutil
13
+ import sqlite3
14
+ import zipfile
15
+ from datetime import datetime
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional
18
+
19
+
20
+ GRAPH_SCHEMA_VERSION = 1  # written to graph_meta under the "schema_version" key by _init_db
21
+
22
+
23
+ def _now() -> str:
24
+ return datetime.now().isoformat()
25
+
26
+
27
+ def _json(data: Optional[Dict[str, Any]]) -> str:
28
+ return json.dumps(data or {}, ensure_ascii=False, sort_keys=True)
29
+
30
+
31
+ def _slug(text: str, max_len: int = 96) -> str:
32
+ value = re.sub(r"\s+", " ", str(text or "")).strip().lower()
33
+ value = re.sub(r"[^0-9a-zA-Z가-힣._:@/-]+", "-", value).strip("-")
34
+ return (value or "untitled")[:max_len]
35
+
36
+
37
+ def _sha256_bytes(data: bytes) -> str:
38
+ return hashlib.sha256(data).hexdigest()
39
+
40
+
41
+ def _sha256_text(text: str) -> str:
42
+ return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
43
+
44
+
45
+ def _clean_text(text: str) -> str:
46
+ return re.sub(r"\s+", " ", str(text or "")).strip()
47
+
48
+
49
+ def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
50
+ cleaned = str(text or "").strip()
51
+ if not cleaned:
52
+ return []
53
+ chunks: List[str] = []
54
+ start = 0
55
+ while start < len(cleaned):
56
+ end = min(len(cleaned), start + size)
57
+ chunks.append(cleaned[start:end])
58
+ if end >= len(cleaned):
59
+ break
60
+ start = max(0, end - overlap)
61
+ return chunks
62
+
63
+
64
+ def _topic_candidates(text: str, limit: int = 8) -> List[str]:
65
+ text = str(text or "")
66
+ candidates: Dict[str, int] = {}
67
+ patterns = [
68
+ r"[A-Za-z][A-Za-z0-9_\-./]{2,}",
69
+ r"[가-힣][가-힣A-Za-z0-9_\-]{1,}",
70
+ ]
71
+ stop = {
72
+ "the", "and", "for", "with", "this", "that", "from", "into",
73
+ "사용자", "내용", "파일", "채팅", "답변", "입니다", "그리고", "처럼",
74
+ }
75
+ for pattern in patterns:
76
+ for match in re.findall(pattern, text):
77
+ key = match.strip("._-/").lower()
78
+ if (len(key) < 3 and not re.search(r"[가-힣]", key)) or key in stop or key.isdigit():
79
+ continue
80
+ candidates[key] = candidates.get(key, 0) + 1
81
+ return [
82
+ k for k, v in sorted(candidates.items(), key=lambda item: (-item[1], item[0]))
83
+ if (re.search(r"[가-힣]", k) and len(k) >= 2) or (len(k) >= 4 and (v >= 2 or len(k) >= 6))
84
+ ][:limit]
85
+
86
+
87
def _semantic_items(text: str) -> List[Dict[str, str]]:
    """Extract Decision/Task candidates from *text* with keyword heuristics.

    Each line is whitespace-normalized; lines shorter than six characters are
    skipped. A line matching the decision keywords yields a ``Decision`` item
    and a line matching the task keywords yields a ``Task`` item (one line may
    yield both, Decision first). Titles are cut to 120 characters, summaries
    to 500, and at most 12 items are returned.
    """
    # Order matters: Decision is emitted before Task for the same line.
    rules = (
        ("Decision", r"(결정|확정|하기로|decided|decision)"),
        ("Task", r"(todo|해야|하자|진행|구현|수정|확인|next|task|\[ \])"),
    )
    found: List[Dict[str, str]] = []
    for raw_line in str(text or "").splitlines():
        line = _clean_text(raw_line)
        if len(line) < 6:
            continue
        lowered = line.lower()
        found.extend(
            {"type": kind, "title": line[:120], "summary": line[:500]}
            for kind, pattern in rules
            if re.search(pattern, lowered)
        )
    return found[:12]
100
+
101
+
102
class KnowledgeGraphStore:
    """SQLite-backed store for graph nodes, edges and text chunks.

    Messages, documents and events are ingested into three tables (``nodes``,
    ``edges``, ``chunks``); uploaded files are additionally copied into a
    content-addressed blob directory. Every public method opens its own
    short-lived connection, commits on success and closes it afterwards.
    """

    # One node-matching predicate shared by all LIKE searches. The ESCAPE
    # clause lets _like_pattern() neutralise '%' and '_' coming from user text.
    _NODE_MATCH_SQL = (
        "(title LIKE ? ESCAPE '\\' OR summary LIKE ? ESCAPE '\\' "
        "OR metadata_json LIKE ? ESCAPE '\\')"
    )

    def __init__(self, db_path: Path, blob_dir: Path):
        """Create the database file's parent directory, the blob directory and the schema."""
        self.db_path = Path(db_path)
        self.blob_dir = Path(blob_dir)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.blob_dir.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection with row access by name, WAL journaling and foreign keys on.

        The caller owns the connection and must close it; prefer ``_session()``.
        """
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA foreign_keys=ON")
        return conn

    def _session(self):
        """Return a context manager yielding a connection that is always closed.

        ``with sqlite3.Connection`` only commits or rolls back; it does not
        close the connection. This wrapper keeps that transaction behaviour
        and closes the connection when the block exits.
        """
        from contextlib import contextmanager

        @contextmanager
        def _scope():
            conn = self._connect()
            try:
                with conn:
                    yield conn
            finally:
                conn.close()

        return _scope()

    @staticmethod
    def _like_pattern(term: str) -> str:
        """Build a ``%term%`` LIKE pattern with ``\\``, ``%`` and ``_`` escaped."""
        escaped = str(term).replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    def _init_db(self) -> None:
        """Create tables and indexes if missing and record the schema version."""
        with self._session() as conn:
            conn.executescript(
                """
                CREATE TABLE IF NOT EXISTS graph_meta (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                );
                CREATE TABLE IF NOT EXISTS nodes (
                    id TEXT PRIMARY KEY,
                    type TEXT NOT NULL,
                    title TEXT NOT NULL,
                    summary TEXT,
                    metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                    raw_json TEXT NOT NULL CHECK (json_valid(raw_json)),
                    created_at TEXT NOT NULL,
                    updated_at TEXT NOT NULL
                );
                CREATE TABLE IF NOT EXISTS edges (
                    id TEXT PRIMARY KEY,
                    from_node TEXT NOT NULL,
                    to_node TEXT NOT NULL,
                    type TEXT NOT NULL,
                    weight REAL NOT NULL DEFAULT 1.0,
                    metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                    created_at TEXT NOT NULL,
                    UNIQUE(from_node, to_node, type),
                    FOREIGN KEY(from_node) REFERENCES nodes(id) ON DELETE CASCADE,
                    FOREIGN KEY(to_node) REFERENCES nodes(id) ON DELETE CASCADE
                );
                CREATE TABLE IF NOT EXISTS chunks (
                    id TEXT PRIMARY KEY,
                    source_node TEXT NOT NULL,
                    text TEXT NOT NULL,
                    metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json)),
                    created_at TEXT NOT NULL,
                    FOREIGN KEY(source_node) REFERENCES nodes(id) ON DELETE CASCADE
                );
                CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
                CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
                CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
                CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_node);
                """
            )
            conn.execute(
                "INSERT OR REPLACE INTO graph_meta(key, value) VALUES (?, ?)",
                ("schema_version", str(GRAPH_SCHEMA_VERSION)),
            )

    def _upsert_node(
        self,
        conn: sqlite3.Connection,
        node_id: str,
        node_type: str,
        title: str,
        summary: str = "",
        metadata: Optional[Dict[str, Any]] = None,
        raw: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Insert a node or refresh an existing one; return ``node_id``.

        On conflict the title, summary, metadata, raw payload and
        ``updated_at`` are overwritten; ``type`` and ``created_at`` are kept.
        Titles are cut to 240 characters and summaries to 1000.
        """
        now = _now()
        # Coerce to str so a missing title/summary cannot crash the slicing
        # below (title is also NOT NULL in the schema).
        safe_title = str(title or "")[:240]
        safe_summary = str(summary or "")[:1000]
        conn.execute(
            """
            INSERT INTO nodes(id, type, title, summary, metadata_json, raw_json, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                title=excluded.title,
                summary=excluded.summary,
                metadata_json=excluded.metadata_json,
                raw_json=excluded.raw_json,
                updated_at=excluded.updated_at
            """,
            (node_id, node_type, safe_title, safe_summary, _json(metadata), _json(raw), now, now),
        )
        return node_id

    def _upsert_edge(
        self,
        conn: sqlite3.Connection,
        from_node: str,
        to_node: str,
        edge_type: str,
        weight: float = 1.0,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Insert an edge or update the existing (from, to, type) edge; return its id.

        The id is derived from the endpoints and type. On conflict the larger
        of the stored and new weight is kept and the metadata is replaced.
        """
        edge_id = f"edge:{_sha256_text(f'{from_node}|{edge_type}|{to_node}')[:24]}"
        conn.execute(
            """
            INSERT INTO edges(id, from_node, to_node, type, weight, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(from_node, to_node, type) DO UPDATE SET
                weight=max(edges.weight, excluded.weight),
                metadata_json=excluded.metadata_json
            """,
            (edge_id, from_node, to_node, edge_type, float(weight), _json(metadata), _now()),
        )
        return edge_id

    def ingest_message(
        self,
        role: str,
        content: str,
        *,
        user_email: Optional[str] = None,
        user_nickname: Optional[str] = None,
        source: Optional[str] = None,
        conversation_id: Optional[str] = None,
        raw: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Store a chat message and everything derived from it.

        Creates/updates the conversation node, the message node
        (``AIResponse`` when ``role == "assistant"``, otherwise ``Message``),
        an optional person node, chunk nodes and rows, auto-extracted topic
        nodes and Decision/Task nodes, plus the edges linking them.

        The message id is a hash of role, content, conversation id and user
        email, so re-ingesting the same message updates the same node.

        Returns:
            ``{"node_id": ..., "type": ...}`` for the message node.
        """
        content = str(content or "")
        digest = _sha256_text("|".join([role or "", content, conversation_id or "", user_email or ""]))[:24]
        node_type = "AIResponse" if role == "assistant" else "Message"
        node_id = f"{node_type.lower()}:{digest}"
        conv_id = f"conversation:{_slug(conversation_id or 'default')}"
        metadata = {
            "role": role,
            "source": source,
            "conversation_id": conversation_id,
            "user_email": user_email,
            "user_nickname": user_nickname,
            "chars": len(content),
        }
        with self._session() as conn:
            self._upsert_node(conn, conv_id, "Conversation", conversation_id or "Default conversation", metadata={"source": source})
            self._upsert_node(
                conn,
                node_id,
                node_type,
                _clean_text(content)[:80] or role,
                summary=_clean_text(content)[:500],
                metadata=metadata,
                raw=raw or metadata,
            )
            self._upsert_edge(conn, conv_id, node_id, "contains", metadata={"source": source})
            if user_email or user_nickname:
                person_key = user_email or user_nickname or "unknown"
                person_id = f"person:{_slug(person_key)}"
                self._upsert_node(conn, person_id, "Person", user_nickname or user_email or "Unknown user", metadata={"email": user_email})
                self._upsert_edge(conn, person_id, node_id, "authored", metadata={"role": role})
            for index, chunk in enumerate(_chunks(content)):
                chunk_id = f"chunk:{_sha256_text(f'{node_id}:{index}:{chunk}')[:24]}"
                self._upsert_node(conn, chunk_id, "Chunk", f"{node_type} chunk {index + 1}", summary=chunk[:500], metadata={"index": index, "source_node": node_id})
                conn.execute(
                    """
                    INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at)
                    VALUES (?, ?, ?, ?, ?)
                    """,
                    (chunk_id, node_id, chunk, _json({"index": index, "source_node": node_id}), _now()),
                )
                self._upsert_edge(conn, node_id, chunk_id, "has_chunk")
            for topic in _topic_candidates(content):
                topic_id = f"topic:{_slug(topic)}"
                self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
                self._upsert_edge(conn, node_id, topic_id, "mentions", weight=0.5)
            for item in _semantic_items(content):
                semantic_type = item["type"]
                semantic_title = item["title"]
                semantic_id = f"{semantic_type.lower()}:{_sha256_text(f'{node_id}:{semantic_type}:{semantic_title}')[:24]}"
                self._upsert_node(
                    conn,
                    semantic_id,
                    semantic_type,
                    semantic_title,
                    summary=item["summary"],
                    metadata={"auto_extracted": True, "source_node": node_id},
                    raw=item,
                )
                self._upsert_edge(conn, node_id, semantic_id, "implies", weight=0.8)
                if node_type == "AIResponse":
                    self._upsert_edge(conn, semantic_id, node_id, "based_on", weight=0.6)
        return {"node_id": node_id, "type": node_type}

    def ingest_document(
        self,
        path: Path,
        *,
        original_filename: Optional[str] = None,
        mime_type: Optional[str] = None,
        uploader: Optional[str] = None,
        conversation_id: Optional[str] = None,
        extracted: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Store a file, its structure and its extracted text in the graph.

        The file is copied to ``blob_dir/<first two hex chars>/<sha256><ext>``
        unless that blob already exists. The text comes from
        ``extracted["content"]`` or, failing that, ``extracted["preview"]``.
        Creates the File node, structure nodes (slides, pages, sheets,
        images), optional uploader/conversation nodes, chunks, topics and
        Decision/Task nodes.

        Returns:
            ``{"node_id": ..., "sha256": ..., "metadata": ...}`` for the file.

        Raises:
            OSError: If the file cannot be read or copied.
        """
        path = Path(path)
        data = path.read_bytes()
        digest = _sha256_bytes(data)
        ext = path.suffix.lower()
        filename = original_filename or path.name
        blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
        blob_path.parent.mkdir(parents=True, exist_ok=True)
        if not blob_path.exists():
            shutil.copyfile(path, blob_path)

        doc_meta = self._document_structure(path, ext)
        text = str((extracted or {}).get("content") or (extracted or {}).get("preview") or "")
        file_id = f"file:{digest[:24]}"
        metadata = {
            "filename": filename,
            "ext": ext,
            "mime_type": mime_type,
            "bytes": len(data),
            "sha256": digest,
            "blob_path": str(blob_path),
            "uploader": uploader,
            "conversation_id": conversation_id,
            # The full text lives in the chunks table; keep only the other keys here.
            "extracted": {k: v for k, v in (extracted or {}).items() if k != "content"},
            "structure": doc_meta,
        }
        with self._session() as conn:
            self._upsert_node(conn, file_id, "File", filename, summary=(text or filename)[:500], metadata=metadata, raw=metadata)
            self._ingest_structure_nodes(conn, file_id, filename, doc_meta)
            if uploader:
                person_id = f"person:{_slug(uploader)}"
                self._upsert_node(conn, person_id, "Person", uploader, metadata={"email": uploader})
                self._upsert_edge(conn, person_id, file_id, "uploaded")
            if conversation_id:
                conv_id = f"conversation:{_slug(conversation_id)}"
                self._upsert_node(conn, conv_id, "Conversation", conversation_id)
                self._upsert_edge(conn, conv_id, file_id, "contains")
            for index, chunk in enumerate(_chunks(text)):
                chunk_id = f"chunk:{_sha256_text(f'{file_id}:{index}:{chunk}')[:24]}"
                self._upsert_node(conn, chunk_id, "Chunk", f"{filename} chunk {index + 1}", summary=chunk[:500], metadata={"index": index, "source_node": file_id})
                conn.execute(
                    "INSERT OR REPLACE INTO chunks(id, source_node, text, metadata_json, created_at) VALUES (?, ?, ?, ?, ?)",
                    (chunk_id, file_id, chunk, _json({"index": index, "source_node": file_id}), _now()),
                )
                self._upsert_edge(conn, file_id, chunk_id, "has_chunk")
            for topic in _topic_candidates(f"{filename}\n{text}"):
                topic_id = f"topic:{_slug(topic)}"
                self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
                self._upsert_edge(conn, file_id, topic_id, "discusses", weight=0.7)
            for item in _semantic_items(text):
                semantic_type = item["type"]
                semantic_title = item["title"]
                semantic_id = f"{semantic_type.lower()}:{_sha256_text(f'{file_id}:{semantic_type}:{semantic_title}')[:24]}"
                self._upsert_node(
                    conn,
                    semantic_id,
                    semantic_type,
                    semantic_title,
                    summary=item["summary"],
                    metadata={"auto_extracted": True, "source_node": file_id, "filename": filename},
                    raw=item,
                )
                self._upsert_edge(conn, file_id, semantic_id, "contains_signal", weight=0.8)
        return {"node_id": file_id, "sha256": digest, "metadata": metadata}

    def ingest_event(
        self,
        event_type: str,
        title: str,
        *,
        user_email: Optional[str] = None,
        user_nickname: Optional[str] = None,
        source: Optional[str] = None,
        conversation_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Record an event node linked to its conversation and, if known, its user.

        The node type is ``event_type`` itself (``"Event"`` when empty). The
        payload includes the current timestamp, and the event id is a hash of
        that payload.

        Returns:
            ``{"node_id": ..., "type": ...}`` for the event node.
        """
        event_type = str(event_type or "Event")
        title = str(title or event_type)
        payload = {
            "event_type": event_type,
            "title": title,
            "user_email": user_email,
            "user_nickname": user_nickname,
            "source": source,
            "conversation_id": conversation_id,
            "metadata": metadata or {},
            "timestamp": _now(),
        }
        event_id = f"event:{_sha256_text(_json(payload))[:24]}"
        conv_id = f"conversation:{_slug(conversation_id or 'default')}"
        with self._session() as conn:
            self._upsert_node(conn, event_id, event_type, title, summary=title, metadata=payload, raw=payload)
            self._upsert_node(conn, conv_id, "Conversation", conversation_id or "Default conversation", metadata={"source": source})
            self._upsert_edge(conn, conv_id, event_id, "has_event", metadata={"source": source})
            if user_email or user_nickname:
                person_key = user_email or user_nickname or "unknown"
                person_id = f"person:{_slug(person_key)}"
                self._upsert_node(conn, person_id, "Person", user_nickname or user_email or "Unknown user", metadata={"email": user_email})
                self._upsert_edge(conn, person_id, event_id, "triggered", metadata={"event_type": event_type})
        return {"node_id": event_id, "type": event_type}

    def _ingest_structure_nodes(
        self,
        conn: sqlite3.Connection,
        file_id: str,
        filename: str,
        structure: Dict[str, Any],
    ) -> None:
        """Create Slide/Page/Sheet/Image nodes (and their topics) for a file.

        Reads the optional ``slides``, ``pages``, ``sheets`` and ``images``
        lists of ``structure`` as produced by the ``_*_structure`` helpers.
        """
        for slide in structure.get("slides") or []:
            index = slide.get("index")
            slide_id = f"slide:{_sha256_text(f'{file_id}:slide:{index}')[:24]}"
            title = f"{filename} slide {index}"
            summary = "\n".join(slide.get("texts") or [])[:800]
            self._upsert_node(conn, slide_id, "Slide", title, summary=summary, metadata=slide)
            self._upsert_edge(conn, file_id, slide_id, "has_slide")
            for text in slide.get("texts") or []:
                for topic in _topic_candidates(text, limit=4):
                    topic_id = f"topic:{_slug(topic)}"
                    self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
                    self._upsert_edge(conn, slide_id, topic_id, "discusses", weight=0.6)

        for page in structure.get("pages") or []:
            index = page.get("index")
            page_id = f"page:{_sha256_text(f'{file_id}:page:{index}')[:24]}"
            title = f"{filename} page {index}"
            self._upsert_node(conn, page_id, "Page", title, summary=page.get("preview") or "", metadata=page)
            self._upsert_edge(conn, file_id, page_id, "has_page")
            for topic in _topic_candidates(page.get("preview") or "", limit=4):
                topic_id = f"topic:{_slug(topic)}"
                self._upsert_node(conn, topic_id, "Topic", topic, metadata={"auto_extracted": True})
                self._upsert_edge(conn, page_id, topic_id, "discusses", weight=0.6)

        for sheet in (structure.get("sheets") or []):
            sheet_title = sheet.get("title")
            sheet_id = f"sheet:{_sha256_text(f'{file_id}:sheet:{sheet_title}')[:24]}"
            self._upsert_node(conn, sheet_id, "Sheet", f"{filename} / {sheet_title}", metadata=sheet)
            self._upsert_edge(conn, file_id, sheet_id, "has_sheet")

        for image in (structure.get("images") or []):
            # Images without a content hash (e.g. PDF images) are keyed by a hash of their metadata.
            image_key = image.get("sha256") or _sha256_text(json.dumps(image, ensure_ascii=False, sort_keys=True))
            image_id = f"image:{str(image_key)[:24]}"
            title_parts = [filename, "image"]
            if image.get("page"):
                title_parts.append(f"page {image.get('page')}")
            if image.get("name"):
                title_parts.append(str(image.get("name")).split("/")[-1])
            self._upsert_node(conn, image_id, "Image", " / ".join(title_parts), metadata=image)
            self._upsert_edge(conn, file_id, image_id, "contains_image")

    def _document_structure(self, path: Path, ext: str) -> Dict[str, Any]:
        """Dispatch to the structure extractor for ``ext``.

        Returns ``{}`` for unsupported extensions and ``{"error": ...}`` when
        the extractor raises (e.g. the parsing library is not installed).
        """
        try:
            if ext == ".pptx":
                return self._pptx_structure(path)
            if ext == ".pdf":
                return self._pdf_structure(path)
            if ext == ".docx":
                return self._docx_structure(path)
            if ext == ".xlsx":
                return self._xlsx_structure(path)
        except Exception as exc:
            return {"error": str(exc)}
        return {}

    def _pptx_structure(self, path: Path) -> Dict[str, Any]:
        """Describe the slides and embedded media of a .pptx file.

        Returns ``{"slides": [...], "images": [...]}``; any failure is
        reported under an ``"error"`` key alongside whatever was collected.
        Image dimensions are added only when Pillow is importable.
        """
        result: Dict[str, Any] = {"slides": [], "images": []}
        try:
            from io import BytesIO
            from pptx import Presentation
            try:
                from PIL import Image
            except ImportError:
                # Pillow only enriches image entries; slides must still be extracted without it.
                Image = None
            prs = Presentation(str(path))
            for slide_index, slide in enumerate(prs.slides, start=1):
                slide_info = {"index": slide_index, "shapes": [], "texts": []}
                for shape_index, shape in enumerate(slide.shapes, start=1):
                    shape_info = {
                        "index": shape_index,
                        "name": getattr(shape, "name", ""),
                        "shape_type": str(getattr(shape, "shape_type", "")),
                        "bbox": {
                            "left": int(getattr(shape, "left", 0) or 0),
                            "top": int(getattr(shape, "top", 0) or 0),
                            "width": int(getattr(shape, "width", 0) or 0),
                            "height": int(getattr(shape, "height", 0) or 0),
                        },
                    }
                    if getattr(shape, "has_text_frame", False):
                        text = shape.text_frame.text.strip()
                        if text:
                            shape_info["text"] = text[:1000]
                            slide_info["texts"].append(text)
                    slide_info["shapes"].append(shape_info)
                result["slides"].append(slide_info)
            with zipfile.ZipFile(path) as zf:
                for name in zf.namelist():
                    if not name.startswith("ppt/media/"):
                        continue
                    data = zf.read(name)
                    image_info: Dict[str, Any] = {
                        "name": name,
                        "bytes": len(data),
                        "sha256": _sha256_bytes(data),
                    }
                    if Image is not None:
                        try:
                            with Image.open(BytesIO(data)) as img:
                                image_info.update({"width": img.width, "height": img.height, "format": img.format})
                        except Exception:
                            # Best effort: media that Pillow cannot decode keeps name/bytes/sha256 only.
                            pass
                    result["images"].append(image_info)
        except Exception as exc:
            result["error"] = str(exc)
        return result

    def _pdf_structure(self, path: Path) -> Dict[str, Any]:
        """Describe the pages and images of a PDF using pdfplumber.

        Returns ``{"pages": [...], "images": [...]}`` plus ``"metadata"`` when
        the file opens; any failure is reported under an ``"error"`` key.
        """
        result: Dict[str, Any] = {"pages": [], "images": []}
        try:
            import pdfplumber
            with pdfplumber.open(str(path)) as pdf:
                metadata = dict(pdf.metadata or {})
                result["metadata"] = {str(k): str(v) for k, v in metadata.items()}
                for page_index, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text() or ""
                    page_info = {
                        "index": page_index,
                        "width": float(page.width or 0),
                        "height": float(page.height or 0),
                        "chars": len(text),
                        "preview": _clean_text(text)[:500],
                        "image_count": len(page.images or []),
                    }
                    result["pages"].append(page_info)
                    for image_index, image in enumerate(page.images or [], start=1):
                        result["images"].append({
                            "page": page_index,
                            "index": image_index,
                            "name": image.get("name"),
                            "width": image.get("width"),
                            "height": image.get("height"),
                            "bbox": {
                                "x0": image.get("x0"),
                                "top": image.get("top"),
                                "x1": image.get("x1"),
                                "bottom": image.get("bottom"),
                            },
                        })
        except Exception as exc:
            result["error"] = str(exc)
        return result

    def _docx_structure(self, path: Path) -> Dict[str, Any]:
        """Count non-empty paragraphs and tables of a .docx and list up to 80 headings.

        Exceptions (including a missing python-docx) propagate to the caller.
        """
        from docx import Document
        doc = Document(str(path))
        headings = []
        paragraphs = 0
        for p in doc.paragraphs:
            text = p.text.strip()
            if not text:
                continue
            paragraphs += 1
            # Tolerate a missing style or style name instead of failing the whole document.
            style = getattr(p.style, "name", "") or ""
            if style.lower().startswith("heading"):
                headings.append({"style": style, "text": text[:240]})
        return {"paragraphs": paragraphs, "headings": headings[:80], "tables": len(doc.tables)}

    def _xlsx_structure(self, path: Path) -> Dict[str, Any]:
        """List the title and reported dimensions of each worksheet in an .xlsx.

        Exceptions (including a missing openpyxl) propagate to the caller.
        """
        from openpyxl import load_workbook
        wb = load_workbook(str(path), read_only=True, data_only=True)
        try:
            sheets = [
                {"title": ws.title, "max_row": ws.max_row, "max_column": ws.max_column}
                for ws in wb.worksheets
            ]
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()
        return {"sheets": sheets}

    def graph(self, limit: int = 300) -> Dict[str, Any]:
        """Return the most recently updated non-Chunk nodes and the edges among them.

        ``limit`` is clamped to 1..2000 nodes; at most ``limit * 3`` edges
        (newest first) are returned, all with both endpoints in the node list.
        """
        limit = max(1, min(int(limit or 300), 2000))
        with self._session() as conn:
            nodes = [
                {
                    "id": row["id"],
                    "type": row["type"],
                    "title": row["title"],
                    "summary": row["summary"],
                    "metadata": json.loads(row["metadata_json"] or "{}"),
                }
                for row in conn.execute(
                    "SELECT id, type, title, summary, metadata_json FROM nodes WHERE type != 'Chunk' ORDER BY updated_at DESC LIMIT ?",
                    (limit,),
                )
            ]
            # Filter by endpoint inside SQL, before LIMIT applies. Otherwise the
            # newest edges (mostly has_chunk edges to hidden Chunk nodes) use up
            # the limit and edges between visible nodes are dropped. The ids
            # travel as one JSON array to stay within SQLite's bound-variable limit.
            ids_json = json.dumps([node["id"] for node in nodes])
            edges = [
                {
                    "id": row["id"],
                    "from": row["from_node"],
                    "to": row["to_node"],
                    "type": row["type"],
                    "weight": row["weight"],
                    "metadata": json.loads(row["metadata_json"] or "{}"),
                }
                for row in conn.execute(
                    """
                    SELECT id, from_node, to_node, type, weight, metadata_json
                    FROM edges
                    WHERE from_node IN (SELECT value FROM json_each(?))
                      AND to_node IN (SELECT value FROM json_each(?))
                    ORDER BY created_at DESC
                    LIMIT ?
                    """,
                    (ids_json, ids_json, limit * 3),
                )
            ]
        return {"nodes": nodes, "edges": edges}

    def search(self, query: str, limit: int = 30) -> Dict[str, Any]:
        """Find nodes whose title, summary or metadata contain the query.

        The whole query is matched first; if that yields fewer than ``limit``
        rows (clamped to 1..100), nodes matching any extracted topic term are
        added. Results are ranked by number of matching terms, then a boost
        for Decision/Task/File/Page/Slide nodes, then recency. ``%`` and ``_``
        in the query are matched literally.

        Returns:
            ``{"query": ..., "matches": [...]}``.
        """
        query = str(query or "").strip()
        limit = max(1, min(int(limit or 30), 100))
        select_sql = "SELECT id, type, title, summary, metadata_json, updated_at FROM nodes WHERE "
        order_sql = " ORDER BY updated_at DESC LIMIT ?"
        with self._session() as conn:
            rows = []
            if query:
                q = self._like_pattern(query)
                rows = conn.execute(
                    select_sql + self._NODE_MATCH_SQL + order_sql,
                    (q, q, q, limit),
                ).fetchall()

            if len(rows) < limit:
                terms = _topic_candidates(query, limit=8)
                if terms:
                    clauses = []
                    params: List[str] = []
                    for term in terms:
                        clauses.append(self._NODE_MATCH_SQL)
                        params.extend([self._like_pattern(term)] * 3)
                    extra = conn.execute(
                        select_sql + " OR ".join(clauses) + order_sql,
                        (*params, limit * 3),
                    ).fetchall()
                    # Keep whole-query hits and add term hits not already present.
                    by_id = {row["id"]: row for row in rows}
                    for row in extra:
                        by_id.setdefault(row["id"], row)
                    rows = list(by_id.values())

        terms_for_score = set(_topic_candidates(query, limit=12))

        def score(row: sqlite3.Row) -> tuple:
            haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()
            hits = sum(1 for term in terms_for_score if term.lower() in haystack)
            type_boost = 1 if row["type"] in {"Decision", "Task", "File", "Page", "Slide"} else 0
            return (hits, type_boost, row["updated_at"] or "")

        rows = sorted(rows, key=score, reverse=True)[:limit]
        return {
            "query": query,
            "matches": [
                {
                    "id": row["id"],
                    "type": row["type"],
                    "title": row["title"],
                    "summary": row["summary"],
                    "metadata": json.loads(row["metadata_json"] or "{}"),
                }
                for row in rows
            ],
        }

    def context_for_query(self, query: str, limit: int = 6) -> str:
        """Return compact graph-backed RAG context for chat generation.

        Produces one ``- [Type] title | source=... | summary`` line per match
        (at most ``limit``), or ``""`` for an empty query or no matches. When
        ``search`` finds nothing, up to three recent nodes per extracted topic
        are looked up by title/metadata instead.
        """
        query = str(query or "").strip()
        if not query:
            return ""
        matches = self.search(query, limit).get("matches", [])
        if not matches:
            topics = _topic_candidates(query, limit=4)
            if topics:
                with self._session() as conn:
                    rows = []
                    for topic in topics:
                        pattern = self._like_pattern(topic)
                        rows.extend(conn.execute(
                            """
                            SELECT id, type, title, summary, metadata_json
                            FROM nodes
                            WHERE title LIKE ? ESCAPE '\\' OR metadata_json LIKE ? ESCAPE '\\'
                            ORDER BY updated_at DESC
                            LIMIT 3
                            """,
                            (pattern, pattern),
                        ).fetchall())
                seen = set()
                matches = []
                for row in rows:
                    if row["id"] in seen:
                        continue
                    seen.add(row["id"])
                    matches.append({
                        "id": row["id"],
                        "type": row["type"],
                        "title": row["title"],
                        "summary": row["summary"],
                        "metadata": json.loads(row["metadata_json"] or "{}"),
                    })
                    if len(matches) >= limit:
                        break
        lines = []
        for match in matches[:limit]:
            meta = match.get("metadata") or {}
            source = meta.get("filename") or meta.get("conversation_id") or meta.get("source") or match["id"]
            summary = _clean_text(match.get("summary") or "")[:700]
            lines.append(f"- [{match['type']}] {match['title']} | source={source} | {summary}")
        return "\n".join(lines)

    def neighbors(self, node_id: str) -> Dict[str, Any]:
        """Return direct neighbors (1-hop) of a node.

        Returns:
            ``{"node_id": ..., "neighbors": [...], "edges": [...]}`` where
            ``edges`` are all edges touching the node and ``neighbors`` are
            the nodes at their other ends (the node itself is excluded).
        """
        with self._session() as conn:
            edges = [
                {"from": row["from_node"], "to": row["to_node"], "type": row["type"], "weight": row["weight"]}
                for row in conn.execute(
                    "SELECT from_node, to_node, type, weight FROM edges WHERE from_node=? OR to_node=?",
                    (node_id, node_id),
                )
            ]
            # Resolve neighbours with a subquery rather than one bound variable
            # per neighbour; hub nodes could exceed SQLite's variable limit.
            nodes = [
                {
                    "id": row["id"],
                    "type": row["type"],
                    "title": row["title"],
                    "summary": row["summary"],
                    "metadata": json.loads(row["metadata_json"] or "{}"),
                }
                for row in conn.execute(
                    """
                    SELECT id, type, title, summary, metadata_json
                    FROM nodes
                    WHERE id != ?
                      AND id IN (
                        SELECT to_node FROM edges WHERE from_node = ?
                        UNION
                        SELECT from_node FROM edges WHERE to_node = ?
                      )
                    """,
                    (node_id, node_id, node_id),
                )
            ]
        return {"node_id": node_id, "neighbors": nodes, "edges": edges}

    def delete_conversation(self, conversation_id: str) -> Dict[str, Any]:
        """Delete a conversation node, what it ``contains`` and their derived nodes.

        Removes the nodes reached by ``contains`` edges from the conversation,
        the chunk/semantic/structure nodes hanging off those, and finally any
        Topic node left without edges. Edges and chunk rows go with their
        nodes through ``ON DELETE CASCADE``.

        NOTE(review): nodes attached via ``has_event`` edges are not collected
        here and stored blob files are not removed; confirm that is intended.

        Returns:
            ``{"status": "skipped", "removed_nodes": 0}`` for an empty id,
            otherwise ``{"status": "ok", "conversation_id": ..., "removed_nodes": n}``
            where ``n`` counts the ids scheduled for deletion (including the
            conversation itself, excluding orphaned topics).
        """
        conversation_id = str(conversation_id or "").strip()
        if not conversation_id:
            return {"status": "skipped", "removed_nodes": 0}
        conv_id = f"conversation:{_slug(conversation_id)}"
        with self._session() as conn:
            direct_ids = [
                row["to_node"]
                for row in conn.execute(
                    "SELECT to_node FROM edges WHERE from_node=? AND type='contains'",
                    (conv_id,),
                )
            ]
            remove_ids = set(direct_ids)
            for source_id in list(direct_ids):
                for row in conn.execute(
                    """
                    SELECT to_node FROM edges
                    WHERE from_node=? AND type IN ('has_chunk', 'implies', 'contains_signal', 'has_page', 'has_slide', 'has_sheet', 'contains_image')
                    """,
                    (source_id,),
                ):
                    remove_ids.add(row["to_node"])
            remove_ids.add(conv_id)
            for node_id in remove_ids:
                conn.execute("DELETE FROM nodes WHERE id=?", (node_id,))
            conn.execute(
                """
                DELETE FROM nodes
                WHERE type='Topic'
                  AND id NOT IN (SELECT to_node FROM edges)
                  AND id NOT IN (SELECT from_node FROM edges)
                """
            )
        return {"status": "ok", "conversation_id": conversation_id, "removed_nodes": len(remove_ids)}

    def clear_all(self) -> Dict[str, Any]:
        """Delete every node, edge and chunk and empty the blob directory.

        Returns:
            ``{"status": "ok", "removed": {"nodes": n, "edges": n, "chunks": n}}``
            with the row counts observed just before deletion.
        """
        with self._session() as conn:
            counts = {
                "nodes": conn.execute("SELECT COUNT(*) AS c FROM nodes").fetchone()["c"],
                "edges": conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"],
                "chunks": conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"],
            }
            conn.execute("DELETE FROM chunks")
            conn.execute("DELETE FROM edges")
            conn.execute("DELETE FROM nodes")
        if self.blob_dir.exists():
            shutil.rmtree(self.blob_dir, ignore_errors=True)
        self.blob_dir.mkdir(parents=True, exist_ok=True)
        return {"status": "ok", "removed": counts}

    def stats(self) -> Dict[str, Any]:
        """Return the database path and per-type counts of nodes and edges."""
        with self._session() as conn:
            node_counts = {
                row["type"]: row["count"]
                for row in conn.execute("SELECT type, COUNT(*) AS count FROM nodes GROUP BY type")
            }
            edge_counts = {
                row["type"]: row["count"]
                for row in conn.execute("SELECT type, COUNT(*) AS count FROM edges GROUP BY type")
            }
        return {"db_path": str(self.db_path), "nodes": node_counts, "edges": edge_counts}