ltcai 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ portable database so it can later migrate to Neo4j/Postgres without changing
6
6
  the ingestion contract.
7
7
  """
8
8
 
9
+ import asyncio
9
10
  import hashlib
10
11
  import json
11
12
  import logging
@@ -26,6 +27,12 @@ try:
26
27
  except Exception: # pragma: no cover - v2 schema is optional at import time
27
28
  KGStoreV2 = None # type: ignore[assignment]
28
29
 
30
# Module-level handle to the LLM router. It stays None until set_llm_router()
# is called; the LLM extraction helpers return None while it is unset.
_llm_router_ref = None


def set_llm_router(router_instance: Any) -> None:
    """Store *router_instance* in the module-level ``_llm_router_ref``.

    Args:
        router_instance: Object to register. The extraction helpers in this
            module read ``current_model_id`` from it and call its
            ``generate()``. Passing ``None`` unsets the router.
    """
    global _llm_router_ref
    _llm_router_ref = router_instance
35
+
29
36
 
30
37
  GRAPH_SCHEMA_VERSION = 1
31
38
 
@@ -365,6 +372,109 @@ def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
365
372
  return chunks
366
373
 
367
374
 
375
# Prompt template for LLM concept extraction. Filled with str.format():
#   {limit} - maximum number of concepts to request
#   {text}  - the passage to analyse
# The reply is expected to be a JSON array of {"concept", "importance"} objects.
_LLM_EXTRACT_CONCEPT_PROMPT = """Extract the key concepts from the following text.
Return ONLY a JSON array of objects, each with "concept" (string) and "importance" (float 0-1).
Extract up to {limit} concepts. Focus on named entities, technical terms, and domain-specific nouns.
Do NOT include common words, stop words, or generic terms.

Text:
{text}

JSON:"""

# Prompt template for LLM triple extraction. Filled with str.format():
#   {limit}    - maximum number of triples to request
#   {text}     - the passage to analyse
#   {concepts} - comma-separated concepts that were already identified
# The reply is expected to be a JSON array of
# {"subject", "relation", "object", "evidence", "confidence"} objects.
_LLM_EXTRACT_TRIPLE_PROMPT = """Extract relationship triples from the following text.
Return ONLY a JSON array of objects, each with:
- "subject": source concept (string)
- "relation": relationship verb (string, Korean or English)
- "object": target concept (string)
- "evidence": the sentence supporting this triple (string, max 240 chars)
- "confidence": how confident you are (float 0-1)

Extract up to {limit} triples. Focus on meaningful semantic relationships.

Text:
{text}

Concepts already identified: {concepts}

JSON:"""

# Feature flag read once at import time from the LATTICEAI_LLM_EXTRACTION
# environment variable. Enabled by default; only "1", "true" or "yes"
# (case-insensitive) count as enabled, any other value disables it.
ENABLE_LLM_EXTRACTION = os.getenv("LATTICEAI_LLM_EXTRACTION", "true").lower() in ("1", "true", "yes")
403
+
404
+
405
def _run_llm_generate(prompt: str, max_tokens: int, timeout: float = 30.0) -> Any:
    """Drive the router's async ``generate()`` to completion from sync code.

    Args:
        prompt: Fully formatted prompt text.
        max_tokens: Forwarded to ``generate()``.
        timeout: Seconds to wait when the call has to be off-loaded to a
            helper thread (i.e. an event loop is already running here).

    Returns:
        Whatever ``generate()`` returns.

    Raises:
        Any exception raised by ``generate()``, or a timeout error when the
        off-loaded call does not finish in time.
    """
    import concurrent.futures

    router = _llm_router_ref

    async def _bounded() -> Any:
        # wait_for cancels the request at the deadline so the helper thread
        # can finish instead of lingering in the background.
        return await asyncio.wait_for(
            router.generate(prompt, max_tokens=max_tokens, temperature=0.1),
            timeout=timeout,
        )

    try:
        # get_running_loop() never creates a loop and works in any thread;
        # get_event_loop() raises in worker threads and is deprecated when
        # no loop is running.
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread, so asyncio.run() is safe.
        return asyncio.run(router.generate(prompt, max_tokens=max_tokens, temperature=0.1))

    # A loop is already running in this thread and asyncio.run() cannot nest,
    # so run the coroutine on its own loop in a helper thread.
    # NOTE(review): this still blocks the calling thread (and therefore the
    # running loop) until the result or the timeout arrives.
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        return pool.submit(asyncio.run, _bounded()).result(timeout=timeout + 1.0)
    finally:
        # wait=False: a `with` block would join the worker on exit and make
        # the timeout above ineffective.
        pool.shutdown(wait=False)


def _parse_llm_json_array(raw: Any) -> Optional[list]:
    """Decode an LLM reply into a list, tolerating a Markdown code fence.

    Returns ``None`` when the decoded JSON is not an array. Raises if *raw*
    is not a string or is not valid JSON; callers treat that as a failure.
    """
    payload = raw.strip()
    if payload.startswith("```"):
        payload = re.sub(r"^```(?:json)?\s*", "", payload)
        payload = re.sub(r"\s*```$", "", payload)
    parsed = json.loads(payload)
    return parsed if isinstance(parsed, list) else None


def _coerce_confidence(value: Any, default: float = 0.8) -> float:
    """Convert an LLM-supplied confidence to a float clamped to [0, 1]."""
    try:
        confidence = float(value)
    except (TypeError, ValueError):
        return default
    if confidence != confidence:  # NaN compares unequal to itself
        return default
    return max(0.0, min(1.0, confidence))


def _llm_extract_concepts(text: str, limit: int = 12) -> Optional[List[str]]:
    """Ask the configured LLM router for up to *limit* concepts in *text*.

    Only the first 3000 characters of *text* are sent. Array items may be
    ``{"concept": ...}`` objects or bare strings; blank or non-string values
    are ignored.

    Returns:
        A non-empty list of concept strings, or ``None`` when extraction is
        disabled, no router/model is configured, the call fails, or nothing
        usable comes back. ``None`` tells the caller to use the rule-based
        extractor.
    """
    if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
        return None
    if not _llm_router_ref.current_model_id:
        return None
    prompt = _LLM_EXTRACT_CONCEPT_PROMPT.format(text=text[:3000], limit=limit)
    try:
        parsed = _parse_llm_json_array(_run_llm_generate(prompt, max_tokens=1024))
        if parsed is None:
            return None
        concepts: List[str] = []
        for item in parsed[:limit]:
            value = item.get("concept") if isinstance(item, dict) else item
            if isinstance(value, str) and value.strip():
                concepts.append(value.strip())
        return concepts or None
    except Exception as e:
        # Deliberate best-effort: any failure falls back to the rule-based path.
        logging.debug("LLM concept extraction failed (falling back to rules): %s", e)
        return None


def _llm_extract_triples(text: str, concepts: List[str], limit: int = 20) -> Optional[List[Dict[str, str]]]:
    """Ask the configured LLM router for up to *limit* relationship triples.

    Only the first 3000 characters of *text* and the first 15 *concepts* are
    sent. Items without a usable subject or object are skipped, and a
    malformed ``confidence`` falls back to 0.8 instead of discarding the
    whole reply.

    Returns:
        A non-empty list of dicts with keys ``subject``, ``relation``,
        ``object``, ``context`` (evidence, at most 240 characters) and
        ``confidence`` (a float in [0, 1]), or ``None`` when extraction is
        disabled, unavailable, fails, or yields nothing usable.
    """
    if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
        return None
    if not _llm_router_ref.current_model_id:
        return None
    prompt = _LLM_EXTRACT_TRIPLE_PROMPT.format(
        text=text[:3000],
        limit=limit,
        concepts=", ".join(str(concept) for concept in concepts[:15]),
    )
    try:
        parsed = _parse_llm_json_array(_run_llm_generate(prompt, max_tokens=2048))
        if parsed is None:
            return None
        triples = []
        for item in parsed[:limit]:
            if not isinstance(item, dict):
                continue
            subject = str(item.get("subject") or "").strip()
            target = str(item.get("object") or "").strip()
            if not subject or not target:
                continue
            triples.append({
                "subject": subject,
                "relation": str(item.get("relation") or "관련됨"),
                "object": target,
                "context": str(item.get("evidence") or "")[:240],
                "confidence": _coerce_confidence(item.get("confidence")),
            })
        return triples or None
    except Exception as e:
        # Deliberate best-effort: any failure falls back to the rule-based path.
        logging.debug("LLM triple extraction failed (falling back to rules): %s", e)
        return None
476
+
477
+
368
478
  _CONCEPT_STOP: set = {
369
479
  # English stop words
370
480
  "the", "and", "for", "with", "this", "that", "from", "into", "which",
@@ -385,7 +495,15 @@ _CONCEPT_STOP: set = {
385
495
 
386
496
 
387
497
def _extract_concepts(text: str, limit: int = 12) -> List[str]:
    """Return concepts for *text*, preferring the LLM extractor.

    The rule-based extractor is used whenever the LLM path returns a falsy
    result (``None`` or an empty list).
    """
    return _llm_extract_concepts(text, limit) or _extract_concepts_rules(text, limit)
503
+
504
+
505
+ def _extract_concepts_rules(text: str, limit: int = 12) -> List[str]:
506
+ """Extract meaningful named concepts from text (rule-based).
389
507
 
390
508
  Priority order:
391
509
  1. Backtick / quoted terms (explicitly technical)
@@ -586,7 +704,19 @@ def _extract_triples(
586
704
  concepts: List[str],
587
705
  limit: int = 20,
588
706
  ) -> List[Dict[str, str]]:
589
- """Extract (subject, verb-edge, object, context) triples from text.
707
+ """LLM-first triple extraction with rule-based fallback."""
708
+ llm_result = _llm_extract_triples(text, concepts, limit)
709
+ if llm_result:
710
+ return llm_result
711
+ return _extract_triples_rules(text, concepts, limit)
712
+
713
+
714
+ def _extract_triples_rules(
715
+ text: str,
716
+ concepts: List[str],
717
+ limit: int = 20,
718
+ ) -> List[Dict[str, str]]:
719
+ """Extract (subject, verb-edge, object, context) triples from text (rule-based).
590
720
 
591
721
  For each sentence containing ≥2 concepts, infer the verb-form edge label
592
722
  from surrounding context and create a directed triple.
@@ -1223,21 +1353,44 @@ class KnowledgeGraphStore:
1223
1353
  from docx import Document
1224
1354
  doc = Document(str(path))
1225
1355
  paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
1356
+ table_lines = []
1357
+ for table in doc.tables:
1358
+ for row in table.rows:
1359
+ cells = [_clean_text(cell.text) for cell in row.cells]
1360
+ if any(cells):
1361
+ table_lines.append("\t".join(cells))
1226
1362
  meta["paragraphs"] = len(paragraphs)
1227
1363
  meta["tables"] = len(doc.tables)
1228
- text = "\n\n".join(paragraphs)
1364
+ meta["table_rows"] = len(table_lines)
1365
+ text = "\n\n".join([*paragraphs, *table_lines])
1229
1366
  elif ext == ".xlsx":
1230
1367
  from openpyxl import load_workbook
1231
1368
  wb = load_workbook(str(path), read_only=True, data_only=True)
1232
1369
  rows_all = []
1370
+ non_empty_rows = 0
1371
+ non_empty_cells = 0
1372
+ char_count = 0
1233
1373
  for ws in wb.worksheets:
1234
- rows_all.append(f"[Sheet: {ws.title}]")
1374
+ sheet_rows = []
1235
1375
  for row in ws.iter_rows(values_only=True):
1236
- cells = [str(cell) if cell is not None else "" for cell in row]
1237
- rows_all.append("\t".join(cells))
1238
- if len("\n".join(rows_all)) > 200_000:
1376
+ cells = [str(cell).strip() if cell is not None else "" for cell in row]
1377
+ if not any(cells):
1378
+ continue
1379
+ line = "\t".join(cells)
1380
+ non_empty_rows += 1
1381
+ non_empty_cells += sum(1 for cell in cells if cell)
1382
+ sheet_rows.append(line)
1383
+ char_count += len(line) + 1
1384
+ if char_count > 200_000:
1239
1385
  break
1386
+ if sheet_rows:
1387
+ rows_all.append(f"[Sheet: {ws.title}]")
1388
+ rows_all.extend(sheet_rows)
1389
+ if char_count > 200_000:
1390
+ break
1240
1391
  meta["sheets"] = len(wb.worksheets)
1392
+ meta["rows"] = non_empty_rows
1393
+ meta["cells"] = non_empty_cells
1241
1394
  text = "\n".join(rows_all)
1242
1395
  elif ext == ".pptx":
1243
1396
  from pptx import Presentation
@@ -1247,9 +1400,13 @@ class KnowledgeGraphStore:
1247
1400
  parts = []
1248
1401
  for shape in slide.shapes:
1249
1402
  if getattr(shape, "has_text_frame", False):
1250
- parts.append(shape.text_frame.text)
1251
- slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
1403
+ slide_text = shape.text_frame.text.strip()
1404
+ if slide_text:
1405
+ parts.append(slide_text)
1406
+ if parts:
1407
+ slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
1252
1408
  meta["slides"] = len(prs.slides)
1409
+ meta["text_slides"] = len(slides_text)
1253
1410
  text = "\n\n".join(slides_text)
1254
1411
  elif category == "image":
1255
1412
  from PIL import Image
@@ -1362,13 +1519,13 @@ class KnowledgeGraphStore:
1362
1519
  extension=excluded.extension,
1363
1520
  size_bytes=excluded.size_bytes,
1364
1521
  modified_at=excluded.modified_at,
1365
- sha256=COALESCE(excluded.sha256, local_file_index.sha256),
1522
+ sha256=excluded.sha256,
1366
1523
  last_scanned_at=excluded.last_scanned_at,
1367
- last_indexed_at=COALESCE(excluded.last_indexed_at, local_file_index.last_indexed_at),
1524
+ last_indexed_at=excluded.last_indexed_at,
1368
1525
  parser_type=excluded.parser_type,
1369
1526
  status=excluded.status,
1370
1527
  error_message=excluded.error_message,
1371
- graph_node_id=COALESCE(excluded.graph_node_id, local_file_index.graph_node_id),
1528
+ graph_node_id=excluded.graph_node_id,
1372
1529
  deleted=excluded.deleted,
1373
1530
  metadata_json=excluded.metadata_json
1374
1531
  """,
@@ -1381,6 +1538,113 @@ class KnowledgeGraphStore:
1381
1538
  )
1382
1539
  return index_id
1383
1540
 
1541
+ def _delete_local_file_graph(self, conn: sqlite3.Connection, file_node_id: Optional[str]) -> None:
1542
+ if not file_node_id:
1543
+ return
1544
+
1545
+ file_row = conn.execute(
1546
+ "SELECT metadata_json FROM nodes WHERE id=?",
1547
+ (file_node_id,),
1548
+ ).fetchone()
1549
+ source_id = None
1550
+ if file_row:
1551
+ source_id = _safe_loads(file_row["metadata_json"]).get("source_id")
1552
+
1553
+ linked_rows = conn.execute(
1554
+ """
1555
+ SELECT n.id, n.type, n.metadata_json
1556
+ FROM edges e
1557
+ JOIN nodes n ON n.id=e.to_node
1558
+ WHERE e.from_node=?
1559
+ """,
1560
+ (file_node_id,),
1561
+ ).fetchall()
1562
+ owned_ids: set = set()
1563
+ auto_candidate_ids: set = set()
1564
+ for row in linked_rows:
1565
+ metadata = _safe_loads(row["metadata_json"])
1566
+ if row["type"] in {"Chunk", "ImageText", "Section"} or metadata.get("source_node") == file_node_id:
1567
+ owned_ids.add(row["id"])
1568
+ elif metadata.get("auto_extracted") and metadata.get("source") == "local_folder":
1569
+ auto_candidate_ids.add(row["id"])
1570
+
1571
+ conn.execute("DELETE FROM chunks WHERE source_node=?", (file_node_id,))
1572
+ conn.execute("DELETE FROM edges WHERE from_node=? OR to_node=?", (file_node_id, file_node_id))
1573
+ conn.execute("DELETE FROM nodes WHERE id=?", (file_node_id,))
1574
+
1575
+ def delete_nodes(node_ids: set) -> None:
1576
+ if not node_ids:
1577
+ return
1578
+ placeholders = ",".join("?" * len(node_ids))
1579
+ params = list(node_ids)
1580
+ conn.execute(f"DELETE FROM chunks WHERE source_node IN ({placeholders})", params)
1581
+ conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", params * 2)
1582
+ conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", params)
1583
+
1584
+ delete_nodes(owned_ids)
1585
+
1586
+ removable_auto_ids: set = set()
1587
+ for node_id in auto_candidate_ids:
1588
+ remaining_edges = conn.execute(
1589
+ "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
1590
+ (node_id, node_id),
1591
+ ).fetchall()
1592
+ if all(
1593
+ (row["from_node"] in auto_candidate_ids and row["to_node"] in auto_candidate_ids)
1594
+ for row in remaining_edges
1595
+ ):
1596
+ removable_auto_ids.add(node_id)
1597
+ delete_nodes(removable_auto_ids)
1598
+ if source_id:
1599
+ self._cleanup_local_graph_orphans(conn, str(source_id))
1600
+
1601
+ def _cleanup_local_graph_orphans(self, conn: sqlite3.Connection, source_id: str) -> None:
1602
+ while True:
1603
+ folder_rows = conn.execute(
1604
+ "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
1605
+ ).fetchall()
1606
+ leaf_ids = []
1607
+ for row in folder_rows:
1608
+ metadata = _safe_loads(row["metadata_json"])
1609
+ if metadata.get("source_id") != source_id:
1610
+ continue
1611
+ has_children = conn.execute(
1612
+ "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
1613
+ (row["id"],),
1614
+ ).fetchone()
1615
+ if not has_children:
1616
+ leaf_ids.append(row["id"])
1617
+ if not leaf_ids:
1618
+ break
1619
+ placeholders = ",".join("?" * len(leaf_ids))
1620
+ conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", leaf_ids * 2)
1621
+ conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", leaf_ids)
1622
+
1623
+ for node_type in ("Drive", "Computer"):
1624
+ rows = conn.execute("SELECT id FROM nodes WHERE type=?", (node_type,)).fetchall()
1625
+ removable = []
1626
+ for row in rows:
1627
+ has_children = conn.execute(
1628
+ "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
1629
+ (row["id"],),
1630
+ ).fetchone()
1631
+ if not has_children:
1632
+ removable.append(row["id"])
1633
+ if removable:
1634
+ placeholders = ",".join("?" * len(removable))
1635
+ conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", removable * 2)
1636
+ conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", removable)
1637
+
1638
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Tell whether an index row recorded a positive extracted-text length.

    Reads ``metadata_json`` from *row* and looks up
    ``metadata["parser"]["extracted_chars"]``. Returns ``False`` when any
    level is missing, has the wrong type, or the value is not an integer
    greater than zero.
    """
    metadata = _safe_loads(row["metadata_json"])
    if not isinstance(metadata, dict):
        return False
    parser_info = metadata.get("parser")
    if not isinstance(parser_info, dict):
        return False
    try:
        extracted = int(parser_info.get("extracted_chars") or 0)
    except (TypeError, ValueError):
        return False
    return extracted > 0
1647
+
1384
1648
  def _upsert_local_file_node(
1385
1649
  self,
1386
1650
  conn: sqlite3.Connection,
@@ -1397,6 +1661,9 @@ class KnowledgeGraphStore:
1397
1661
  text: str,
1398
1662
  parser_meta: Dict[str, Any],
1399
1663
  ) -> str:
1664
+ text = _clean_text(text)
1665
+ if not text:
1666
+ raise ValueError("텍스트 추출 결과가 비어 있습니다.")
1400
1667
  try:
1401
1668
  relative_path = file_path.relative_to(root).as_posix()
1402
1669
  except ValueError:
@@ -1446,7 +1713,7 @@ class KnowledgeGraphStore:
1446
1713
  file_node_id,
1447
1714
  _node_type_for_category(category),
1448
1715
  file_path.name,
1449
- summary=(_clean_text(text) or relative_path)[:700],
1716
+ summary=text[:700],
1450
1717
  metadata=metadata,
1451
1718
  raw=metadata,
1452
1719
  )
@@ -1488,7 +1755,7 @@ class KnowledgeGraphStore:
1488
1755
  )
1489
1756
  self._upsert_edge(conn, file_node_id, chunk_id, "포함함", weight=0.7, metadata={"source": "local_scan"})
1490
1757
 
1491
- concepts = _extract_concepts(f"{file_path.name}\n{target_for_concepts}", limit=18)
1758
+ concepts = _extract_concepts(target_for_concepts, limit=18)
1492
1759
  concept_ids: Dict[str, str] = {}
1493
1760
  for concept in concepts:
1494
1761
  node_t = _classify_node_type(concept, target_for_concepts)
@@ -1620,10 +1887,21 @@ class KnowledgeGraphStore:
1620
1887
  except ValueError:
1621
1888
  relative_path = file_path.name
1622
1889
  seen_relative_paths.add(relative_path)
1890
+ modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
1891
+ existing = conn.execute(
1892
+ """
1893
+ SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
1894
+ FROM local_file_index
1895
+ WHERE source_id=? AND relative_path=?
1896
+ """,
1897
+ (source_id, relative_path),
1898
+ ).fetchone()
1623
1899
  decision = self._local_file_decision(file_path, root, stat)
1624
1900
  parser_type = decision["parser_type"]
1625
1901
  if not decision["indexable"]:
1626
1902
  counts[decision["status"]] += 1
1903
+ if existing and existing["graph_node_id"]:
1904
+ self._delete_local_file_graph(conn, existing["graph_node_id"])
1627
1905
  self._upsert_local_file_index(
1628
1906
  conn,
1629
1907
  source_id=source_id,
@@ -1638,19 +1916,11 @@ class KnowledgeGraphStore:
1638
1916
  )
1639
1917
  continue
1640
1918
 
1641
- modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
1642
- existing = conn.execute(
1643
- """
1644
- SELECT size_bytes, modified_at, sha256, graph_node_id, status
1645
- FROM local_file_index
1646
- WHERE source_id=? AND relative_path=?
1647
- """,
1648
- (source_id, relative_path),
1649
- ).fetchone()
1650
1919
  if (
1651
1920
  existing
1652
1921
  and existing["status"] == "indexed"
1653
1922
  and existing["graph_node_id"]
1923
+ and self._local_file_index_has_extracted_text(existing)
1654
1924
  and existing["size_bytes"] == stat.st_size
1655
1925
  and existing["modified_at"] == modified_at
1656
1926
  ):
@@ -1667,7 +1937,7 @@ class KnowledgeGraphStore:
1667
1937
  parser_type=parser_type,
1668
1938
  sha256=existing["sha256"],
1669
1939
  graph_node_id=existing["graph_node_id"],
1670
- metadata={"category": decision["category"], "unchanged": True},
1940
+ metadata={**_safe_loads(existing["metadata_json"]), "category": decision["category"], "unchanged": True},
1671
1941
  )
1672
1942
  continue
1673
1943
 
@@ -1677,6 +1947,8 @@ class KnowledgeGraphStore:
1677
1947
  except Exception as exc:
1678
1948
  counts["failed"] += 1
1679
1949
  errors.append({"path": str(file_path), "error": str(exc)})
1950
+ if existing and existing["graph_node_id"]:
1951
+ self._delete_local_file_graph(conn, existing["graph_node_id"])
1680
1952
  self._upsert_local_file_index(
1681
1953
  conn,
1682
1954
  source_id=source_id,
@@ -1692,7 +1964,12 @@ class KnowledgeGraphStore:
1692
1964
  )
1693
1965
  continue
1694
1966
 
1695
- if existing and existing["sha256"] == digest and existing["graph_node_id"]:
1967
+ if (
1968
+ existing
1969
+ and existing["sha256"] == digest
1970
+ and existing["graph_node_id"]
1971
+ and self._local_file_index_has_extracted_text(existing)
1972
+ ):
1696
1973
  counts["skipped_unchanged"] += 1
1697
1974
  self._upsert_local_file_index(
1698
1975
  conn,
@@ -1706,7 +1983,7 @@ class KnowledgeGraphStore:
1706
1983
  parser_type=parser_type,
1707
1984
  sha256=digest,
1708
1985
  graph_node_id=existing["graph_node_id"],
1709
- metadata={"category": decision["category"], "sha256_unchanged": True},
1986
+ metadata={**_safe_loads(existing["metadata_json"]), "category": decision["category"], "sha256_unchanged": True},
1710
1987
  )
1711
1988
  continue
1712
1989
 
@@ -1716,6 +1993,27 @@ class KnowledgeGraphStore:
1716
1993
  decision["category"],
1717
1994
  include_ocr=include_ocr,
1718
1995
  )
1996
+ text = _clean_text(text)
1997
+ parser_meta = {**parser_meta, "extracted_chars": len(text)}
1998
+ if not text:
1999
+ counts["skipped_empty_text"] += 1
2000
+ if existing and existing["graph_node_id"]:
2001
+ self._delete_local_file_graph(conn, existing["graph_node_id"])
2002
+ self._upsert_local_file_index(
2003
+ conn,
2004
+ source_id=source_id,
2005
+ root=root,
2006
+ file_path=file_path,
2007
+ stat=stat,
2008
+ os_type=os_type,
2009
+ drive_id=drive_id,
2010
+ status="skipped_empty_text",
2011
+ parser_type=parser_type,
2012
+ sha256=digest,
2013
+ error_message="텍스트 추출 결과가 비어 있습니다.",
2014
+ metadata={"category": decision["category"], "parser": parser_meta},
2015
+ )
2016
+ continue
1719
2017
  graph_node_id = self._upsert_local_file_node(
1720
2018
  conn,
1721
2019
  source_id=source_id,
@@ -1749,6 +2047,8 @@ class KnowledgeGraphStore:
1749
2047
  except Exception as exc:
1750
2048
  counts["failed"] += 1
1751
2049
  errors.append({"path": str(file_path), "error": str(exc)})
2050
+ if existing and existing["graph_node_id"]:
2051
+ self._delete_local_file_graph(conn, existing["graph_node_id"])
1752
2052
  self._upsert_local_file_index(
1753
2053
  conn,
1754
2054
  source_id=source_id,
@@ -1765,19 +2065,20 @@ class KnowledgeGraphStore:
1765
2065
  )
1766
2066
 
1767
2067
  if not limit_reached:
1768
- existing_paths = {
1769
- row["relative_path"]
2068
+ existing_rows = {
2069
+ row["relative_path"]: row["graph_node_id"]
1770
2070
  for row in conn.execute(
1771
- "SELECT relative_path FROM local_file_index WHERE source_id=?",
2071
+ "SELECT relative_path, graph_node_id FROM local_file_index WHERE source_id=?",
1772
2072
  (source_id,),
1773
2073
  )
1774
2074
  }
1775
- deleted_paths = existing_paths - seen_relative_paths
2075
+ deleted_paths = set(existing_rows) - seen_relative_paths
1776
2076
  for relative_path in deleted_paths:
2077
+ self._delete_local_file_graph(conn, existing_rows.get(relative_path))
1777
2078
  conn.execute(
1778
2079
  """
1779
2080
  UPDATE local_file_index
1780
- SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL
2081
+ SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
1781
2082
  WHERE source_id=? AND relative_path=?
1782
2083
  """,
1783
2084
  (_now(), source_id, relative_path),
@@ -2639,3 +2940,170 @@ class KnowledgeGraphStore:
2639
2940
  "local_file_status": local_file_status,
2640
2941
  "v2": v2,
2641
2942
  }
2943
+
2944
def search_for_document_generation(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Hybrid retrieval optimized for document generation.

    Candidates are nodes of a fixed set of types whose title, summary or
    metadata JSON contains the whole query or one of the topic terms derived
    from it. The query text is matched literally: ``%`` and ``_`` are
    escaped rather than treated as LIKE wildcards.

    Scoring: ``(0.5*text + 0.3*graph + 0.2*recency) * boost`` where

    * text    - fraction of topic terms found in the node's text,
    * graph   - ``log1p(edge count) / 4`` capped at 1,
    * recency - ``_recency_score`` of ``updated_at`` (half-life 14 days),
    * boost   - 1.2 for Document/File/SlideDeck/Decision nodes, else 1.0,
      so ``hybrid_score`` can exceed 1.0.

    Args:
        query: Free-text query; blank input returns an empty list.
        limit: Maximum number of results, clamped to 1..50.

    Returns:
        Result dicts sorted by ``hybrid_score`` descending. Each has the
        node fields, the component ``scores`` and up to 8
        ``related_concepts`` (neighbouring Concept/Feature/Decision/Task
        nodes).
    """
    query = str(query or "").strip()
    if not query:
        return []
    limit = max(1, min(int(limit or 10), 50))
    terms = _topic_candidates(query, limit=12)
    now = datetime.now()

    def like_pattern(needle: Any) -> str:
        # Escape the escape character first, then the LIKE wildcards.
        escaped = str(needle).replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    candidate_sql = """
        SELECT id, type, title, summary, metadata_json, updated_at
        FROM nodes
        WHERE (title LIKE ? ESCAPE '\\' OR summary LIKE ? ESCAPE '\\' OR metadata_json LIKE ? ESCAPE '\\')
          AND type IN ('Document', 'File', 'CodeFile', 'SlideDeck',
                       'Spreadsheet', 'Image', 'ImageText', 'Chat',
                       'Decision', 'Task', 'Concept', 'Feature',
                       'Page', 'Slide')
        ORDER BY updated_at DESC
        LIMIT ?
    """
    # The full query gets a wider candidate window than each derived term.
    searches = [(query, limit * 5)]
    searches.extend((term, limit * 3) for term in terms)

    with self._connect() as conn:
        candidate_rows = []
        seen_ids = set()
        for needle, row_cap in searches:
            pattern = like_pattern(needle)
            rows = conn.execute(candidate_sql, (pattern, pattern, pattern, row_cap)).fetchall()
            for row in rows:
                if row["id"] not in seen_ids:
                    seen_ids.add(row["id"])
                    candidate_rows.append(row)

        scored_results = []
        for row in candidate_rows:
            haystack = f"{row['title']} {row['summary']} {row['metadata_json']}".lower()

            text_hits = sum(1 for term in terms if term.lower() in haystack)
            text_score = min(1.0, text_hits / max(len(terms), 1))

            edge_count = conn.execute(
                "SELECT COUNT(*) AS c FROM edges WHERE from_node=? OR to_node=?",
                (row["id"], row["id"]),
            ).fetchone()["c"]
            graph_score = min(1.0, math.log1p(edge_count) / 4.0)

            recency = _recency_score(row["updated_at"], now=now, half_life_days=14.0)

            doc_type_boost = 1.2 if row["type"] in (
                "Document", "File", "SlideDeck", "Decision",
            ) else 1.0

            hybrid_score = (
                0.5 * text_score
                + 0.3 * graph_score
                + 0.2 * recency
            ) * doc_type_boost

            neighbor_rows = conn.execute(
                """
                SELECT n.title, n.type FROM edges e
                JOIN nodes n ON n.id = CASE WHEN e.from_node = ? THEN e.to_node ELSE e.from_node END
                WHERE (e.from_node = ? OR e.to_node = ?)
                  AND n.type IN ('Concept', 'Feature', 'Decision', 'Task')
                LIMIT 8
                """,
                (row["id"], row["id"], row["id"]),
            ).fetchall()
            neighbor_concepts = [
                {"title": neighbor["title"], "type": neighbor["type"]}
                for neighbor in neighbor_rows
            ]

            scored_results.append({
                "id": row["id"],
                "type": row["type"],
                "title": row["title"],
                "summary": row["summary"],
                "metadata": _safe_loads(row["metadata_json"]),
                "updated_at": row["updated_at"],
                "hybrid_score": round(hybrid_score, 4),
                "scores": {
                    "text": round(text_score, 4),
                    "graph": round(graph_score, 4),
                    "recency": round(recency, 4),
                },
                "related_concepts": neighbor_concepts,
            })

    scored_results.sort(key=lambda x: x["hybrid_score"], reverse=True)
    return scored_results[:limit]
3061
+
3062
def multi_hop_context(self, node_ids: List[str], max_hops: int = 2) -> Dict[str, Any]:
    """Collect the graph neighbourhood within *max_hops* edges of the seeds.

    Traversal is breadth-first and ignores edge direction. Seeds are
    reported with ``hop`` 0, their neighbours with ``hop`` 1, and so on up
    to and including ``max_hops``. Nodes on the outermost ring are not
    expanded, so every returned edge was discovered from a node whose hop
    is below ``max_hops``.

    Args:
        node_ids: Seed node ids; ``None`` or an empty list yields an empty
            result without opening a database connection.
        max_hops: Maximum distance from a seed; negative values yield an
            empty result.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. Node dicts carry ``id``,
        ``type``, ``title``, ``summary``, ``metadata`` and ``hop``; edge
        dicts carry ``from``, ``to``, ``type`` and ``weight``. Ids with no
        row in ``nodes`` are traversed but not listed.
    """
    all_nodes: List[Dict[str, Any]] = []
    all_edges: List[Dict[str, Any]] = []
    frontier = set(node_ids or [])
    if not frontier or max_hops < 0:
        return {"nodes": all_nodes, "edges": all_edges}

    visited_nodes = set()
    visited_edges = set()

    with self._connect() as conn:
        # max_hops + 1 rings: ring 0 is the seeds, ring max_hops the farthest
        # nodes. Stopping one ring earlier would return edges whose far
        # endpoint is missing from "nodes".
        for hop in range(max_hops + 1):
            if not frontier:
                break
            expand = hop < max_hops
            next_frontier = set()
            for nid in frontier:
                if nid in visited_nodes:
                    continue
                visited_nodes.add(nid)
                row = conn.execute(
                    "SELECT id, type, title, summary, metadata_json, updated_at FROM nodes WHERE id=?",
                    (nid,),
                ).fetchone()
                if row:
                    all_nodes.append({
                        "id": row["id"], "type": row["type"],
                        "title": row["title"], "summary": row["summary"],
                        "metadata": _safe_loads(row["metadata_json"]),
                        "hop": hop,
                    })
                if not expand:
                    # Outermost ring: report the node but do not follow its edges.
                    continue
                edge_rows = conn.execute(
                    """
                    SELECT id, from_node, to_node, type, weight
                    FROM edges WHERE from_node=? OR to_node=?
                    """,
                    (nid, nid),
                ).fetchall()
                for er in edge_rows:
                    if er["id"] not in visited_edges:
                        visited_edges.add(er["id"])
                        all_edges.append({
                            "from": er["from_node"], "to": er["to_node"],
                            "type": er["type"], "weight": er["weight"],
                        })
                    other = er["to_node"] if er["from_node"] == nid else er["from_node"]
                    if other not in visited_nodes:
                        next_frontier.add(other)
            frontier = next_frontier

    return {"nodes": all_nodes, "edges": all_edges}