ltcai 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -2
- package/auto_setup.py +15 -1
- package/docs/CHANGELOG.md +67 -0
- package/kg_schema.py +64 -15
- package/knowledge_graph.py +499 -31
- package/latticeai/core/__init__.py +1 -1
- package/latticeai/core/context_builder.py +191 -0
- package/latticeai/core/document_generator.py +103 -0
- package/llm_router.py +148 -1
- package/package.json +2 -2
- package/server.py +207 -27
- package/static/css/tokens.css +26 -0
- package/static/lattice-reference.css +390 -375
- package/latticeai/__pycache__/__init__.cpython-314.pyc +0 -0
- package/latticeai/api/__pycache__/admin.cpython-314.pyc +0 -0
- package/latticeai/api/__pycache__/auth.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/__init__.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/audit.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/security.cpython-314.pyc +0 -0
- package/latticeai/core/__pycache__/sessions.cpython-314.pyc +0 -0
package/knowledge_graph.py
CHANGED
|
@@ -6,6 +6,7 @@ portable database so it can later migrate to Neo4j/Postgres without changing
|
|
|
6
6
|
the ingestion contract.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
import asyncio
|
|
9
10
|
import hashlib
|
|
10
11
|
import json
|
|
11
12
|
import logging
|
|
@@ -26,6 +27,12 @@ try:
|
|
|
26
27
|
except Exception: # pragma: no cover - v2 schema is optional at import time
|
|
27
28
|
KGStoreV2 = None # type: ignore[assignment]
|
|
28
29
|
|
|
30
|
+
_llm_router_ref = None
|
|
31
|
+
|
|
32
|
+
def set_llm_router(router_instance):
    """Register the router object used by the LLM extraction helpers.

    The instance is stored in the module-level ``_llm_router_ref``. Passing
    ``None`` (or any falsy object) leaves LLM extraction disabled, because
    the helpers return early when the reference is falsy.
    """
    global _llm_router_ref
    _llm_router_ref = router_instance
|
|
35
|
+
|
|
29
36
|
|
|
30
37
|
GRAPH_SCHEMA_VERSION = 1
|
|
31
38
|
|
|
@@ -365,6 +372,109 @@ def _chunks(text: str, size: int = 1200, overlap: int = 160) -> List[str]:
|
|
|
365
372
|
return chunks
|
|
366
373
|
|
|
367
374
|
|
|
375
|
+
_LLM_EXTRACT_CONCEPT_PROMPT = """Extract the key concepts from the following text.
|
|
376
|
+
Return ONLY a JSON array of objects, each with "concept" (string) and "importance" (float 0-1).
|
|
377
|
+
Extract up to {limit} concepts. Focus on named entities, technical terms, and domain-specific nouns.
|
|
378
|
+
Do NOT include common words, stop words, or generic terms.
|
|
379
|
+
|
|
380
|
+
Text:
|
|
381
|
+
{text}
|
|
382
|
+
|
|
383
|
+
JSON:"""
|
|
384
|
+
|
|
385
|
+
_LLM_EXTRACT_TRIPLE_PROMPT = """Extract relationship triples from the following text.
|
|
386
|
+
Return ONLY a JSON array of objects, each with:
|
|
387
|
+
- "subject": source concept (string)
|
|
388
|
+
- "relation": relationship verb (string, Korean or English)
|
|
389
|
+
- "object": target concept (string)
|
|
390
|
+
- "evidence": the sentence supporting this triple (string, max 240 chars)
|
|
391
|
+
- "confidence": how confident you are (float 0-1)
|
|
392
|
+
|
|
393
|
+
Extract up to {limit} triples. Focus on meaningful semantic relationships.
|
|
394
|
+
|
|
395
|
+
Text:
|
|
396
|
+
{text}
|
|
397
|
+
|
|
398
|
+
Concepts already identified: {concepts}
|
|
399
|
+
|
|
400
|
+
JSON:"""
|
|
401
|
+
|
|
402
|
+
ENABLE_LLM_EXTRACTION = os.getenv("LATTICEAI_LLM_EXTRACTION", "true").lower() in ("1", "true", "yes")
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _llm_generate_blocking(prompt: str, max_tokens: int, timeout: float = 30.0) -> str:
    """Run ``_llm_router_ref.generate`` to completion from synchronous code.

    Args:
        prompt: Fully formatted prompt text.
        max_tokens: Token budget forwarded to the router.
        timeout: Seconds to wait when the call has to be off-loaded to a
            worker thread (only applies when an event loop is already running
            in the calling thread).

    Returns:
        Whatever the router's ``generate`` coroutine returns.

    Raises:
        Exception: Any error raised by the router, or a timeout error when
            the off-loaded call exceeds ``timeout``.
    """

    async def _generate(deadline: Optional[float]) -> str:
        # The coroutine is created inside the loop that runs it, so nothing
        # is left un-awaited if scheduling fails.
        return await asyncio.wait_for(
            _llm_router_ref.generate(prompt, max_tokens=max_tokens, temperature=0.1),
            timeout=deadline,
        )

    try:
        # get_running_loop() is the supported probe; get_event_loop() is
        # deprecated outside a running loop and raises on newer Python.
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: run directly, unbounded as before.
        return asyncio.run(_generate(None))

    # A loop is already running here, so asyncio.run() must happen in another
    # thread. NOTE(review): this still blocks the calling loop while waiting.
    import concurrent.futures

    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(lambda: asyncio.run(_generate(timeout)))
        # Small grace period so the in-loop timeout normally fires first.
        return future.result(timeout=timeout + 1.0)
    finally:
        # A ``with`` block would join the worker on exit and defeat the
        # timeout above; release the pool without waiting instead.
        pool.shutdown(wait=False)


def _llm_parse_json_array(raw: Any) -> Optional[List[Any]]:
    """Decode the JSON array contained in a raw LLM reply.

    Strips a leading/trailing Markdown code fence and, if the reply still is
    not valid JSON, retries on the outermost ``[...]`` span so that replies
    wrapped in prose are accepted.

    Returns:
        The decoded list, or ``None`` when the reply decodes to a non-list.

    Raises:
        ValueError: If no JSON value can be decoded from the reply.
    """
    reply = str(raw or "").strip()
    if reply.startswith("```"):
        reply = re.sub(r"^```(?:json)?\s*", "", reply)
        reply = re.sub(r"\s*```$", "", reply)
    try:
        parsed = json.loads(reply)
    except ValueError:
        start, end = reply.find("["), reply.rfind("]")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(reply[start:end + 1])
    return parsed if isinstance(parsed, list) else None


def _llm_confidence(value: Any, default: float = 0.8) -> float:
    """Coerce a model-supplied confidence to a float clamped to ``[0, 1]``.

    Non-numeric, missing or NaN values yield ``default`` instead of raising,
    so one malformed item cannot discard the whole batch.
    """
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    if math.isnan(score):
        return default
    return min(1.0, max(0.0, score))


def _llm_extract_concepts(text: str, limit: int = 12) -> Optional[List[str]]:
    """Ask the configured LLM for key concepts in ``text``.

    Args:
        text: Source text; only the first 3000 characters are sent.
        limit: Maximum number of concepts to request and return.

    Returns:
        A list of unique, non-empty concept strings, or ``None`` when LLM
        extraction is disabled, no router/model is configured, the call
        fails, or the reply contains no usable concept. ``None`` tells the
        caller to use the rule-based extractor.
    """
    if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
        return None
    if not _llm_router_ref.current_model_id:
        return None
    prompt = _LLM_EXTRACT_CONCEPT_PROMPT.format(text=text[:3000], limit=limit)
    try:
        parsed = _llm_parse_json_array(_llm_generate_blocking(prompt, max_tokens=1024))
        if parsed is None:
            return None
        concepts: List[str] = []
        for item in parsed[:limit]:
            # Items may be {"concept": ...} objects or bare strings.
            value = item.get("concept") if isinstance(item, dict) else item
            if not isinstance(value, str):
                continue  # null / numeric / nested values are not concepts
            value = value.strip()
            if value and value not in concepts:
                concepts.append(value)
        return concepts or None
    except Exception as e:
        # Best-effort path: any failure falls back to rule-based extraction.
        logging.debug("LLM concept extraction failed (falling back to rules): %s", e)
        return None


def _llm_extract_triples(text: str, concepts: List[str], limit: int = 20) -> Optional[List[Dict[str, Any]]]:
    """Ask the configured LLM for relationship triples in ``text``.

    Args:
        text: Source text; only the first 3000 characters are sent.
        concepts: Already identified concepts; the first 15 are given to the
            model as a hint.
        limit: Maximum number of triples to request and return.

    Returns:
        A list of dicts with keys ``subject``, ``relation``, ``object``,
        ``context`` (evidence, at most 240 characters) and ``confidence``
        (float in ``[0, 1]``), or ``None`` when LLM extraction is
        unavailable, fails, or yields no usable triple.
    """
    if not ENABLE_LLM_EXTRACTION or not _llm_router_ref:
        return None
    if not _llm_router_ref.current_model_id:
        return None
    prompt = _LLM_EXTRACT_TRIPLE_PROMPT.format(
        text=text[:3000],
        limit=limit,
        concepts=", ".join(concepts[:15]),
    )
    try:
        parsed = _llm_parse_json_array(_llm_generate_blocking(prompt, max_tokens=2048))
        if parsed is None:
            return None
        triples: List[Dict[str, Any]] = []
        for item in parsed[:limit]:
            if not isinstance(item, dict):
                continue
            subject, target = item.get("subject"), item.get("object")
            if subject is None or target is None:
                continue  # avoid creating literal "None" endpoints
            subject, target = str(subject).strip(), str(target).strip()
            if not subject or not target:
                continue
            triples.append({
                "subject": subject,
                "relation": str(item.get("relation") or "관련됨"),
                "object": target,
                "context": str(item.get("evidence") or "")[:240],
                "confidence": _llm_confidence(item.get("confidence", 0.8)),
            })
        return triples or None
    except Exception as e:
        # Best-effort path: any failure falls back to rule-based extraction.
        logging.debug("LLM triple extraction failed (falling back to rules): %s", e)
        return None
|
|
476
|
+
|
|
477
|
+
|
|
368
478
|
_CONCEPT_STOP: set = {
|
|
369
479
|
# English stop words
|
|
370
480
|
"the", "and", "for", "with", "this", "that", "from", "into", "which",
|
|
@@ -385,7 +495,15 @@ _CONCEPT_STOP: set = {
|
|
|
385
495
|
|
|
386
496
|
|
|
387
497
|
def _extract_concepts(text: str, limit: int = 12) -> List[str]:
    """Return concepts for ``text``, preferring the LLM extractor.

    The rule-based extractor is used whenever the LLM path yields nothing
    (``None`` or an empty list).
    """
    return _llm_extract_concepts(text, limit) or _extract_concepts_rules(text, limit)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _extract_concepts_rules(text: str, limit: int = 12) -> List[str]:
|
|
506
|
+
"""Extract meaningful named concepts from text (rule-based).
|
|
389
507
|
|
|
390
508
|
Priority order:
|
|
391
509
|
1. Backtick / quoted terms (explicitly technical)
|
|
@@ -586,7 +704,19 @@ def _extract_triples(
|
|
|
586
704
|
concepts: List[str],
|
|
587
705
|
limit: int = 20,
|
|
588
706
|
) -> List[Dict[str, str]]:
|
|
589
|
-
"""
|
|
707
|
+
"""LLM-first triple extraction with rule-based fallback."""
|
|
708
|
+
llm_result = _llm_extract_triples(text, concepts, limit)
|
|
709
|
+
if llm_result:
|
|
710
|
+
return llm_result
|
|
711
|
+
return _extract_triples_rules(text, concepts, limit)
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def _extract_triples_rules(
|
|
715
|
+
text: str,
|
|
716
|
+
concepts: List[str],
|
|
717
|
+
limit: int = 20,
|
|
718
|
+
) -> List[Dict[str, str]]:
|
|
719
|
+
"""Extract (subject, verb-edge, object, context) triples from text (rule-based).
|
|
590
720
|
|
|
591
721
|
For each sentence containing ≥2 concepts, infer the verb-form edge label
|
|
592
722
|
from surrounding context and create a directed triple.
|
|
@@ -1223,21 +1353,44 @@ class KnowledgeGraphStore:
|
|
|
1223
1353
|
from docx import Document
|
|
1224
1354
|
doc = Document(str(path))
|
|
1225
1355
|
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
|
1356
|
+
table_lines = []
|
|
1357
|
+
for table in doc.tables:
|
|
1358
|
+
for row in table.rows:
|
|
1359
|
+
cells = [_clean_text(cell.text) for cell in row.cells]
|
|
1360
|
+
if any(cells):
|
|
1361
|
+
table_lines.append("\t".join(cells))
|
|
1226
1362
|
meta["paragraphs"] = len(paragraphs)
|
|
1227
1363
|
meta["tables"] = len(doc.tables)
|
|
1228
|
-
|
|
1364
|
+
meta["table_rows"] = len(table_lines)
|
|
1365
|
+
text = "\n\n".join([*paragraphs, *table_lines])
|
|
1229
1366
|
elif ext == ".xlsx":
|
|
1230
1367
|
from openpyxl import load_workbook
|
|
1231
1368
|
wb = load_workbook(str(path), read_only=True, data_only=True)
|
|
1232
1369
|
rows_all = []
|
|
1370
|
+
non_empty_rows = 0
|
|
1371
|
+
non_empty_cells = 0
|
|
1372
|
+
char_count = 0
|
|
1233
1373
|
for ws in wb.worksheets:
|
|
1234
|
-
|
|
1374
|
+
sheet_rows = []
|
|
1235
1375
|
for row in ws.iter_rows(values_only=True):
|
|
1236
|
-
cells = [str(cell) if cell is not None else "" for cell in row]
|
|
1237
|
-
|
|
1238
|
-
|
|
1376
|
+
cells = [str(cell).strip() if cell is not None else "" for cell in row]
|
|
1377
|
+
if not any(cells):
|
|
1378
|
+
continue
|
|
1379
|
+
line = "\t".join(cells)
|
|
1380
|
+
non_empty_rows += 1
|
|
1381
|
+
non_empty_cells += sum(1 for cell in cells if cell)
|
|
1382
|
+
sheet_rows.append(line)
|
|
1383
|
+
char_count += len(line) + 1
|
|
1384
|
+
if char_count > 200_000:
|
|
1239
1385
|
break
|
|
1386
|
+
if sheet_rows:
|
|
1387
|
+
rows_all.append(f"[Sheet: {ws.title}]")
|
|
1388
|
+
rows_all.extend(sheet_rows)
|
|
1389
|
+
if char_count > 200_000:
|
|
1390
|
+
break
|
|
1240
1391
|
meta["sheets"] = len(wb.worksheets)
|
|
1392
|
+
meta["rows"] = non_empty_rows
|
|
1393
|
+
meta["cells"] = non_empty_cells
|
|
1241
1394
|
text = "\n".join(rows_all)
|
|
1242
1395
|
elif ext == ".pptx":
|
|
1243
1396
|
from pptx import Presentation
|
|
@@ -1247,9 +1400,13 @@ class KnowledgeGraphStore:
|
|
|
1247
1400
|
parts = []
|
|
1248
1401
|
for shape in slide.shapes:
|
|
1249
1402
|
if getattr(shape, "has_text_frame", False):
|
|
1250
|
-
|
|
1251
|
-
|
|
1403
|
+
slide_text = shape.text_frame.text.strip()
|
|
1404
|
+
if slide_text:
|
|
1405
|
+
parts.append(slide_text)
|
|
1406
|
+
if parts:
|
|
1407
|
+
slides_text.append(f"[Slide {index}]\n" + "\n".join(parts))
|
|
1252
1408
|
meta["slides"] = len(prs.slides)
|
|
1409
|
+
meta["text_slides"] = len(slides_text)
|
|
1253
1410
|
text = "\n\n".join(slides_text)
|
|
1254
1411
|
elif category == "image":
|
|
1255
1412
|
from PIL import Image
|
|
@@ -1362,13 +1519,13 @@ class KnowledgeGraphStore:
|
|
|
1362
1519
|
extension=excluded.extension,
|
|
1363
1520
|
size_bytes=excluded.size_bytes,
|
|
1364
1521
|
modified_at=excluded.modified_at,
|
|
1365
|
-
sha256=
|
|
1522
|
+
sha256=excluded.sha256,
|
|
1366
1523
|
last_scanned_at=excluded.last_scanned_at,
|
|
1367
|
-
last_indexed_at=
|
|
1524
|
+
last_indexed_at=excluded.last_indexed_at,
|
|
1368
1525
|
parser_type=excluded.parser_type,
|
|
1369
1526
|
status=excluded.status,
|
|
1370
1527
|
error_message=excluded.error_message,
|
|
1371
|
-
graph_node_id=
|
|
1528
|
+
graph_node_id=excluded.graph_node_id,
|
|
1372
1529
|
deleted=excluded.deleted,
|
|
1373
1530
|
metadata_json=excluded.metadata_json
|
|
1374
1531
|
""",
|
|
@@ -1381,6 +1538,113 @@ class KnowledgeGraphStore:
|
|
|
1381
1538
|
)
|
|
1382
1539
|
return index_id
|
|
1383
1540
|
|
|
1541
|
+
def _delete_local_file_graph(self, conn: sqlite3.Connection, file_node_id: Optional[str]) -> None:
    """Remove a local file's node and the graph data derived from it.

    Deletes, in order:
      1. the file node itself with its chunks and every edge touching it;
      2. nodes the file points to that it owns (types ``Chunk``,
         ``ImageText``, ``Section``, or metadata ``source_node`` equal to
         the file node id);
      3. auto-extracted ``local_folder`` nodes the file pointed to, but only
         those whose remaining edges connect exclusively to other such
         candidates (i.e. nothing else in the graph still references them);
      4. folder/drive/computer nodes left without children, when the file
         node's metadata carries a ``source_id``.

    Args:
        conn: Open connection whose rows support access by column name.
        file_node_id: Id of the file node; a falsy value is a no-op.
    """
    if not file_node_id:
        return

    def as_dict(value: Any) -> Dict[str, Any]:
        # Metadata is decoded JSON; tolerate anything that is not an object.
        return value if isinstance(value, dict) else {}

    file_row = conn.execute(
        "SELECT metadata_json FROM nodes WHERE id=?",
        (file_node_id,),
    ).fetchone()
    source_id = as_dict(_safe_loads(file_row["metadata_json"])).get("source_id") if file_row else None

    # Classify the file's outgoing neighbours before any edge is removed.
    linked_rows = conn.execute(
        """
        SELECT n.id, n.type, n.metadata_json
        FROM edges e
        JOIN nodes n ON n.id=e.to_node
        WHERE e.from_node=?
        """,
        (file_node_id,),
    ).fetchall()
    owned_ids: set = set()
    auto_candidate_ids: set = set()
    for row in linked_rows:
        metadata = as_dict(_safe_loads(row["metadata_json"]))
        if row["type"] in {"Chunk", "ImageText", "Section"} or metadata.get("source_node") == file_node_id:
            owned_ids.add(row["id"])
        elif metadata.get("auto_extracted") and metadata.get("source") == "local_folder":
            auto_candidate_ids.add(row["id"])

    # The edge DELETE binds every id twice, so 400 ids per statement stays
    # below SQLite's historical default of 999 bound parameters.
    batch_size = 400

    def delete_nodes(node_ids: set) -> None:
        ids = list(node_ids)
        for start in range(0, len(ids), batch_size):
            batch = ids[start:start + batch_size]
            placeholders = ",".join("?" * len(batch))
            conn.execute(f"DELETE FROM chunks WHERE source_node IN ({placeholders})", batch)
            conn.execute(
                f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})",
                batch * 2,
            )
            conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", batch)

    delete_nodes({file_node_id})
    delete_nodes(owned_ids)

    # An auto-extracted node survives if any remaining edge links it to a
    # node outside the candidate set.
    removable_auto_ids: set = set()
    for node_id in auto_candidate_ids:
        remaining_edges = conn.execute(
            "SELECT from_node, to_node FROM edges WHERE from_node=? OR to_node=?",
            (node_id, node_id),
        ).fetchall()
        if all(
            edge["from_node"] in auto_candidate_ids and edge["to_node"] in auto_candidate_ids
            for edge in remaining_edges
        ):
            removable_auto_ids.add(node_id)
    delete_nodes(removable_auto_ids)

    if source_id:
        self._cleanup_local_graph_orphans(conn, str(source_id))
|
|
1600
|
+
|
|
1601
|
+
def _cleanup_local_graph_orphans(self, conn: sqlite3.Connection, source_id: str) -> None:
    """Prune container nodes that no longer have outgoing edges.

    ``Folder`` nodes whose metadata ``source_id`` equals ``source_id`` are
    removed repeatedly until none of them is childless, so emptied folder
    chains collapse bottom-up. Afterwards childless ``Drive`` nodes and then
    childless ``Computer`` nodes are removed; that second pass is not
    filtered by ``source_id``.
    """

    def has_outgoing(node_id) -> bool:
        hit = conn.execute(
            "SELECT 1 FROM edges WHERE from_node=? LIMIT 1",
            (node_id,),
        ).fetchone()
        return bool(hit)

    def purge(ids: list) -> None:
        placeholders = ",".join("?" * len(ids))
        conn.execute(f"DELETE FROM edges WHERE from_node IN ({placeholders}) OR to_node IN ({placeholders})", ids * 2)
        conn.execute(f"DELETE FROM nodes WHERE id IN ({placeholders})", ids)

    # Each pass may expose new childless folders one level up.
    while True:
        folders = conn.execute(
            "SELECT id, metadata_json FROM nodes WHERE type='Folder'"
        ).fetchall()
        empty_folders = [
            folder["id"]
            for folder in folders
            if _safe_loads(folder["metadata_json"]).get("source_id") == source_id
            and not has_outgoing(folder["id"])
        ]
        if not empty_folders:
            break
        purge(empty_folders)

    # Drives first, so a computer that only held removed drives goes next.
    for node_type in ("Drive", "Computer"):
        candidates = conn.execute("SELECT id FROM nodes WHERE type=?", (node_type,)).fetchall()
        childless = [node["id"] for node in candidates if not has_outgoing(node["id"])]
        if childless:
            purge(childless)
|
|
1637
|
+
|
|
1638
|
+
def _local_file_index_has_extracted_text(self, row: sqlite3.Row) -> bool:
    """Tell whether an index row recorded a positive extracted-text length.

    Reads ``parser.extracted_chars`` from the row's ``metadata_json`` and
    returns ``True`` only when it converts to an integer greater than zero.
    A missing or malformed value counts as "no text".
    """
    metadata = _safe_loads(row["metadata_json"])
    parser = metadata.get("parser") if isinstance(metadata, dict) else {}
    if isinstance(parser, dict):
        try:
            extracted = int(parser.get("extracted_chars") or 0)
        except (TypeError, ValueError):
            extracted = 0
        return extracted > 0
    return False
|
|
1647
|
+
|
|
1384
1648
|
def _upsert_local_file_node(
|
|
1385
1649
|
self,
|
|
1386
1650
|
conn: sqlite3.Connection,
|
|
@@ -1397,6 +1661,9 @@ class KnowledgeGraphStore:
|
|
|
1397
1661
|
text: str,
|
|
1398
1662
|
parser_meta: Dict[str, Any],
|
|
1399
1663
|
) -> str:
|
|
1664
|
+
text = _clean_text(text)
|
|
1665
|
+
if not text:
|
|
1666
|
+
raise ValueError("텍스트 추출 결과가 비어 있습니다.")
|
|
1400
1667
|
try:
|
|
1401
1668
|
relative_path = file_path.relative_to(root).as_posix()
|
|
1402
1669
|
except ValueError:
|
|
@@ -1446,7 +1713,7 @@ class KnowledgeGraphStore:
|
|
|
1446
1713
|
file_node_id,
|
|
1447
1714
|
_node_type_for_category(category),
|
|
1448
1715
|
file_path.name,
|
|
1449
|
-
summary=
|
|
1716
|
+
summary=text[:700],
|
|
1450
1717
|
metadata=metadata,
|
|
1451
1718
|
raw=metadata,
|
|
1452
1719
|
)
|
|
@@ -1488,7 +1755,7 @@ class KnowledgeGraphStore:
|
|
|
1488
1755
|
)
|
|
1489
1756
|
self._upsert_edge(conn, file_node_id, chunk_id, "포함함", weight=0.7, metadata={"source": "local_scan"})
|
|
1490
1757
|
|
|
1491
|
-
concepts = _extract_concepts(
|
|
1758
|
+
concepts = _extract_concepts(target_for_concepts, limit=18)
|
|
1492
1759
|
concept_ids: Dict[str, str] = {}
|
|
1493
1760
|
for concept in concepts:
|
|
1494
1761
|
node_t = _classify_node_type(concept, target_for_concepts)
|
|
@@ -1620,10 +1887,21 @@ class KnowledgeGraphStore:
|
|
|
1620
1887
|
except ValueError:
|
|
1621
1888
|
relative_path = file_path.name
|
|
1622
1889
|
seen_relative_paths.add(relative_path)
|
|
1890
|
+
modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
|
|
1891
|
+
existing = conn.execute(
|
|
1892
|
+
"""
|
|
1893
|
+
SELECT size_bytes, modified_at, sha256, graph_node_id, status, metadata_json
|
|
1894
|
+
FROM local_file_index
|
|
1895
|
+
WHERE source_id=? AND relative_path=?
|
|
1896
|
+
""",
|
|
1897
|
+
(source_id, relative_path),
|
|
1898
|
+
).fetchone()
|
|
1623
1899
|
decision = self._local_file_decision(file_path, root, stat)
|
|
1624
1900
|
parser_type = decision["parser_type"]
|
|
1625
1901
|
if not decision["indexable"]:
|
|
1626
1902
|
counts[decision["status"]] += 1
|
|
1903
|
+
if existing and existing["graph_node_id"]:
|
|
1904
|
+
self._delete_local_file_graph(conn, existing["graph_node_id"])
|
|
1627
1905
|
self._upsert_local_file_index(
|
|
1628
1906
|
conn,
|
|
1629
1907
|
source_id=source_id,
|
|
@@ -1638,19 +1916,11 @@ class KnowledgeGraphStore:
|
|
|
1638
1916
|
)
|
|
1639
1917
|
continue
|
|
1640
1918
|
|
|
1641
|
-
modified_at = _safe_iso_from_stat_mtime(stat.st_mtime)
|
|
1642
|
-
existing = conn.execute(
|
|
1643
|
-
"""
|
|
1644
|
-
SELECT size_bytes, modified_at, sha256, graph_node_id, status
|
|
1645
|
-
FROM local_file_index
|
|
1646
|
-
WHERE source_id=? AND relative_path=?
|
|
1647
|
-
""",
|
|
1648
|
-
(source_id, relative_path),
|
|
1649
|
-
).fetchone()
|
|
1650
1919
|
if (
|
|
1651
1920
|
existing
|
|
1652
1921
|
and existing["status"] == "indexed"
|
|
1653
1922
|
and existing["graph_node_id"]
|
|
1923
|
+
and self._local_file_index_has_extracted_text(existing)
|
|
1654
1924
|
and existing["size_bytes"] == stat.st_size
|
|
1655
1925
|
and existing["modified_at"] == modified_at
|
|
1656
1926
|
):
|
|
@@ -1667,7 +1937,7 @@ class KnowledgeGraphStore:
|
|
|
1667
1937
|
parser_type=parser_type,
|
|
1668
1938
|
sha256=existing["sha256"],
|
|
1669
1939
|
graph_node_id=existing["graph_node_id"],
|
|
1670
|
-
metadata={"category": decision["category"], "unchanged": True},
|
|
1940
|
+
metadata={**_safe_loads(existing["metadata_json"]), "category": decision["category"], "unchanged": True},
|
|
1671
1941
|
)
|
|
1672
1942
|
continue
|
|
1673
1943
|
|
|
@@ -1677,6 +1947,8 @@ class KnowledgeGraphStore:
|
|
|
1677
1947
|
except Exception as exc:
|
|
1678
1948
|
counts["failed"] += 1
|
|
1679
1949
|
errors.append({"path": str(file_path), "error": str(exc)})
|
|
1950
|
+
if existing and existing["graph_node_id"]:
|
|
1951
|
+
self._delete_local_file_graph(conn, existing["graph_node_id"])
|
|
1680
1952
|
self._upsert_local_file_index(
|
|
1681
1953
|
conn,
|
|
1682
1954
|
source_id=source_id,
|
|
@@ -1692,7 +1964,12 @@ class KnowledgeGraphStore:
|
|
|
1692
1964
|
)
|
|
1693
1965
|
continue
|
|
1694
1966
|
|
|
1695
|
-
if
|
|
1967
|
+
if (
|
|
1968
|
+
existing
|
|
1969
|
+
and existing["sha256"] == digest
|
|
1970
|
+
and existing["graph_node_id"]
|
|
1971
|
+
and self._local_file_index_has_extracted_text(existing)
|
|
1972
|
+
):
|
|
1696
1973
|
counts["skipped_unchanged"] += 1
|
|
1697
1974
|
self._upsert_local_file_index(
|
|
1698
1975
|
conn,
|
|
@@ -1706,7 +1983,7 @@ class KnowledgeGraphStore:
|
|
|
1706
1983
|
parser_type=parser_type,
|
|
1707
1984
|
sha256=digest,
|
|
1708
1985
|
graph_node_id=existing["graph_node_id"],
|
|
1709
|
-
metadata={"category": decision["category"], "sha256_unchanged": True},
|
|
1986
|
+
metadata={**_safe_loads(existing["metadata_json"]), "category": decision["category"], "sha256_unchanged": True},
|
|
1710
1987
|
)
|
|
1711
1988
|
continue
|
|
1712
1989
|
|
|
@@ -1716,6 +1993,27 @@ class KnowledgeGraphStore:
|
|
|
1716
1993
|
decision["category"],
|
|
1717
1994
|
include_ocr=include_ocr,
|
|
1718
1995
|
)
|
|
1996
|
+
text = _clean_text(text)
|
|
1997
|
+
parser_meta = {**parser_meta, "extracted_chars": len(text)}
|
|
1998
|
+
if not text:
|
|
1999
|
+
counts["skipped_empty_text"] += 1
|
|
2000
|
+
if existing and existing["graph_node_id"]:
|
|
2001
|
+
self._delete_local_file_graph(conn, existing["graph_node_id"])
|
|
2002
|
+
self._upsert_local_file_index(
|
|
2003
|
+
conn,
|
|
2004
|
+
source_id=source_id,
|
|
2005
|
+
root=root,
|
|
2006
|
+
file_path=file_path,
|
|
2007
|
+
stat=stat,
|
|
2008
|
+
os_type=os_type,
|
|
2009
|
+
drive_id=drive_id,
|
|
2010
|
+
status="skipped_empty_text",
|
|
2011
|
+
parser_type=parser_type,
|
|
2012
|
+
sha256=digest,
|
|
2013
|
+
error_message="텍스트 추출 결과가 비어 있습니다.",
|
|
2014
|
+
metadata={"category": decision["category"], "parser": parser_meta},
|
|
2015
|
+
)
|
|
2016
|
+
continue
|
|
1719
2017
|
graph_node_id = self._upsert_local_file_node(
|
|
1720
2018
|
conn,
|
|
1721
2019
|
source_id=source_id,
|
|
@@ -1749,6 +2047,8 @@ class KnowledgeGraphStore:
|
|
|
1749
2047
|
except Exception as exc:
|
|
1750
2048
|
counts["failed"] += 1
|
|
1751
2049
|
errors.append({"path": str(file_path), "error": str(exc)})
|
|
2050
|
+
if existing and existing["graph_node_id"]:
|
|
2051
|
+
self._delete_local_file_graph(conn, existing["graph_node_id"])
|
|
1752
2052
|
self._upsert_local_file_index(
|
|
1753
2053
|
conn,
|
|
1754
2054
|
source_id=source_id,
|
|
@@ -1765,19 +2065,20 @@ class KnowledgeGraphStore:
|
|
|
1765
2065
|
)
|
|
1766
2066
|
|
|
1767
2067
|
if not limit_reached:
|
|
1768
|
-
|
|
1769
|
-
row["relative_path"]
|
|
2068
|
+
existing_rows = {
|
|
2069
|
+
row["relative_path"]: row["graph_node_id"]
|
|
1770
2070
|
for row in conn.execute(
|
|
1771
|
-
"SELECT relative_path FROM local_file_index WHERE source_id=?",
|
|
2071
|
+
"SELECT relative_path, graph_node_id FROM local_file_index WHERE source_id=?",
|
|
1772
2072
|
(source_id,),
|
|
1773
2073
|
)
|
|
1774
2074
|
}
|
|
1775
|
-
deleted_paths =
|
|
2075
|
+
deleted_paths = set(existing_rows) - seen_relative_paths
|
|
1776
2076
|
for relative_path in deleted_paths:
|
|
2077
|
+
self._delete_local_file_graph(conn, existing_rows.get(relative_path))
|
|
1777
2078
|
conn.execute(
|
|
1778
2079
|
"""
|
|
1779
2080
|
UPDATE local_file_index
|
|
1780
|
-
SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL
|
|
2081
|
+
SET status='deleted', deleted=1, last_scanned_at=?, error_message=NULL, graph_node_id=NULL
|
|
1781
2082
|
WHERE source_id=? AND relative_path=?
|
|
1782
2083
|
""",
|
|
1783
2084
|
(_now(), source_id, relative_path),
|
|
@@ -2639,3 +2940,170 @@ class KnowledgeGraphStore:
|
|
|
2639
2940
|
"local_file_status": local_file_status,
|
|
2640
2941
|
"v2": v2,
|
|
2641
2942
|
}
|
|
2943
|
+
|
|
2944
|
+
def search_for_document_generation(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Hybrid retrieval optimized for document generation.

    Candidates are nodes of content-bearing types whose title, summary or
    metadata contain the whole query or one of its topic terms (literal
    substring match; ``%``, ``_`` and ``\\`` in the input are escaped).

    Scoring:
        ``(0.5*text_relevance + 0.3*graph_relationship + 0.2*recency)``
        multiplied by 1.2 for ``Document``, ``File``, ``SlideDeck`` and
        ``Decision`` nodes, so the final score may exceed 1.0.

    Args:
        query: Free-text query; blank input returns an empty list.
        limit: Maximum number of results, clamped to ``1..50``. Values that
            cannot be converted to an integer fall back to 10.

    Returns:
        Result dicts sorted by ``hybrid_score`` (descending). Each carries
        the node fields, the component ``scores`` and up to eight
        ``related_concepts`` (neighbouring Concept/Feature/Decision/Task
        nodes).
    """
    query = str(query or "").strip()
    if not query:
        return []
    try:
        limit = int(limit or 10)
    except (TypeError, ValueError):
        limit = 10
    limit = max(1, min(limit, 50))
    terms = _topic_candidates(query, limit=12)
    # With no topic terms every candidate matched the whole query, so score
    # against that instead of giving all of them a text score of 0.
    score_terms = [term.lower() for term in terms] or [query.lower()]
    now = datetime.now()

    candidate_sql = """
        SELECT id, type, title, summary, metadata_json, updated_at
        FROM nodes
        WHERE (title LIKE ? ESCAPE '\\' OR summary LIKE ? ESCAPE '\\' OR metadata_json LIKE ? ESCAPE '\\')
          AND type IN ('Document', 'File', 'CodeFile', 'SlideDeck',
                       'Spreadsheet', 'Image', 'ImageText', 'Chat',
                       'Decision', 'Task', 'Concept', 'Feature',
                       'Page', 'Slide')
        ORDER BY updated_at DESC
        LIMIT ?
    """

    def like_pattern(needle: str) -> str:
        # Escape LIKE metacharacters so user text is matched literally.
        escaped = needle.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    with self._connect() as conn:
        # Whole-query matches get a wider window than single-term matches;
        # the dict keeps first-seen order while de-duplicating by node id.
        candidates: Dict[Any, Any] = {}
        searches = [(query, limit * 5)] + [(term, limit * 3) for term in terms]
        for needle, cap in searches:
            pattern = like_pattern(needle)
            for row in conn.execute(candidate_sql, (pattern, pattern, pattern, cap)):
                candidates.setdefault(row["id"], row)

        scored_results = []
        for row in candidates.values():
            # NULL columns must not inject the literal text "None".
            haystack = " ".join(
                str(row[column] or "") for column in ("title", "summary", "metadata_json")
            ).lower()
            text_hits = sum(1 for term in score_terms if term in haystack)
            text_score = min(1.0, text_hits / len(score_terms))

            edge_count = conn.execute(
                "SELECT COUNT(*) AS c FROM edges WHERE from_node=? OR to_node=?",
                (row["id"], row["id"]),
            ).fetchone()["c"]
            # log1p damps hubs; the score saturates at 1.0 for high degrees.
            graph_score = min(1.0, math.log1p(edge_count) / 4.0)

            recency = _recency_score(row["updated_at"], now=now, half_life_days=14.0)

            doc_type_boost = 1.2 if row["type"] in (
                "Document", "File", "SlideDeck", "Decision",
            ) else 1.0

            hybrid_score = (
                0.5 * text_score
                + 0.3 * graph_score
                + 0.2 * recency
            ) * doc_type_boost

            neighbor_rows = conn.execute(
                """
                SELECT n.title, n.type FROM edges e
                JOIN nodes n ON n.id = CASE WHEN e.from_node = ? THEN e.to_node ELSE e.from_node END
                WHERE (e.from_node = ? OR e.to_node = ?)
                  AND n.type IN ('Concept', 'Feature', 'Decision', 'Task')
                LIMIT 8
                """,
                (row["id"], row["id"], row["id"]),
            ).fetchall()

            scored_results.append({
                "id": row["id"],
                "type": row["type"],
                "title": row["title"],
                "summary": row["summary"],
                "metadata": _safe_loads(row["metadata_json"]),
                "updated_at": row["updated_at"],
                "hybrid_score": round(hybrid_score, 4),
                "scores": {
                    "text": round(text_score, 4),
                    "graph": round(graph_score, 4),
                    "recency": round(recency, 4),
                },
                "related_concepts": [
                    {"title": neighbor["title"], "type": neighbor["type"]}
                    for neighbor in neighbor_rows
                ],
            })

    scored_results.sort(key=lambda item: item["hybrid_score"], reverse=True)
    return scored_results[:limit]
|
|
3061
|
+
|
|
3062
|
+
def multi_hop_context(self, node_ids: List[str], max_hops: int = 2) -> Dict[str, Any]:
    """Collect the graph neighbourhood around seed nodes, breadth first.

    Seeds are reported at hop 0 and every node reachable within
    ``max_hops`` edges is reported with its hop distance. Nodes at the
    boundary (hop == ``max_hops``) are included but not expanded, so every
    returned edge connects two nodes that were looked up.

    Args:
        node_ids: Seed node ids; duplicates and falsy entries are ignored.
        max_hops: Number of edge expansions to perform (negative or falsy
            values are treated as 0, which returns only the seeds).

    Returns:
        ``{"nodes": [...], "edges": [...]}`` where each node has ``id``,
        ``type``, ``title``, ``summary``, ``metadata`` and ``hop``, and each
        edge has ``from``, ``to``, ``type`` and ``weight``. Ids with no row
        in ``nodes`` are skipped. Order follows discovery order.
    """
    hops = max(0, int(max_hops or 0))
    all_nodes: List[Dict[str, Any]] = []
    all_edges: List[Dict[str, Any]] = []
    visited_nodes: set = set()
    visited_edges: set = set()
    # dict.fromkeys de-duplicates while keeping caller order, which makes
    # the traversal (and therefore the output) deterministic.
    frontier = list(dict.fromkeys(nid for nid in (node_ids or []) if nid))

    with self._connect() as conn:
        for hop in range(hops + 1):
            if not frontier:
                break
            next_frontier: Dict[Any, None] = {}
            for nid in frontier:
                if nid in visited_nodes:
                    continue
                visited_nodes.add(nid)
                row = conn.execute(
                    "SELECT id, type, title, summary, metadata_json, updated_at FROM nodes WHERE id=?",
                    (nid,),
                ).fetchone()
                if row:
                    all_nodes.append({
                        "id": row["id"], "type": row["type"],
                        "title": row["title"], "summary": row["summary"],
                        "metadata": _safe_loads(row["metadata_json"]),
                        "hop": hop,
                    })
                if hop == hops:
                    # Boundary node: report it, but do not walk further.
                    continue
                edge_rows = conn.execute(
                    """
                    SELECT id, from_node, to_node, type, weight
                    FROM edges WHERE from_node=? OR to_node=?
                    """,
                    (nid, nid),
                ).fetchall()
                for er in edge_rows:
                    if er["id"] not in visited_edges:
                        visited_edges.add(er["id"])
                        all_edges.append({
                            "from": er["from_node"], "to": er["to_node"],
                            "type": er["type"], "weight": er["weight"],
                        })
                    other = er["to_node"] if er["from_node"] == nid else er["from_node"]
                    if other not in visited_nodes:
                        next_frontier[other] = None
            frontier = list(next_frontier)

    return {"nodes": all_nodes, "edges": all_edges}
|