ltcai 3.4.1 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +206 -247
- package/docs/CARRYOVER_AUDIT_v3.6.0.md +61 -0
- package/docs/CHANGELOG.md +32 -0
- package/docs/HANDOVER_v3.6.0.md +46 -0
- package/docs/RUNTIME_HOOK_COVERAGE_v3.5.0.md +56 -0
- package/docs/RUNTIME_HOOK_COVERAGE_v3.6.0.md +49 -0
- package/docs/architecture.md +13 -12
- package/docs/kg-schema.md +55 -0
- package/docs/privacy.md +18 -2
- package/docs/security-model.md +17 -0
- package/kg_schema.py +46 -0
- package/knowledge_graph.py +520 -1
- package/latticeai/__init__.py +1 -1
- package/latticeai/api/auth.py +37 -9
- package/latticeai/api/browser.py +217 -0
- package/latticeai/api/chat.py +4 -1
- package/latticeai/api/computer_use.py +21 -8
- package/latticeai/api/portability.py +93 -0
- package/latticeai/api/tools.py +29 -26
- package/latticeai/core/config.py +3 -0
- package/latticeai/core/marketplace.py +1 -1
- package/latticeai/core/multi_agent.py +1 -1
- package/latticeai/core/oidc.py +205 -0
- package/latticeai/core/security.py +59 -5
- package/latticeai/core/workspace_os.py +1 -1
- package/latticeai/server_app.py +39 -0
- package/latticeai/services/ingestion.py +271 -0
- package/latticeai/services/kg_portability.py +177 -0
- package/package.json +5 -4
- package/requirements.txt +1 -0
- package/scripts/build_vsix.mjs +72 -0
- package/scripts/check_python.py +87 -0
- package/static/css/reference/account.css +1 -1
- package/static/css/reference/admin.css +1 -1
- package/static/css/reference/base.css +8 -5
- package/static/css/reference/chat.css +8 -8
- package/static/css/reference/graph.css +2 -2
- package/static/css/responsive.css +2 -2
- package/static/v3/asset-manifest.json +9 -9
- package/static/v3/css/{lattice.shell.6ceea7c8.css → lattice.shell.8fcc9d33.css} +2 -1
- package/static/v3/css/lattice.shell.css +2 -1
- package/static/v3/js/{app.d086489d.js → app.c541f955.js} +1 -1
- package/static/v3/js/core/{api.12b568ad.js → api.33d6320e.js} +38 -0
- package/static/v3/js/core/api.js +38 -0
- package/static/v3/js/core/{routes.d214b399.js → routes.2ce3815a.js} +1 -1
- package/static/v3/js/core/routes.js +1 -1
- package/static/v3/js/core/{shell.d05266f5.js → shell.8c163e0e.js} +2 -2
- package/static/v3/js/views/knowledge-graph.a96040a5.js +513 -0
- package/static/v3/js/views/knowledge-graph.js +293 -17
- package/static/workspace.css +1 -1
- package/tools/__init__.py +276 -0
- package/tools/commands.py +188 -0
- package/tools/computer.py +185 -0
- package/tools/documents.py +243 -0
- package/tools/filesystem.py +560 -0
- package/tools/knowledge.py +97 -0
- package/tools/local_files.py +69 -0
- package/tools/network.py +66 -0
- package/static/v3/js/views/knowledge-graph.a14ea7e7.js +0 -237
- package/tools.py +0 -1525
package/knowledge_graph.py
CHANGED
|
@@ -942,6 +942,31 @@ class KnowledgeGraphStore:
|
|
|
942
942
|
error_message TEXT,
|
|
943
943
|
metadata_json TEXT NOT NULL CHECK (json_valid(metadata_json))
|
|
944
944
|
);
|
|
945
|
+
-- v3.6.0 Knowledge Graph First: per-ingestion provenance trail.
|
|
946
|
+
-- Append-only audit of where every graph node came from, when it
|
|
947
|
+
-- was captured, how it was processed, and whether it was embedded /
|
|
948
|
+
-- linked / used by an agent. get_provenance() returns the latest row.
|
|
949
|
+
CREATE TABLE IF NOT EXISTS ingestion_provenance (
|
|
950
|
+
id TEXT PRIMARY KEY,
|
|
951
|
+
node_id TEXT NOT NULL,
|
|
952
|
+
source_type TEXT NOT NULL,
|
|
953
|
+
source_uri TEXT,
|
|
954
|
+
content_hash TEXT,
|
|
955
|
+
title TEXT,
|
|
956
|
+
pipeline TEXT NOT NULL,
|
|
957
|
+
owner TEXT,
|
|
958
|
+
workspace_id TEXT,
|
|
959
|
+
captured_at TEXT,
|
|
960
|
+
modified_at TEXT,
|
|
961
|
+
embedded INTEGER NOT NULL DEFAULT 0,
|
|
962
|
+
linked INTEGER NOT NULL DEFAULT 0,
|
|
963
|
+
duplicate INTEGER NOT NULL DEFAULT 0,
|
|
964
|
+
agent_used TEXT,
|
|
965
|
+
chunk_count INTEGER NOT NULL DEFAULT 0,
|
|
966
|
+
permissions_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(permissions_json)),
|
|
967
|
+
metadata_json TEXT NOT NULL DEFAULT '{}' CHECK (json_valid(metadata_json)),
|
|
968
|
+
created_at TEXT NOT NULL
|
|
969
|
+
);
|
|
945
970
|
CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(type);
|
|
946
971
|
CREATE INDEX IF NOT EXISTS idx_edges_from ON edges(from_node);
|
|
947
972
|
CREATE INDEX IF NOT EXISTS idx_edges_to ON edges(to_node);
|
|
@@ -954,6 +979,10 @@ class KnowledgeGraphStore:
|
|
|
954
979
|
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_source ON vector_embeddings(source_node);
|
|
955
980
|
CREATE INDEX IF NOT EXISTS idx_vector_embeddings_model ON vector_embeddings(embedding_model);
|
|
956
981
|
CREATE INDEX IF NOT EXISTS idx_vector_index_operations_requested ON vector_index_operations(requested_at);
|
|
982
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_node ON ingestion_provenance(node_id);
|
|
983
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_source_type ON ingestion_provenance(source_type);
|
|
984
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_hash ON ingestion_provenance(content_hash);
|
|
985
|
+
CREATE INDEX IF NOT EXISTS idx_provenance_created ON ingestion_provenance(created_at);
|
|
957
986
|
"""
|
|
958
987
|
)
|
|
959
988
|
conn.execute(
|
|
@@ -2703,12 +2732,20 @@ class KnowledgeGraphStore:
|
|
|
2703
2732
|
uploader: Optional[str] = None,
|
|
2704
2733
|
conversation_id: Optional[str] = None,
|
|
2705
2734
|
extracted: Optional[Dict[str, Any]] = None,
|
|
2735
|
+
source_type: Optional[str] = None,
|
|
2736
|
+
source_uri: Optional[str] = None,
|
|
2737
|
+
captured_at: Optional[str] = None,
|
|
2738
|
+
modified_at: Optional[str] = None,
|
|
2739
|
+
owner: Optional[str] = None,
|
|
2740
|
+
workspace_id: Optional[str] = None,
|
|
2741
|
+
permissions: Optional[Dict[str, Any]] = None,
|
|
2706
2742
|
) -> Dict[str, Any]:
|
|
2707
2743
|
path = Path(path)
|
|
2708
2744
|
data = path.read_bytes()
|
|
2709
2745
|
digest = _sha256_bytes(data)
|
|
2710
2746
|
ext = path.suffix.lower()
|
|
2711
2747
|
filename = original_filename or path.name
|
|
2748
|
+
captured_at = captured_at or _now()
|
|
2712
2749
|
blob_path = self.blob_dir / digest[:2] / f"{digest}{ext}"
|
|
2713
2750
|
blob_path.parent.mkdir(parents=True, exist_ok=True)
|
|
2714
2751
|
if not blob_path.exists():
|
|
@@ -2723,8 +2760,16 @@ class KnowledgeGraphStore:
|
|
|
2723
2760
|
"mime_type": mime_type,
|
|
2724
2761
|
"bytes": len(data),
|
|
2725
2762
|
"sha256": digest,
|
|
2763
|
+
"content_hash": digest,
|
|
2726
2764
|
"blob_path": str(blob_path),
|
|
2727
2765
|
"uploader": uploader,
|
|
2766
|
+
"owner": owner or uploader,
|
|
2767
|
+
"workspace_id": workspace_id,
|
|
2768
|
+
"permissions": permissions or {},
|
|
2769
|
+
"source_type": source_type or "file",
|
|
2770
|
+
"source_uri": source_uri or str(path),
|
|
2771
|
+
"captured_at": captured_at,
|
|
2772
|
+
"modified_at": modified_at,
|
|
2728
2773
|
"conversation_id": conversation_id,
|
|
2729
2774
|
"extracted": {k: v for k, v in (extracted or {}).items() if k != "content"},
|
|
2730
2775
|
"structure": doc_meta,
|
|
@@ -2732,8 +2777,11 @@ class KnowledgeGraphStore:
|
|
|
2732
2777
|
full_text = f"{filename}\n{text}"
|
|
2733
2778
|
concepts = _extract_concepts(full_text, limit=15)
|
|
2734
2779
|
triples = _extract_triples(full_text, concepts)
|
|
2780
|
+
chunk_ids: List[str] = []
|
|
2781
|
+
source_node_id: Optional[str] = None
|
|
2735
2782
|
|
|
2736
2783
|
with self._connect() as conn:
|
|
2784
|
+
duplicate = self._node_exists(conn, file_id)
|
|
2737
2785
|
# ── Document 노드 (점: 명사 — 파일) ────────────────────────────────
|
|
2738
2786
|
self._upsert_node(
|
|
2739
2787
|
conn, file_id, "Document", filename,
|
|
@@ -2742,6 +2790,15 @@ class KnowledgeGraphStore:
|
|
|
2742
2790
|
)
|
|
2743
2791
|
self._ingest_structure_nodes(conn, file_id, filename, doc_meta)
|
|
2744
2792
|
|
|
2793
|
+
# ── SOURCE 노드 + indexed_from (v3.6.0, source_type 지정 시) ──────
|
|
2794
|
+
if source_type:
|
|
2795
|
+
source_node_id = self._attach_source_node(
|
|
2796
|
+
conn, file_id,
|
|
2797
|
+
source_type=source_type, source_uri=source_uri or str(path),
|
|
2798
|
+
title=filename, content_hash=digest, captured_at=captured_at,
|
|
2799
|
+
extra={"owner": owner or uploader, "workspace_id": workspace_id, "ext": ext},
|
|
2800
|
+
)
|
|
2801
|
+
|
|
2745
2802
|
# ── Person 노드 + 동사형 엣지 ─────────────────────────────────────
|
|
2746
2803
|
if uploader:
|
|
2747
2804
|
person_id = f"person:{_slug(uploader)}"
|
|
@@ -2762,6 +2819,7 @@ class KnowledgeGraphStore:
|
|
|
2762
2819
|
# ── RAG chunks (검색용, 그래프 비표시) ────────────────────────────
|
|
2763
2820
|
for index, chunk in enumerate(_chunks(text)):
|
|
2764
2821
|
chunk_id = f"chunk:{_sha256_text(f'{file_id}:{index}:{chunk}')[:24]}"
|
|
2822
|
+
chunk_ids.append(chunk_id)
|
|
2765
2823
|
self._upsert_node(
|
|
2766
2824
|
conn, chunk_id, "Chunk",
|
|
2767
2825
|
f"{filename} chunk {index + 1}",
|
|
@@ -2816,7 +2874,18 @@ class KnowledgeGraphStore:
|
|
|
2816
2874
|
# 선: Document가 Task/Decision을 "포함함"
|
|
2817
2875
|
self._upsert_edge(conn, file_id, sem_id, "포함함", weight=0.9)
|
|
2818
2876
|
|
|
2819
|
-
return {
|
|
2877
|
+
return {
|
|
2878
|
+
"node_id": file_id,
|
|
2879
|
+
"type": "Document",
|
|
2880
|
+
"sha256": digest,
|
|
2881
|
+
"content_hash": digest,
|
|
2882
|
+
"source_node_id": source_node_id,
|
|
2883
|
+
"chunk_ids": chunk_ids,
|
|
2884
|
+
"chunk_count": len(chunk_ids),
|
|
2885
|
+
"duplicate": duplicate,
|
|
2886
|
+
"captured_at": captured_at,
|
|
2887
|
+
"metadata": metadata,
|
|
2888
|
+
}
|
|
2820
2889
|
|
|
2821
2890
|
def ingest_event(
|
|
2822
2891
|
self,
|
|
@@ -2854,6 +2923,449 @@ class KnowledgeGraphStore:
|
|
|
2854
2923
|
self._upsert_edge(conn, person_id, event_id, "triggered", metadata={"event_type": event_type})
|
|
2855
2924
|
return {"node_id": event_id, "type": event_type}
|
|
2856
2925
|
|
|
2926
|
+
# ── v3.6.0 Knowledge Graph First: unified source ingestion + provenance ──────
|
|
2927
|
+
def _node_exists(self, conn: sqlite3.Connection, node_id: str) -> bool:
|
|
2928
|
+
row = conn.execute("SELECT 1 FROM nodes WHERE id = ?", (node_id,)).fetchone()
|
|
2929
|
+
return row is not None
|
|
2930
|
+
|
|
2931
|
+
def node_is_embedded(self, node_id: str) -> bool:
    """Report whether ``node_id`` has at least one stored vector embedding.

    Looks for any ``vector_embeddings`` row whose ``item_id`` equals the
    given node id.
    """
    query = "SELECT 1 FROM vector_embeddings WHERE item_id = ? LIMIT 1"
    with self._connect() as conn:
        hit = conn.execute(query, (node_id,)).fetchone()
        found = hit is not None
        return found
|
|
2939
|
+
|
|
2940
|
+
def _attach_source_node(
    self,
    conn: sqlite3.Connection,
    content_node_id: str,
    *,
    source_type: str,
    source_uri: Optional[str] = None,
    title: Optional[str] = None,
    content_hash: Optional[str] = None,
    captured_at: Optional[str] = None,
    extra: Optional[Dict[str, Any]] = None,
) -> str:
    """Upsert the ``Source`` node for an ingested item and link the content to it.

    The source id is ``source:`` followed by the first 24 characters of
    ``_sha256_text(f"{source_type}|{key}")``, where ``key`` is the first
    truthy value among ``source_uri``, ``content_hash`` and
    ``content_node_id``. The same (source_type, key) pair therefore always
    yields the same Source node id.

    Args:
        conn: Open connection used for both upserts.
        content_node_id: Id of the content node the edge starts from.
        source_type: Origin kind; stored in node and edge metadata.
        source_uri: Origin locator, preferred as identity key and summary.
        title: Preferred node label.
        content_hash: Fallback identity key when no URI is given.
        captured_at: Capture timestamp; ``_now()`` is used when falsy.
        extra: Additional metadata merged in last (wins on key collisions).

    Returns:
        The id of the Source node.
    """
    identity = source_uri or content_hash or content_node_id
    digest = _sha256_text(f"{source_type}|{identity}")
    source_id = f"source:{digest[:24]}"

    source_meta: Dict[str, Any] = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "captured_at": captured_at or _now(),
    }
    # Caller extras go in last so they override the base keys on collision.
    source_meta.update(extra or {})

    display = title or source_uri or source_type
    description = str(source_uri or title or source_type)[:400]
    self._upsert_node(
        conn, source_id, "Source", display,
        summary=description,
        metadata=source_meta,
    )
    # Edge: content node --indexed_from--> Source node.
    edge_meta = {"source_type": source_type}
    self._upsert_edge(
        conn, content_node_id, source_id, "indexed_from",
        weight=1.0, metadata=edge_meta,
    )
    return source_id
|
|
2978
|
+
|
|
2979
|
+
def ingest_source(
    self,
    *,
    source_type: str,
    title: str,
    text: str,
    source_uri: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    permissions: Optional[Dict[str, Any]] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    conversation_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Unified text/web ingestion: one shape for URL, browser tab, note, text.

    Creates a content ``Document`` node (idempotent by content hash), a
    ``Source`` node linked via ``indexed_from``, RAG chunks, and extracted
    Concept/Task/Decision nodes — mirroring ingest_document for non-file
    sources. Returns the full set of ids the caller needs to record
    provenance, including ``duplicate`` (was the content already indexed).

    The content id is ``webdoc:`` plus the first 24 characters of the hash of
    ``source_type|source_uri|text``, so the same text under a different
    ``source_type`` or ``source_uri`` becomes a separate Document node.

    Args:
        source_type: Origin kind; falls back to ``"text"`` when falsy.
        title: Display title; falls back to ``source_uri`` then ``source_type``
            and is truncated to 240 characters after ``_clean_text``.
        text: Body to index; ``None``/falsy is treated as an empty string.
        source_uri: Optional origin locator.
        owner: Optional owner; when set a ``Person`` node is upserted and linked.
        workspace_id: Stored in node metadata.
        permissions: Stored in node metadata (``{}`` when falsy).
        captured_at: Capture timestamp; ``_now()`` is used when falsy.
        modified_at: Stored in node metadata as given.
        conversation_id: When set a ``Chat`` node is upserted and linked.
        metadata: Extra keys merged into the node metadata LAST, so they
            override the computed keys above (including ``content_hash``).

    Returns:
        Dict with ``node_id``, ``type``, ``source_node_id``, ``content_hash``,
        ``chunk_ids``, ``chunk_count``, ``duplicate`` and ``captured_at``.
    """
    source_type = str(source_type or "text")
    text = str(text or "")
    title = _clean_text(str(title or source_uri or source_type))[:240] or source_type
    captured_at = captured_at or _now()
    content_hash = _sha256_text(f"{source_type}|{source_uri or ''}|{text}")
    content_id = f"webdoc:{content_hash[:24]}"
    full_text = f"{title}\n{text}"
    node_meta = {
        "source_type": source_type,
        "source_uri": source_uri,
        "content_hash": content_hash,
        "title": title,
        "captured_at": captured_at,
        "modified_at": modified_at,
        "owner": owner,
        "workspace_id": workspace_id,
        "permissions": permissions or {},
        "chars": len(text),
        **(metadata or {}),
    }
    concepts = _extract_concepts(full_text, limit=15)
    triples = _extract_triples(full_text, concepts)
    chunk_ids: List[str] = []

    with self._connect() as conn:
        # Must be checked before the upsert below, which creates the node.
        duplicate = self._node_exists(conn, content_id)
        # ── Content node (point: noun — the document) ──────────────────────
        self._upsert_node(
            conn, content_id, "Document", title,
            summary=(text or title)[:500],
            metadata=node_meta, raw=node_meta,
        )
        # ── SOURCE node + indexed_from edge (origin tracking) ──────────────
        source_node_id = self._attach_source_node(
            conn, content_id,
            source_type=source_type, source_uri=source_uri, title=title,
            content_hash=content_hash, captured_at=captured_at,
            extra={"owner": owner, "workspace_id": workspace_id},
        )
        # ── Owner (Person) + verb-style edge ("업로드함" = "uploaded") ──────
        if owner:
            person_id = f"person:{_slug(owner)}"
            self._upsert_node(conn, person_id, "Person", owner, metadata={"email": owner})
            self._upsert_edge(conn, person_id, content_id, "업로드함", weight=1.0)
        # ── Conversation link ("언급함" = "mentioned") ─────────────────────
        if conversation_id:
            conv_id = f"conversation:{_slug(conversation_id)}"
            self._upsert_node(conn, conv_id, "Chat", conversation_id)
            self._upsert_edge(conn, conv_id, content_id, "언급함", weight=0.8)
        # ── RAG chunks ("포함함" = "contains") ─────────────────────────────
        for index, chunk in enumerate(_chunks(text)):
            # Chunk id depends on the parent id, position and chunk text.
            chunk_id = f"chunk:{_sha256_text(f'{content_id}:{index}:{chunk}')[:24]}"
            chunk_ids.append(chunk_id)
            self._upsert_node(
                conn, chunk_id, "Chunk", f"{title} chunk {index + 1}",
                summary=chunk[:500], metadata={"index": index, "source_node": content_id},
            )
            self._upsert_chunk(conn, chunk_id=chunk_id, source_node=content_id,
                               text=chunk, metadata={"index": index, "source_node": content_id})
            self._upsert_edge(conn, content_id, chunk_id, "포함함")
        # ── Extracted concept nodes + edges; the node type of each concept is
        #    whatever `_classify_node_type` returns ─────────────────────────
        concept_ids: Dict[str, str] = {}
        for concept in concepts:
            node_t = _classify_node_type(concept, full_text)
            cid = f"{node_t.lower()}:{_slug(concept)}"
            concept_ids[concept.lower()] = cid
            self._upsert_node(conn, cid, node_t, concept,
                              metadata={"auto_extracted": True, "source_type": source_type})
            self._upsert_edge(conn, content_id, cid, "포함함", weight=0.8)
        for triple in triples:
            # Only link triples whose both ends were extracted as concepts
            # above, and never create a self-loop.
            subj_id = concept_ids.get(triple["subject"].lower())
            obj_id = concept_ids.get(triple["object"].lower())
            if subj_id and obj_id and subj_id != obj_id:
                self._upsert_edge(conn, subj_id, obj_id, triple["relation"],
                                  weight=1.0, metadata={"context": triple.get("context", "")[:240]})
        # ── Task / Decision nodes (types come from `_semantic_items`) ──────
        for item in _semantic_items(text):
            sem_type = item["type"]
            sem_title = item["title"]
            sem_id = f"{sem_type.lower()}:{_sha256_text(f'{content_id}:{sem_type}:{sem_title}')[:24]}"
            self._upsert_node(conn, sem_id, sem_type, sem_title, summary=item["summary"],
                              metadata={"auto_extracted": True, "source_node": content_id}, raw=item)
            self._upsert_edge(conn, content_id, sem_id, "포함함", weight=0.9)

    return {
        "node_id": content_id,
        "type": "Document",
        "source_node_id": source_node_id,
        "content_hash": content_hash,
        "chunk_ids": chunk_ids,
        "chunk_count": len(chunk_ids),
        "duplicate": duplicate,
        "captured_at": captured_at,
    }
|
|
3096
|
+
|
|
3097
|
+
def record_provenance(
    self,
    *,
    node_id: str,
    source_type: str,
    pipeline: str = "unified-ingestion",
    source_uri: Optional[str] = None,
    content_hash: Optional[str] = None,
    title: Optional[str] = None,
    owner: Optional[str] = None,
    workspace_id: Optional[str] = None,
    captured_at: Optional[str] = None,
    modified_at: Optional[str] = None,
    embedded: bool = False,
    linked: bool = False,
    duplicate: bool = False,
    agent_used: Optional[str] = None,
    chunk_count: int = 0,
    permissions: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Append a provenance record for an ingested node (audit trail).

    Every call inserts a new ``ingestion_provenance`` row; rows previously
    written for the same node are left untouched.

    Args:
        node_id: Graph node the record describes.
        source_type: Origin kind of the ingested content.
        pipeline: Name of the pipeline that processed the content.
        source_uri, content_hash, title, owner, workspace_id, captured_at,
            modified_at, agent_used: Stored as given (``None`` allowed).
        embedded, linked, duplicate: Stored as 0/1 integers.
        chunk_count: Stored as an integer; falsy values become 0.
        permissions, metadata: Serialised with ``_json``; ``{}`` when falsy.

    Returns:
        ``{"id": <record id>, "node_id": node_id, "created_at": <timestamp>}``.
    """
    import uuid  # function-scope import: only needed for the nonce below

    now = _now()
    # A random nonce is mixed into the id basis so that two records for the
    # same node and hash written within one `_now()` tick still get distinct
    # ids. Without it the second write would reuse the first row's id and
    # overwrite that row, losing an entry from the append-only trail.
    prov_basis = f"{node_id}|{content_hash or ''}|{now}|{uuid.uuid4().hex}"
    prov_id = f"prov:{_sha256_text(prov_basis)[:24]}"
    with self._connect() as conn:
        # Plain INSERT (not INSERT OR REPLACE): an audit row is never replaced.
        conn.execute(
            """
            INSERT INTO ingestion_provenance(
                id, node_id, source_type, source_uri, content_hash, title, pipeline,
                owner, workspace_id, captured_at, modified_at, embedded, linked,
                duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                prov_id, node_id, source_type, source_uri, content_hash, title, pipeline,
                owner, workspace_id, captured_at, modified_at, 1 if embedded else 0,
                1 if linked else 0, 1 if duplicate else 0, agent_used, int(chunk_count or 0),
                _json(permissions or {}), _json(metadata or {}), now,
            ),
        )
    return {"id": prov_id, "node_id": node_id, "created_at": now}
|
|
3139
|
+
|
|
3140
|
+
@staticmethod
def _provenance_row(row: sqlite3.Row) -> Dict[str, Any]:
    """Convert one ``ingestion_provenance`` row into a plain dict.

    Text columns are copied as-is, the ``embedded``/``linked``/``duplicate``
    columns are coerced to ``bool``, and the two ``*_json`` columns are
    decoded with ``_safe_loads`` and exposed as ``permissions`` / ``metadata``.
    """
    passthrough = (
        "id", "node_id", "source_type", "source_uri", "content_hash",
        "title", "pipeline", "owner", "workspace_id", "captured_at",
        "modified_at",
    )
    record: Dict[str, Any] = {}
    for column in passthrough:
        record[column] = row[column]
    for flag in ("embedded", "linked", "duplicate"):
        record[flag] = bool(row[flag])
    record["agent_used"] = row["agent_used"]
    record["chunk_count"] = row["chunk_count"]
    record["permissions"] = _safe_loads(row["permissions_json"])
    record["metadata"] = _safe_loads(row["metadata_json"])
    record["created_at"] = row["created_at"]
    return record
|
|
3163
|
+
|
|
3164
|
+
def get_provenance(self, node_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the newest provenance record for ``node_id``.

    Rows are ordered by ``created_at`` and then ``rowid``, both descending,
    and only the first one is converted with ``_provenance_row``.

    Returns:
        The converted record, or ``None`` when the node has no provenance.
    """
    query = (
        "SELECT * FROM ingestion_provenance WHERE node_id = ? "
        "ORDER BY created_at DESC, rowid DESC LIMIT 1"
    )
    with self._connect() as conn:
        latest = conn.execute(query, (node_id,)).fetchone()
    if not latest:
        return None
    return self._provenance_row(latest)
|
|
3173
|
+
|
|
3174
|
+
def list_provenance(self, *, limit: int = 100, source_type: Optional[str] = None) -> Dict[str, Any]:
    """List provenance records newest-first, optionally for one source type.

    Args:
        limit: Maximum rows to return; a falsy value means 100 and the
            result is clamped to the range 1..1000.
        source_type: When truthy, only rows with this ``source_type``.

    Returns:
        ``{"items": [<converted rows>], "count": <number of rows>}``.
    """
    requested = int(limit or 100)
    page_size = 1 if requested < 1 else min(requested, 1000)

    if source_type:
        where_clause = "WHERE source_type = ? "
        params = (source_type, page_size)
    else:
        where_clause = ""
        params = (page_size,)
    query = (
        "SELECT * FROM ingestion_provenance "
        + where_clause
        + "ORDER BY created_at DESC, rowid DESC LIMIT ?"
    )

    with self._connect() as conn:
        found = conn.execute(query, params).fetchall()
    items = [self._provenance_row(entry) for entry in found]
    return {"items": items, "count": len(found)}
|
|
3191
|
+
|
|
3192
|
+
def provenance_stats(self) -> Dict[str, Any]:
    """Summarise the ``ingestion_provenance`` table.

    Returns:
        Dict with the total row count, a per-``source_type`` count map, the
        number of rows flagged ``embedded`` and ``duplicate``, and the
        greatest ``created_at`` value (``None`` for an empty table).
    """
    with self._connect() as conn:

        def count(sql: str) -> int:
            # Each counting query aliases its single value as "c".
            return conn.execute(sql).fetchone()["c"]

        total = count("SELECT COUNT(*) AS c FROM ingestion_provenance")

        per_source: Dict[str, Any] = {}
        grouped = conn.execute(
            "SELECT source_type, COUNT(*) AS c FROM ingestion_provenance GROUP BY source_type"
        ).fetchall()
        for entry in grouped:
            per_source[entry["source_type"]] = entry["c"]

        embedded = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE embedded = 1")
        duplicates = count("SELECT COUNT(*) AS c FROM ingestion_provenance WHERE duplicate = 1")
        newest = conn.execute(
            "SELECT created_at FROM ingestion_provenance ORDER BY created_at DESC LIMIT 1"
        ).fetchone()

    if newest:
        last_ingested_at = newest["created_at"]
    else:
        last_ingested_at = None
    return {
        "total": total,
        "by_source_type": per_source,
        "embedded": embedded,
        "duplicates": duplicates,
        "last_ingested_at": last_ingested_at,
    }
|
|
3218
|
+
|
|
3219
|
+
# ── v3.6.0 portability: logical export / import + binary backup ──────────────
|
|
3220
|
+
def schema_versions(self) -> Dict[str, Any]:
    """Collect the version numbers an export is stamped with.

    ``embed_dim`` and ``kg_v2_schema_version`` are read from the ``kg_schema``
    module; if that import fails for any reason the values 1024 and 2 are
    used instead. The other two entries come from module-level constants.
    """
    try:
        from kg_schema import EMBED_DIM as embed_dim, KG_SCHEMA_V2_VERSION as kg_v2_version
    except Exception:  # pragma: no cover - kg_schema always importable in practice
        embed_dim = 1024
        kg_v2_version = 2

    versions: Dict[str, Any] = {}
    versions["graph_schema_version"] = GRAPH_SCHEMA_VERSION
    versions["kg_v2_schema_version"] = kg_v2_version
    versions["projection_version"] = _PROJECTION_VERSION
    versions["embed_dim"] = embed_dim
    return versions
|
|
3232
|
+
|
|
3233
|
+
def export_graph_data(self) -> Dict[str, Any]:
    """Dump the graph tables as lists of plain row dicts.

    Reads every row of ``nodes``, ``edges``, ``chunks``, ``knowledge_sources``
    and ``ingestion_provenance`` (the last is exposed under the key
    ``provenance``) and adds a ``counts`` map with the row count per key.
    The ``vector_embeddings`` table is not read here; use
    :meth:`backup_database` for a binary copy that includes it.
    """
    # (result key, table name) in output order.
    sections = (
        ("nodes", "nodes"),
        ("edges", "edges"),
        ("chunks", "chunks"),
        ("knowledge_sources", "knowledge_sources"),
        ("provenance", "ingestion_provenance"),
    )
    data: Dict[str, Any] = {}
    with self._connect() as conn:
        for key, table in sections:
            cursor = conn.execute(f"SELECT * FROM {table}")
            data[key] = [dict(record) for record in cursor.fetchall()]
    # Computed before "counts" itself is added, so it covers only the tables.
    sizes = {key: len(items) for key, items in data.items()}
    data["counts"] = sizes
    return data
|
|
3252
|
+
|
|
3253
|
+
def import_graph_data(
    self, data: Dict[str, Any], *, mode: str = "merge", dry_run: bool = False
) -> Dict[str, Any]:
    """Import a logical export back into the store.

    ``mode='replace'`` calls ``clear_all()`` first; any other ``mode`` value
    upserts on top of existing data (id collisions update). ``dry_run=True``
    reports the plan without writing. Refuses artifacts whose header carries
    an integer ``graph_schema_version`` greater than this build's.

    Args:
        data: Export artifact with optional ``nodes``, ``edges``, ``chunks``,
            ``knowledge_sources``, ``provenance`` lists and a ``header`` dict.
        mode: ``"merge"`` (default) or ``"replace"``.
        dry_run: When true, return the plan and touch nothing.

    Returns:
        The plan dict (mode plus per-section counts) with either
        ``dry_run: True`` or ``imported: True`` added.

    Raises:
        ValueError: The artifact's graph schema version is newer than this build.
    """
    nodes = data.get("nodes") or []
    edges = data.get("edges") or []
    chunks = data.get("chunks") or []
    sources = data.get("knowledge_sources") or []
    provenance = data.get("provenance") or []

    header = data.get("header") or {}
    incoming_schema = header.get("graph_schema_version")
    if isinstance(incoming_schema, int) and incoming_schema > GRAPH_SCHEMA_VERSION:
        raise ValueError(
            f"Artifact graph_schema_version {incoming_schema} is newer than this "
            f"build ({GRAPH_SCHEMA_VERSION}); refusing to import."
        )

    plan = {
        "mode": mode,
        "nodes": len(nodes),
        "edges": len(edges),
        "chunks": len(chunks),
        "knowledge_sources": len(sources),
        "provenance": len(provenance),
    }
    if dry_run:
        plan["dry_run"] = True
        return plan

    if mode == "replace":
        # NOTE(review): the clear happens before, and separately from, the
        # import below, so a failure part-way through the import leaves the
        # store cleared — confirm callers take a backup first.
        self.clear_all()

    with self._connect() as conn:
        for n in nodes:
            self._upsert_node(
                conn, n["id"], n["type"], n.get("title") or "",
                summary=n.get("summary") or "",
                metadata=_safe_loads(n.get("metadata_json")),
                raw=_safe_loads(n.get("raw_json")),
            )
        for c in chunks:
            self._upsert_chunk(
                conn, chunk_id=c["id"], source_node=c["source_node"],
                text=c.get("text") or "", metadata=_safe_loads(c.get("metadata_json")),
            )
        for e in edges:
            # Default to 1.0 only when the weight is absent. A plain
            # `weight or 1.0` would also rewrite a legitimate 0.0 weight to
            # 1.0 and make the export/import round trip lossy.
            raw_weight = e.get("weight")
            weight = 1.0 if raw_weight is None or raw_weight == "" else float(raw_weight)
            self._upsert_edge(
                conn, e["from_node"], e["to_node"], e["type"],
                weight=weight,
                metadata=_safe_loads(e.get("metadata_json")),
            )
        for s in sources:
            conn.execute(
                """
                INSERT OR REPLACE INTO knowledge_sources(
                    id, root_path, os_type, drive_id, label, status, include_ocr,
                    watch_enabled, consent_json, created_at, updated_at, last_scanned_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    s["id"], s["root_path"], s["os_type"], s.get("drive_id"), s.get("label"),
                    s.get("status") or "active", int(s.get("include_ocr") or 0),
                    int(s.get("watch_enabled") or 0), s.get("consent_json") or "{}",
                    s.get("created_at") or _now(), s.get("updated_at") or _now(),
                    s.get("last_scanned_at"),
                ),
            )
        for p in provenance:
            # Provenance rows keep their exported id, so re-importing the same
            # artifact overwrites rather than duplicates them.
            conn.execute(
                """
                INSERT OR REPLACE INTO ingestion_provenance(
                    id, node_id, source_type, source_uri, content_hash, title, pipeline,
                    owner, workspace_id, captured_at, modified_at, embedded, linked,
                    duplicate, agent_used, chunk_count, permissions_json, metadata_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    p["id"], p["node_id"], p["source_type"], p.get("source_uri"),
                    p.get("content_hash"), p.get("title"), p.get("pipeline") or "import",
                    p.get("owner"), p.get("workspace_id"), p.get("captured_at"),
                    p.get("modified_at"), int(p.get("embedded") or 0), int(p.get("linked") or 0),
                    int(p.get("duplicate") or 0), p.get("agent_used"), int(p.get("chunk_count") or 0),
                    p.get("permissions_json") or "{}", p.get("metadata_json") or "{}",
                    p.get("created_at") or _now(),
                ),
            )
    plan["imported"] = True
    return plan
|
|
3348
|
+
|
|
3349
|
+
def backup_database(self, dest_path) -> Path:
    """Write a clean, standalone snapshot of the live DB to ``dest_path``.

    Runs ``PRAGMA wal_checkpoint(FULL)`` and then ``VACUUM INTO`` on a fresh
    connection, producing a single-file copy of the whole database.

    The snapshot is first written to a sibling ``<name>.partial`` file and
    only moved over ``dest_path`` once ``VACUUM INTO`` has succeeded, so a
    failed backup never destroys a previous good backup at the same path.

    Args:
        dest_path: Target file; parent directories are created as needed and
            an existing file is replaced on success.

    Returns:
        ``dest_path`` as a :class:`~pathlib.Path`.

    Raises:
        Whatever the checkpoint / ``VACUUM INTO`` statements raise; the
        partial file is removed before the error propagates.
    """
    dest = Path(dest_path)
    dest.parent.mkdir(parents=True, exist_ok=True)
    staging = dest.with_name(dest.name + ".partial")
    if staging.exists():
        staging.unlink()  # VACUUM INTO requires the target to not exist
    conn = self._connect()
    try:
        conn.execute("PRAGMA wal_checkpoint(FULL)")
        conn.execute("VACUUM INTO ?", (str(staging),))
    except Exception:
        # Drop the half-written snapshot; the previous backup stays intact.
        if staging.exists():
            staging.unlink()
        raise
    finally:
        conn.close()
    # Swap the finished snapshot into place, replacing any older backup.
    staging.replace(dest)
    return dest
|
|
3368
|
+
|
|
2857
3369
|
def _ingest_structure_nodes(
|
|
2858
3370
|
self,
|
|
2859
3371
|
conn: sqlite3.Connection,
|
|
@@ -3044,6 +3556,13 @@ class KnowledgeGraphStore:
|
|
|
3044
3556
|
"Feature", # 소프트웨어 기능
|
|
3045
3557
|
"Task", # 할 일
|
|
3046
3558
|
"Decision", # 결정 사항
|
|
3559
|
+
# v3.6.0 Knowledge Graph First — 1급 엔티티를 그래프에 노출
|
|
3560
|
+
"Source", # 수집 출처 (파일/URL/브라우저 탭/git)
|
|
3561
|
+
"Repository", # git 저장소
|
|
3562
|
+
"Meeting", # 회의
|
|
3563
|
+
"Organization", # 조직
|
|
3564
|
+
"Workflow", # 워크플로우
|
|
3565
|
+
"Agent", # 에이전트
|
|
3047
3566
|
)
|
|
3048
3567
|
|
|
3049
3568
|
def list_documents(self, limit: int = 200) -> Dict[str, Any]:
|
package/latticeai/__init__.py
CHANGED