ltcai 0.1.28 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -3
- package/auto_setup.py +605 -0
- package/docs/CHANGELOG.md +30 -0
- package/kg_schema.py +723 -0
- package/package.json +4 -1
- package/server.py +727 -42
- package/static/account.html +5 -616
- package/static/admin.html +236 -1371
- package/static/chat.html +204 -7146
- package/static/graph.html +15 -1436
- package/static/lattice-reference.css +6557 -71
- package/static/scripts/account.js +230 -0
- package/static/scripts/admin.js +1198 -0
- package/static/scripts/chat.js +4634 -0
- package/static/scripts/graph.js +1059 -0
- package/static/sw.js +11 -1
package/kg_schema.py
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lattice AI — Knowledge Graph v2 schema (PPT spec aligned)
|
|
3
|
+
=========================================================
|
|
4
|
+
|
|
5
|
+
명세: ``lattice_ai_full_spec.pptx`` 슬라이드 20~22 (Node / Edge / Data Model)
|
|
6
|
+
|
|
7
|
+
목적
|
|
8
|
+
----
|
|
9
|
+
기존 ``knowledge_graph.py`` 의 자유 문자열 노드/엣지 타입을 **명시 enum + Pydantic
|
|
10
|
+
모델 + SQLite v2 스키마** 로 정식화한다. embedding · confidence · evidence ·
|
|
11
|
+
owner/visibility · createdBy 필드를 1급 시민으로 승격해서, semantic search
|
|
12
|
+
(SIMILAR_TO 엣지 추론) 와 multi-tenant 권한 모델의 기반을 만든다.
|
|
13
|
+
|
|
14
|
+
설계 원칙
|
|
15
|
+
---------
|
|
16
|
+
1. **기존 코드를 깨지 않는다**: 새 테이블 이름은 ``nodes_v2`` / ``edges_v2``
|
|
17
|
+
로 분리. 기존 ``nodes`` / ``edges`` 와 공존한다. 마이그레이션은 별도
|
|
18
|
+
유틸리티(`migrate_legacy_to_v2()`) 로 수행.
|
|
19
|
+
2. **표준 라이브러리만 사용**: Pydantic 이 없는 환경에서도 dataclass 로
|
|
20
|
+
동작하도록 ``from dataclasses import dataclass`` 를 사용한다.
|
|
21
|
+
타입 검증은 ``validate()`` 메서드에서 수동.
|
|
22
|
+
3. **embedding 은 옵셔널이지만 권장**: 차원은 환경 변수
|
|
23
|
+
``LATTICEAI_EMBED_DIM`` (기본 1024). bytes blob 으로 저장.
|
|
24
|
+
4. **마이그레이션 매핑은 명시적**: 한글 동사 → 영문 enum 표가 코드 안에 들어
|
|
25
|
+
있어서 어떤 옛 라벨이 어디로 매핑되는지 한눈에 보인다.
|
|
26
|
+
|
|
27
|
+
사용 예
|
|
28
|
+
-------
|
|
29
|
+
```python
|
|
30
|
+
from kg_schema import (
|
|
31
|
+
KGStoreV2, Node, Edge, NodeType, EdgeType,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
store = KGStoreV2("/Users/me/.ltcai/kg_v2.db")
|
|
35
|
+
store.init_schema()
|
|
36
|
+
|
|
37
|
+
n1 = Node(
|
|
38
|
+
type=NodeType.FILE,
|
|
39
|
+
label="LatticeAI_기획서.pdf",
|
|
40
|
+
attrs={"mime": "application/pdf", "pageCount": 24, "lang": "ko"},
|
|
41
|
+
owner_id="user_seoljun",
|
|
42
|
+
)
|
|
43
|
+
n2 = Node(type=NodeType.CONCEPT, label="MCP")
|
|
44
|
+
store.upsert_node(n1)
|
|
45
|
+
store.upsert_node(n2)
|
|
46
|
+
|
|
47
|
+
store.upsert_edge(Edge(
|
|
48
|
+
source=n1.id, target=n2.id,
|
|
49
|
+
type=EdgeType.MENTIONS,
|
|
50
|
+
weight=0.82, confidence=0.91,
|
|
51
|
+
evidence=["chunk:01HX7K…#p3", "chunk:01HX7K…#p11"],
|
|
52
|
+
created_by="extractor:llm-gemma-3-12b",
|
|
53
|
+
))
|
|
54
|
+
```
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
from __future__ import annotations
|
|
58
|
+
|
|
59
|
+
import json
|
|
60
|
+
import os
|
|
61
|
+
import re
|
|
62
|
+
import sqlite3
|
|
63
|
+
import struct
|
|
64
|
+
import time
|
|
65
|
+
import uuid
|
|
66
|
+
from contextlib import contextmanager
|
|
67
|
+
from dataclasses import dataclass, field, asdict
|
|
68
|
+
from datetime import datetime, timezone
|
|
69
|
+
from enum import Enum
|
|
70
|
+
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ── Schema version ──────────────────────────────────────────────────────────
|
|
74
|
+
# Version stamp that KGStoreV2.init_schema() writes into the kg_meta table.
KG_SCHEMA_V2_VERSION = 2
# Expected embedding length, read once at import time from the
# LATTICEAI_EMBED_DIM environment variable (default "1024"). A value that
# int() cannot parse raises ValueError while the module is being imported.
EMBED_DIM = int(os.getenv("LATTICEAI_EMBED_DIM", "1024"))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ── Node / Edge taxonomy (PPT 슬라이드 20·21) ──────────────────────────────
|
|
79
|
+
class NodeType(str, Enum):
    """Every "noun" of the workspace (node catalog, PPT spec slide 20)."""

    CONVERSATION = "CONVERSATION"  # whole conversation session
    MESSAGE = "MESSAGE"            # single utterance
    FILE = "FILE"                  # uploaded / linked file
    CHUNK = "CHUNK"                # split chunk of a file
    CODE_SYMBOL = "CODE_SYMBOL"    # function / class / module
    CONCEPT = "CONCEPT"            # extracted concept / tag
    PERSON = "PERSON"              # user / collaborator
    MODEL = "MODEL"                # local / remote LLM
    TOOL = "TOOL"                  # MCP server / external tool
    PROJECT = "PROJECT"            # topic-scoped workspace

    @classmethod
    def from_legacy(cls, label: str) -> "NodeType":
        """Map a free-form legacy ``knowledge_graph.py`` type string to the enum.

        The label is stripped and compared case-insensitively. A label that
        already spells a canonical member value (for example ``"code_symbol"``
        or ``"FILE"``) resolves to that member directly; anything else is
        looked up in ``_LEGACY_NODE_MAP``. Unknown or empty labels fall back
        to ``CONCEPT``.
        """
        cleaned = (label or "").strip()
        try:
            # Canonical names first: the legacy table only lists "code" for
            # CODE_SYMBOL, so "code_symbol" used to degrade to CONCEPT.
            return cls(cleaned.upper())
        except ValueError:
            return _LEGACY_NODE_MAP.get(cleaned.lower(), cls.CONCEPT)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class EdgeType(str, Enum):
    """Directed, explicitly typed relation between two nodes (PPT slide 21)."""

    CONTAINS = "CONTAINS"            # FILE -> CHUNK
    MENTIONS = "MENTIONS"            # MESSAGE -> CONCEPT
    REFERENCES = "REFERENCES"        # FILE -> FILE / URL
    REPLIES_TO = "REPLIES_TO"        # MESSAGE -> MESSAGE
    AUTHORED_BY = "AUTHORED_BY"      # FILE -> PERSON
    USES = "USES"                    # PROJECT -> TOOL / MODEL
    DERIVED_FROM = "DERIVED_FROM"    # CHUNK -> CHUNK (summaries etc.)
    SIMILAR_TO = "SIMILAR_TO"        # ANY <-> ANY (semantic similarity)
    DEPENDS_ON = "DEPENDS_ON"        # CODE_SYMBOL -> CODE_SYMBOL
    TAGGED_AS = "TAGGED_AS"          # ANY -> CONCEPT
    VERSION_OF = "VERSION_OF"        # FILE -> FILE (history)
    GRANTS_ACCESS = "GRANTS_ACCESS"  # PERSON -> RESOURCE

    @classmethod
    def from_legacy(cls, label: str) -> "EdgeType":
        """Translate a legacy edge label into an ``EdgeType``.

        The label is stripped and lower-cased, then looked up in
        ``_LEGACY_EDGE_MAP``; labels missing from the table (including empty
        or ``None`` labels) resolve to ``MENTIONS``.
        """
        text = label if label else ""
        key = text.strip().lower()
        return _LEGACY_EDGE_MAP.get(key, cls.MENTIONS)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# Legacy (free-form string / Korean verb) -> enum mapping tables.
# Keys are lower-case: NodeType.from_legacy() strips and lower-cases the label
# before looking it up here, and falls back to CONCEPT for anything missing.
_LEGACY_NODE_MAP: Dict[str, NodeType] = {
    "conversation": NodeType.CONVERSATION,
    "message": NodeType.MESSAGE,
    "airesponse": NodeType.MESSAGE,
    "file": NodeType.FILE,
    "document": NodeType.FILE,
    "page": NodeType.CHUNK,
    "sheet": NodeType.CHUNK,
    "slide": NodeType.CHUNK,
    "chunk": NodeType.CHUNK,
    "code": NodeType.CODE_SYMBOL,
    "concept": NodeType.CONCEPT,
    "feature": NodeType.CONCEPT,
    "error": NodeType.CONCEPT,
    "tag": NodeType.CONCEPT,
    "person": NodeType.PERSON,
    "user": NodeType.PERSON,
    "model": NodeType.MODEL,
    "tool": NodeType.TOOL,
    "mcp": NodeType.TOOL,
    "project": NodeType.PROJECT,
    "workspace": NodeType.PROJECT,
}
|
|
144
|
+
|
|
145
|
+
# Legacy edge label -> EdgeType. EdgeType.from_legacy() strips and lower-cases
# the label before the lookup and falls back to MENTIONS for unknown labels.
_LEGACY_EDGE_MAP: Dict[str, EdgeType] = {
    # Korean verbs (EDGE_VERB in knowledge_graph.py)
    "언급함": EdgeType.MENTIONS,      # "mentions"
    "포함함": EdgeType.CONTAINS,      # "contains"
    "해결함": EdgeType.REFERENCES,    # "resolves"
    "의존함": EdgeType.DEPENDS_ON,    # "depends on"
    "설명함": EdgeType.MENTIONS,      # "explains"
    "비교함": EdgeType.SIMILAR_TO,    # "compares"
    "사용함": EdgeType.USES,          # "uses"
    "연결함": EdgeType.REFERENCES,    # "connects"
    "확장함": EdgeType.DERIVED_FROM,  # "extends"
    "생성함": EdgeType.AUTHORED_BY,   # "creates"
    "대체함": EdgeType.VERSION_OF,    # "replaces"
    "지원함": EdgeType.USES,          # "supports"
    "발생함": EdgeType.REFERENCES,    # "occurs"
    "관련됨": EdgeType.MENTIONS,      # "is related"
    # English aliases
    "mentions": EdgeType.MENTIONS,
    "contains": EdgeType.CONTAINS,
    "references": EdgeType.REFERENCES,
    "replies_to": EdgeType.REPLIES_TO,
    "authored_by": EdgeType.AUTHORED_BY,
    "uses": EdgeType.USES,
    "derived_from": EdgeType.DERIVED_FROM,
    "similar_to": EdgeType.SIMILAR_TO,
    "depends_on": EdgeType.DEPENDS_ON,
    "tagged_as": EdgeType.TAGGED_AS,
    "version_of": EdgeType.VERSION_OF,
    "grants_access": EdgeType.GRANTS_ACCESS,
}
|
|
175
|
+
|
|
176
|
+
# Allowed (source types, target types) per edge type, as in the PPT catalog.
# None in either position means any node type is allowed on that side.
# Consumed by validate_endpoints().
EDGE_ENDPOINT_RULES: Dict[EdgeType, Tuple[Optional[Sequence[NodeType]], Optional[Sequence[NodeType]]]] = {
    EdgeType.CONTAINS: ((NodeType.FILE,), (NodeType.CHUNK,)),
    EdgeType.MENTIONS: ((NodeType.MESSAGE, NodeType.FILE, NodeType.CHUNK),
                        (NodeType.CONCEPT, NodeType.PERSON, NodeType.MODEL, NodeType.TOOL)),
    EdgeType.REFERENCES: ((NodeType.FILE, NodeType.MESSAGE, NodeType.CHUNK),
                          (NodeType.FILE, NodeType.MESSAGE, NodeType.CHUNK)),
    EdgeType.REPLIES_TO: ((NodeType.MESSAGE,), (NodeType.MESSAGE,)),
    EdgeType.AUTHORED_BY: ((NodeType.FILE, NodeType.MESSAGE, NodeType.CONVERSATION),
                           (NodeType.PERSON,)),
    EdgeType.USES: ((NodeType.PROJECT, NodeType.CONVERSATION),
                    (NodeType.TOOL, NodeType.MODEL)),
    EdgeType.DERIVED_FROM: ((NodeType.CHUNK, NodeType.FILE),
                            (NodeType.CHUNK, NodeType.FILE)),
    EdgeType.SIMILAR_TO: (None, None),
    EdgeType.DEPENDS_ON: ((NodeType.CODE_SYMBOL,), (NodeType.CODE_SYMBOL,)),
    EdgeType.TAGGED_AS: (None, (NodeType.CONCEPT,)),
    EdgeType.VERSION_OF: ((NodeType.FILE,), (NodeType.FILE,)),
    EdgeType.GRANTS_ACCESS: ((NodeType.PERSON,),
                             (NodeType.FILE, NodeType.CONVERSATION, NodeType.PROJECT)),
}
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ── Models ──────────────────────────────────────────────────────────────────
|
|
201
|
+
class Visibility(str, Enum):
    """Visibility level stored on a node (``nodes_v2.visibility`` column)."""

    PRIVATE = "private"    # owner only
    INTERNAL = "internal"  # same organisation
    SHARED = "shared"      # explicitly shared
    PUBLIC = "public"      # anyone
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _ulid() -> str:
|
|
209
|
+
"""간이 ULID (timestamp + uuid4 base32). 외부 의존성 없이."""
|
|
210
|
+
ts = int(time.time() * 1000)
|
|
211
|
+
rand = uuid.uuid4().int & ((1 << 80) - 1)
|
|
212
|
+
encoded = (ts << 80) | rand
|
|
213
|
+
chars = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" # Crockford
|
|
214
|
+
out: List[str] = []
|
|
215
|
+
for _ in range(26):
|
|
216
|
+
encoded, r = divmod(encoded, 32)
|
|
217
|
+
out.append(chars[r])
|
|
218
|
+
return "".join(reversed(out))
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _now_iso() -> str:
|
|
222
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def encode_embedding(vec: Sequence[float]) -> Optional[bytes]:
    """Pack a float vector into a little-endian float32 BLOB for SQLite.

    Args:
        vec: Embedding values, or ``None``.

    Returns:
        The packed bytes, or ``None`` when ``vec`` is ``None``.

    Raises:
        ValueError: If ``len(vec)`` differs from ``EMBED_DIM``.
    """
    if vec is not None:
        if len(vec) == EMBED_DIM:
            return struct.pack(f"<{EMBED_DIM}f", *vec)
        raise ValueError(
            f"embedding dim mismatch: got {len(vec)}, expected {EMBED_DIM} "
            f"(set LATTICEAI_EMBED_DIM to override)"
        )
    return None
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def decode_embedding(blob: Optional[bytes]) -> Optional[List[float]]:
    """Unpack a little-endian float32 BLOB back into a list of floats.

    The number of values is derived from the blob's own size rather than from
    the process-wide embedding dimension, so rows written under a different
    ``LATTICEAI_EMBED_DIM`` remain readable.

    Args:
        blob: Bytes produced by packing float32 values, or ``None``.

    Returns:
        The decoded values, or ``None`` when ``blob`` is ``None`` or empty.

    Raises:
        struct.error: If the blob size is not a multiple of 4 bytes.
    """
    if not blob:
        return None
    # Integer division keeps struct's own size check: a trailing partial
    # float makes the buffer larger than the format and raises struct.error.
    count = len(blob) // 4
    return list(struct.unpack(f"<{count}f", blob))
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def cosine(a: Sequence[float], b: Sequence[float]) -> float:
    """Return the cosine similarity of two vectors without numpy.

    Returns ``0.0`` when either input is empty or has zero norm. The dot
    product runs over ``zip(a, b)``, so extra trailing values of the longer
    input do not contribute to it.
    """
    if not (a and b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    if not norm_a or not norm_b:
        return 0.0
    return dot / (norm_a * norm_b)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@dataclass
class Node:
    """Graph node as defined on PPT slide 20."""

    type: NodeType
    label: str
    # Defaults to "node:" followed by a freshly generated ULID-style string.
    id: str = field(default_factory=lambda: f"node:{_ulid()}")
    attrs: Dict[str, Any] = field(default_factory=dict)
    embedding: Optional[List[float]] = None
    owner_id: Optional[str] = None
    visibility: Visibility = Visibility.PRIVATE
    created_at: str = field(default_factory=_now_iso)
    updated_at: str = field(default_factory=_now_iso)

    def validate(self) -> None:
        """Check field types and limits.

        Raises:
            TypeError: ``type`` is not a ``NodeType``, ``attrs`` is not a
                dict, or ``visibility`` is not a ``Visibility``.
            ValueError: ``label`` is empty/blank or longer than 240
                characters, or ``embedding`` does not have ``EMBED_DIM``
                entries.
        """
        if not isinstance(self.type, NodeType):
            raise TypeError(f"Node.type must be NodeType, got {type(self.type)!r}")

        text = self.label
        if not (text and text.strip()):
            raise ValueError("Node.label is required and non-empty")
        if len(text) > 240:
            raise ValueError("Node.label max length is 240 chars")

        if not isinstance(self.attrs, dict):
            raise TypeError("Node.attrs must be a dict")
        if not isinstance(self.visibility, Visibility):
            raise TypeError("Node.visibility must be Visibility enum")

        vector = self.embedding
        if vector is not None and len(vector) != EMBED_DIM:
            raise ValueError(f"Node.embedding dim must be {EMBED_DIM}")

    def to_json(self) -> Dict[str, Any]:
        """Return a dict view with enum fields replaced by their string values.

        The embedding, when present, is replaced by a short length summary
        instead of the raw vector (for readability).
        """
        payload = asdict(self)
        payload.update(type=self.type.value, visibility=self.visibility.value)
        if self.embedding is not None:
            payload["embedding"] = f"[…{len(self.embedding)} dims]"
        return payload
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@dataclass
class Edge:
    """Graph edge as defined on PPT slide 21."""

    source: str
    target: str
    type: EdgeType
    # Defaults to "edge:" followed by a freshly generated ULID-style string.
    id: str = field(default_factory=lambda: f"edge:{_ulid()}")
    weight: float = 1.0        # strength, 0..1
    confidence: float = 1.0    # extraction confidence, 0..1
    evidence: List[str] = field(default_factory=list)  # supporting node/chunk ids
    created_by: str = "user"   # extractor identifier
    created_at: str = field(default_factory=_now_iso)

    def validate(self) -> None:
        """Check the edge's own fields (endpoint node types are checked elsewhere).

        Raises:
            TypeError: ``type`` is not an ``EdgeType``.
            ValueError: ``source``/``target`` is empty, the edge is a
                self-loop of any type other than ``SIMILAR_TO``, or
                ``weight``/``confidence`` lies outside ``[0, 1]``.
        """
        if not isinstance(self.type, EdgeType):
            raise TypeError("Edge.type must be EdgeType")
        if not (self.source and self.target):
            raise ValueError("Edge.source and Edge.target are required")
        # Self-references are only permitted for SIMILAR_TO.
        if self.type is not EdgeType.SIMILAR_TO and self.source == self.target:
            raise ValueError(f"self-loop not allowed for {self.type.value}")
        for field_name in ("weight", "confidence"):
            value = getattr(self, field_name)
            if not (0.0 <= value <= 1.0):
                raise ValueError(f"Edge.{field_name} must be in [0, 1]")

    def to_json(self) -> Dict[str, Any]:
        """Return a dict view with ``type`` replaced by its string value."""
        payload = asdict(self)
        payload["type"] = self.type.value
        return payload
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def validate_endpoints(edge_type: EdgeType, src_type: NodeType, tgt_type: NodeType) -> None:
    """Verify that an edge connects node types allowed by ``EDGE_ENDPOINT_RULES``.

    Edge types without a rule entry are accepted as-is, and a ``None`` side in
    a rule accepts any node type. The source side is checked before the
    target side.

    Raises:
        ValueError: If the source or target node type is not allowed.
    """
    rule = EDGE_ENDPOINT_RULES.get(edge_type)
    if rule is None:
        return
    sides = (("source", rule[0], src_type), ("target", rule[1], tgt_type))
    for role, allowed, actual in sides:
        if allowed is None or actual in allowed:
            continue
        raise ValueError(
            f"{edge_type.value}: {role} must be one of "
            f"{[t.value for t in allowed]}, got {actual.value}"
        )
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
# ── SQLite v2 store ─────────────────────────────────────────────────────────
|
|
341
|
+
SCHEMA_SQL = """
|
|
342
|
+
CREATE TABLE IF NOT EXISTS kg_meta (
|
|
343
|
+
key TEXT PRIMARY KEY,
|
|
344
|
+
value TEXT NOT NULL
|
|
345
|
+
);
|
|
346
|
+
|
|
347
|
+
CREATE TABLE IF NOT EXISTS nodes_v2 (
|
|
348
|
+
id TEXT PRIMARY KEY,
|
|
349
|
+
type TEXT NOT NULL,
|
|
350
|
+
label TEXT NOT NULL,
|
|
351
|
+
attrs TEXT NOT NULL DEFAULT '{}',
|
|
352
|
+
embedding BLOB,
|
|
353
|
+
owner_id TEXT,
|
|
354
|
+
visibility TEXT NOT NULL DEFAULT 'private',
|
|
355
|
+
created_at TEXT NOT NULL,
|
|
356
|
+
updated_at TEXT NOT NULL
|
|
357
|
+
);
|
|
358
|
+
|
|
359
|
+
CREATE TABLE IF NOT EXISTS edges_v2 (
|
|
360
|
+
id TEXT PRIMARY KEY,
|
|
361
|
+
source TEXT NOT NULL,
|
|
362
|
+
target TEXT NOT NULL,
|
|
363
|
+
type TEXT NOT NULL,
|
|
364
|
+
weight REAL NOT NULL DEFAULT 1.0,
|
|
365
|
+
confidence REAL NOT NULL DEFAULT 1.0,
|
|
366
|
+
evidence TEXT NOT NULL DEFAULT '[]',
|
|
367
|
+
created_by TEXT NOT NULL DEFAULT 'user',
|
|
368
|
+
created_at TEXT NOT NULL,
|
|
369
|
+
UNIQUE(source, target, type),
|
|
370
|
+
FOREIGN KEY(source) REFERENCES nodes_v2(id) ON DELETE CASCADE,
|
|
371
|
+
FOREIGN KEY(target) REFERENCES nodes_v2(id) ON DELETE CASCADE
|
|
372
|
+
);
|
|
373
|
+
|
|
374
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_v2_type ON nodes_v2(type);
|
|
375
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_v2_owner ON nodes_v2(owner_id);
|
|
376
|
+
CREATE INDEX IF NOT EXISTS idx_edges_v2_source ON edges_v2(source);
|
|
377
|
+
CREATE INDEX IF NOT EXISTS idx_edges_v2_target ON edges_v2(target);
|
|
378
|
+
CREATE INDEX IF NOT EXISTS idx_edges_v2_type ON edges_v2(type);
|
|
379
|
+
"""
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class KGStoreV2:
    """Lightweight SQLite-backed store for the v2 knowledge graph.

    Every public method opens its own short-lived connection through
    ``_conn()``. Similarity search is a pure-Python cosine scan over the
    stored embeddings; ``_has_vec`` is initialised but never read in this
    class, so no sqlite-vec index is used here.
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        # Placeholder for sqlite-vec detection; nothing in this class sets it.
        self._has_vec: Optional[bool] = None

    @contextmanager
    def _conn(self):
        """Yield a connection with ``sqlite3.Row`` rows and foreign keys on.

        Commits when the ``with`` body finishes normally. If the body raises,
        the commit is skipped and the connection is closed, which discards
        the pending transaction.
        """
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    def init_schema(self) -> None:
        """Create the v2 tables/indexes and record schema version and embed dim."""
        with self._conn() as conn:
            conn.executescript(SCHEMA_SQL)
            conn.executemany(
                "INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
                [
                    ("schema_version", str(KG_SCHEMA_V2_VERSION)),
                    ("embed_dim", str(EMBED_DIM)),
                ],
            )

    # ── Upsert ───────────────────────────────────────────────
    def upsert_node(self, node: Node) -> str:
        """Insert ``node`` or update the row with the same id; return the id.

        ``node.updated_at`` is refreshed in place before writing. On update
        ``created_at`` is left untouched, and an existing embedding is kept
        when the incoming node has none (``COALESCE``).

        Raises:
            TypeError, ValueError: From ``node.validate()``.
        """
        node.validate()
        node.updated_at = _now_iso()
        with self._conn() as conn:
            conn.execute(
                """
                INSERT INTO nodes_v2(id, type, label, attrs, embedding,
                owner_id, visibility, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(id) DO UPDATE SET
                type=excluded.type,
                label=excluded.label,
                attrs=excluded.attrs,
                embedding=COALESCE(excluded.embedding, nodes_v2.embedding),
                owner_id=excluded.owner_id,
                visibility=excluded.visibility,
                updated_at=excluded.updated_at
                """,
                (
                    node.id, node.type.value, node.label,
                    json.dumps(node.attrs, ensure_ascii=False),
                    encode_embedding(node.embedding),
                    node.owner_id, node.visibility.value,
                    node.created_at, node.updated_at,
                ),
            )
        return node.id

    def upsert_edge(self, edge: Edge, *, check_endpoints: bool = True) -> str:
        """Insert ``edge`` or update the row with the same (source, target, type).

        Args:
            edge: Edge to store.
            check_endpoints: When true, both endpoint nodes must already
                exist and their types must satisfy ``validate_endpoints``.

        Returns:
            The id of the stored row. When an existing row was updated this
            is that row's original id, which can differ from ``edge.id``.

        Raises:
            TypeError, ValueError: From ``edge.validate()`` or the endpoint
                checks.
        """
        edge.validate()
        if check_endpoints:
            src = self.get_node(edge.source)
            tgt = self.get_node(edge.target)
            if src is None or tgt is None:
                raise ValueError("Edge endpoints must exist as nodes")
            validate_endpoints(edge.type, src.type, tgt.type)
        with self._conn() as conn:
            conn.execute(
                """
                INSERT INTO edges_v2(id, source, target, type, weight,
                confidence, evidence, created_by, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(source, target, type) DO UPDATE SET
                weight=excluded.weight,
                confidence=excluded.confidence,
                evidence=excluded.evidence,
                created_by=excluded.created_by
                """,
                (
                    edge.id, edge.source, edge.target, edge.type.value,
                    float(edge.weight), float(edge.confidence),
                    json.dumps(edge.evidence, ensure_ascii=False),
                    edge.created_by, edge.created_at,
                ),
            )
            # The conflict branch keeps the existing row's id, so read back
            # the id that is actually stored instead of assuming edge.id.
            stored = conn.execute(
                "SELECT id FROM edges_v2 WHERE source = ? AND target = ? AND type = ?",
                (edge.source, edge.target, edge.type.value),
            ).fetchone()
        return stored["id"] if stored else edge.id

    # ── Read ─────────────────────────────────────────────────
    def get_node(self, node_id: str) -> Optional[Node]:
        """Return the node with ``node_id``, or ``None`` if there is no such row."""
        with self._conn() as conn:
            row = conn.execute(
                "SELECT * FROM nodes_v2 WHERE id = ?", (node_id,)
            ).fetchone()
        return _row_to_node(row) if row else None

    def list_nodes(self, *, type: Optional[NodeType] = None,
                   owner_id: Optional[str] = None,
                   limit: int = 100) -> List[Node]:
        """Return up to ``limit`` nodes, most recently updated first.

        ``type`` and ``owner_id`` are optional equality filters.
        """
        sql = "SELECT * FROM nodes_v2 WHERE 1=1"
        args: List[Any] = []
        if type is not None:
            sql += " AND type = ?"
            args.append(type.value)
        if owner_id is not None:
            sql += " AND owner_id = ?"
            args.append(owner_id)
        sql += " ORDER BY updated_at DESC LIMIT ?"
        args.append(int(limit))
        with self._conn() as conn:
            rows = conn.execute(sql, args).fetchall()
        return [_row_to_node(r) for r in rows]

    def neighbors(self, node_id: str, *,
                  edge_type: Optional[EdgeType] = None,
                  direction: str = "both",
                  limit: int = 50) -> List[Tuple[Edge, Node]]:
        """Return ``(edge, other_node)`` pairs adjacent to ``node_id``.

        Args:
            node_id: Node whose edges are listed.
            edge_type: Optional filter on the edge type.
            direction: ``"out"`` (node is source), ``"in"`` (node is target)
                or ``"both"``.
            limit: Maximum number of edges read, ordered by weight and then
                confidence, both descending.

        Raises:
            ValueError: If ``direction`` is not one of the three values.
        """
        if direction not in ("out", "in", "both"):
            raise ValueError("direction must be 'out' | 'in' | 'both'")
        clauses: List[str] = []
        args: List[Any] = []
        if direction in ("out", "both"):
            clauses.append("source = ?")
            args.append(node_id)
        if direction in ("in", "both"):
            clauses.append("target = ?")
            args.append(node_id)
        sql = f"SELECT * FROM edges_v2 WHERE ({' OR '.join(clauses)})"
        if edge_type is not None:
            sql += " AND type = ?"
            args.append(edge_type.value)
        sql += " ORDER BY weight DESC, confidence DESC LIMIT ?"
        args.append(int(limit))
        out: List[Tuple[Edge, Node]] = []
        with self._conn() as conn:
            edges = [_row_to_edge(r) for r in conn.execute(sql, args).fetchall()]
            for e in edges:
                # For a self-loop both ends equal node_id, so this yields node_id.
                other_id = e.target if e.source == node_id else e.source
                row = conn.execute(
                    "SELECT * FROM nodes_v2 WHERE id = ?", (other_id,)
                ).fetchone()
                if row:
                    out.append((e, _row_to_node(row)))
        return out

    def search_similar(self, vec: Sequence[float], *,
                       top_k: int = 8,
                       type: Optional[NodeType] = None,
                       owner_id: Optional[str] = None) -> List[Tuple[Node, float]]:
        """Return the ``top_k`` nodes most similar to ``vec`` by cosine score.

        This is a full scan: every node with a stored embedding that matches
        the optional ``type`` / ``owner_id`` filters is loaded and scored in
        Python. Nodes whose stored vector length differs from the query are
        skipped instead of being scored on a truncated overlap.

        Raises:
            ValueError: If ``len(vec)`` differs from ``EMBED_DIM``.
        """
        if len(vec) != EMBED_DIM:
            raise ValueError(f"query embedding dim must be {EMBED_DIM}")
        if top_k <= 0:
            # A negative slice bound would drop results from the tail instead.
            return []
        sql = "SELECT * FROM nodes_v2 WHERE embedding IS NOT NULL"
        args: List[Any] = []
        if type is not None:
            sql += " AND type = ?"
            args.append(type.value)
        if owner_id is not None:
            sql += " AND owner_id = ?"
            args.append(owner_id)
        with self._conn() as conn:
            rows = conn.execute(sql, args).fetchall()
        scored: List[Tuple[Node, float]] = []
        for r in rows:
            # Build the node once and reuse its decoded embedding.
            node = _row_to_node(r)
            emb = node.embedding
            if emb is None or len(emb) != len(vec):
                continue
            scored.append((node, cosine(vec, emb)))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

    # ── Maintenance ──────────────────────────────────────────
    def stats(self) -> Dict[str, Any]:
        """Return total and per-type node/edge counts plus schema metadata."""
        with self._conn() as conn:
            n_nodes = conn.execute("SELECT COUNT(*) FROM nodes_v2").fetchone()[0]
            n_edges = conn.execute("SELECT COUNT(*) FROM edges_v2").fetchone()[0]
            per_type = {
                r["type"]: r["c"]
                for r in conn.execute(
                    "SELECT type, COUNT(*) AS c FROM nodes_v2 GROUP BY type"
                ).fetchall()
            }
            per_edge = {
                r["type"]: r["c"]
                for r in conn.execute(
                    "SELECT type, COUNT(*) AS c FROM edges_v2 GROUP BY type"
                ).fetchall()
            }
        return {
            "schema_version": KG_SCHEMA_V2_VERSION,
            "embed_dim": EMBED_DIM,
            "nodes": n_nodes,
            "edges": n_edges,
            "by_node_type": per_type,
            "by_edge_type": per_edge,
        }
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
# ── Row → model helpers ────────────────────────────────────────────────────
|
|
577
|
+
def _row_to_node(row: sqlite3.Row) -> Node:
    """Build a ``Node`` from a ``nodes_v2`` row.

    ``attrs`` is parsed from JSON text (NULL/empty is treated as ``{}``), the
    embedding BLOB is decoded, and ``type`` / ``visibility`` are converted to
    their enums.
    """
    # Fields are converted in the same order as the columns are listed here.
    fields = {
        "id": row["id"],
        "type": NodeType(row["type"]),
        "label": row["label"],
        "attrs": json.loads(row["attrs"] or "{}"),
        "embedding": decode_embedding(row["embedding"]),
        "owner_id": row["owner_id"],
        "visibility": Visibility(row["visibility"]),
        "created_at": row["created_at"],
        "updated_at": row["updated_at"],
    }
    return Node(**fields)
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def _row_to_edge(row: sqlite3.Row) -> Edge:
    """Build an ``Edge`` from an ``edges_v2`` row.

    ``evidence`` is parsed from JSON text (NULL/empty is treated as ``[]``),
    ``weight`` / ``confidence`` are coerced to ``float`` and ``type`` is
    converted to ``EdgeType``.
    """
    # Fields are converted in the same order as the columns are listed here.
    fields = {
        "id": row["id"],
        "source": row["source"],
        "target": row["target"],
        "type": EdgeType(row["type"]),
        "weight": float(row["weight"]),
        "confidence": float(row["confidence"]),
        "evidence": json.loads(row["evidence"] or "[]"),
        "created_by": row["created_by"],
        "created_at": row["created_at"],
    }
    return Edge(**fields)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
# ── Migration: legacy (nodes/edges) → v2 (nodes_v2/edges_v2) ───────────────
|
|
606
|
+
def migrate_legacy_to_v2(db_path: str, *, dry_run: bool = False) -> Dict[str, int]:
    """Copy legacy ``nodes`` / ``edges`` rows into ``nodes_v2`` / ``edges_v2``.

    Args:
        db_path: SQLite database holding the legacy tables; the v2 tables are
            created in the same file.
        dry_run: When true, only count the legacy rows. No v2 schema is
            created and nothing is written.

    Returns:
        Counters ``nodes_seen``, ``nodes_migrated``, ``edges_seen``,
        ``edges_migrated`` and ``edges_skipped``.

    A legacy table that is missing, or lacks one of the selected columns, is
    treated as empty. A row that cannot be converted or stored is reported
    through ``_log_skip`` and skipped; it does not abort the migration.
    Edges are written with ``check_endpoints=False``.
    """
    counters = {"nodes_seen": 0, "nodes_migrated": 0,
                "edges_seen": 0, "edges_migrated": 0, "edges_skipped": 0}
    store = KGStoreV2(db_path)
    if not dry_run:
        # A dry run must not modify the database, so the schema is only
        # created when rows are actually going to be written.
        store.init_schema()

    # sqlite3's connection context manager only commits or rolls back; it
    # does not close the connection, so close it explicitly.
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        try:
            old_nodes = conn.execute(
                "SELECT id, type, title, summary, metadata_json, created_at, updated_at FROM nodes"
            ).fetchall()
        except sqlite3.OperationalError:
            old_nodes = []
        try:
            old_edges = conn.execute(
                "SELECT id, from_node, to_node, type, weight, metadata_json, created_at FROM edges"
            ).fetchall()
        except sqlite3.OperationalError:
            old_edges = []
    finally:
        conn.close()

    counters["nodes_seen"] = len(old_nodes)
    counters["edges_seen"] = len(old_edges)
    if dry_run:
        return counters

    # nodes
    for r in old_nodes:
        try:
            node = Node(
                id=r["id"],
                type=NodeType.from_legacy(r["type"]),
                label=r["title"] or "(untitled)",
                attrs=_safe_json(r["metadata_json"]),
                created_at=r["created_at"] or _now_iso(),
                updated_at=r["updated_at"] or _now_iso(),
            )
            store.upsert_node(node)
            counters["nodes_migrated"] += 1
        except Exception as exc:
            # Deliberately broad: a malformed legacy row is skipped, not fatal.
            _log_skip("node", r["id"], exc)

    # edges
    for r in old_edges:
        try:
            meta = _safe_json(r["metadata_json"])
            raw_weight = r["weight"]
            edge = Edge(
                id=r["id"],
                source=r["from_node"],
                target=r["to_node"],
                type=EdgeType.from_legacy(r["type"]),
                # Only a missing weight defaults to 1.0; a stored 0 stays 0.
                weight=1.0 if raw_weight is None else float(raw_weight),
                confidence=float(meta.get("confidence", 1.0)),
                evidence=list(meta.get("evidence", []) or []),
                created_by=str(meta.get("created_by", "legacy")),
                created_at=r["created_at"] or _now_iso(),
            )
            store.upsert_edge(edge, check_endpoints=False)
            counters["edges_migrated"] += 1
        except Exception as exc:
            # Deliberately broad: a malformed legacy row is skipped, not fatal.
            counters["edges_skipped"] += 1
            _log_skip("edge", r["id"], exc)
    return counters
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def _safe_json(raw: Optional[str]) -> Dict[str, Any]:
|
|
676
|
+
if not raw:
|
|
677
|
+
return {}
|
|
678
|
+
try:
|
|
679
|
+
v = json.loads(raw)
|
|
680
|
+
return v if isinstance(v, dict) else {"_raw": v}
|
|
681
|
+
except (ValueError, TypeError):
|
|
682
|
+
return {"_raw": raw}
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def _log_skip(kind: str, ident: str, exc: Exception) -> None:
|
|
686
|
+
# 의도적으로 print: 마이그레이션은 일회성이라 로깅 인프라 의존 안 함
|
|
687
|
+
print(f"[migrate] skip {kind} {ident}: {exc}")
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
# ── CLI ────────────────────────────────────────────────────────────────────
|
|
691
|
+
def _cli() -> int:
    """Parse the command line and run one of ``init`` / ``migrate`` / ``stats``.

    Returns:
        ``0`` after a recognised sub-command has run, ``2`` otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(prog="kg_schema",
                                     description="Lattice AI KG v2 utilities")
    commands = parser.add_subparsers(dest="cmd", required=True)

    init_cmd = commands.add_parser("init", help="initialize v2 schema in a DB")
    migrate_cmd = commands.add_parser("migrate", help="migrate legacy nodes/edges → v2")
    stats_cmd = commands.add_parser("stats", help="print store statistics")
    # Every sub-command takes the database path as its positional argument.
    for command in (init_cmd, migrate_cmd, stats_cmd):
        command.add_argument("db", help="path to sqlite db")
    migrate_cmd.add_argument("--dry-run", action="store_true")

    args = parser.parse_args()
    chosen = args.cmd
    if chosen == "init":
        KGStoreV2(args.db).init_schema()
        print(f"initialized v2 schema in {args.db}")
    elif chosen == "migrate":
        out = migrate_legacy_to_v2(args.db, dry_run=args.dry_run)
        print(json.dumps(out, indent=2, ensure_ascii=False))
    elif chosen == "stats":
        print(json.dumps(KGStoreV2(args.db).stats(), indent=2, ensure_ascii=False))
    else:
        return 2
    return 0
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
# Run the CLI when executed as a script; _cli()'s return value becomes the
# process exit status via SystemExit.
if __name__ == "__main__":
    raise SystemExit(_cli())
|