ltcai 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -8
- package/docs/CHANGELOG.md +59 -0
- package/kg_schema.py +179 -518
- package/knowledge_graph.py +183 -80
- package/latticeai/core/agent.py +2 -2
- package/latticeai/core/agent_prompts.py +101 -0
- package/latticeai/core/tool_registry.py +288 -0
- package/latticeai/server_app.py +5806 -0
- package/package.json +2 -2
- package/server.py +13 -6259
- package/tools.py +6 -5
package/kg_schema.py
CHANGED
|
@@ -6,51 +6,37 @@ Lattice AI — Knowledge Graph v2 schema (PPT spec aligned)
|
|
|
6
6
|
|
|
7
7
|
목적
|
|
8
8
|
----
|
|
9
|
-
기존 ``knowledge_graph.py`` 의 자유 문자열 노드/엣지 타입을 **명시 enum +
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
(
|
|
9
|
+
기존 ``knowledge_graph.py`` 의 자유 문자열 노드/엣지 타입을 **명시 enum + SQLite v2
|
|
10
|
+
스키마** 로 정식화한다. 이 모듈은 **스키마/초기화/프로젝션 지원** 역할만 담당한다:
|
|
11
|
+
``NodeType``/``EdgeType`` taxonomy + legacy 정규화 매핑, ``nodes_v2``/``edges_v2``
|
|
12
|
+
DDL(``SCHEMA_SQL``), 그리고 ``KGStoreV2``(스키마 init·heal·stats).
|
|
13
|
+
|
|
14
|
+
실제 데이터 read/write 는 ``knowledge_graph.py`` 의 ``KnowledgeGraphStore`` 가
|
|
15
|
+
legacy 테이블에 대한 dual-write 프로젝션(raw SQL) + ``kgv2_*`` 재구성 뷰로 수행한다.
|
|
16
|
+
(과거의 native ``Node``/``Edge`` 모델과 ``KGStoreV2.upsert_*``/``get_node``/
|
|
17
|
+
``search_*`` API 는 production 에서 쓰이지 않아 제거되었다.)
|
|
13
18
|
|
|
14
19
|
설계 원칙
|
|
15
20
|
---------
|
|
16
21
|
1. **기존 코드를 깨지 않는다**: 새 테이블 이름은 ``nodes_v2`` / ``edges_v2``
|
|
17
|
-
로 분리. 기존 ``nodes`` / ``edges`` 와 공존한다.
|
|
18
|
-
|
|
19
|
-
2.
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
4.
|
|
25
|
-
있어서 어떤 옛 라벨이 어디로 매핑되는지 한눈에 보인다.
|
|
22
|
+
로 분리. 기존 ``nodes`` / ``edges`` 와 공존한다. legacy → v2 reprojection 은
|
|
23
|
+
``knowledge_graph.py`` 의 버전 게이트 백필 한 곳에서만 수행한다.
|
|
24
|
+
2. **정규화 + 무손실**: legacy 자유 문자열 타입은 ``NodeType``/``EdgeType``
|
|
25
|
+
superset 으로 정규화해 ``type`` 칼럼에 저장하고, 원본 문자열은 ``legacy_type``
|
|
26
|
+
칼럼에 그대로 보존한다. summary 와 metadata 는 ``attrs._kg`` 패스스루 blob 이
|
|
27
|
+
아니라 전용 ``summary`` 칼럼 / ``attrs``·``metadata`` 칼럼에 1급으로 저장한다.
|
|
28
|
+
3. **표준 라이브러리만 사용**: 외부 의존성 없이 ``sqlite3`` 만으로 동작한다.
|
|
29
|
+
4. **정규화 매핑은 명시적**: 한글 동사/legacy 라벨 → 영문 enum 표가 코드 안에
|
|
30
|
+
들어 있어서 어떤 옛 라벨이 어디로 매핑되는지 한눈에 보인다.
|
|
26
31
|
|
|
27
32
|
사용 예
|
|
28
33
|
-------
|
|
29
34
|
```python
|
|
30
|
-
from kg_schema import
|
|
31
|
-
KGStoreV2, Node, Edge, NodeType, EdgeType,
|
|
32
|
-
)
|
|
35
|
+
from kg_schema import KGStoreV2
|
|
33
36
|
|
|
34
37
|
store = KGStoreV2("/Users/me/.ltcai/kg_v2.db")
|
|
35
|
-
store.init_schema()
|
|
36
|
-
|
|
37
|
-
n1 = Node(
|
|
38
|
-
type=NodeType.FILE,
|
|
39
|
-
label="LatticeAI_기획서.pdf",
|
|
40
|
-
attrs={"mime": "application/pdf", "pageCount": 24, "lang": "ko"},
|
|
41
|
-
owner_id="user_seoljun",
|
|
42
|
-
)
|
|
43
|
-
n2 = Node(type=NodeType.CONCEPT, label="MCP")
|
|
44
|
-
store.upsert_node(n1)
|
|
45
|
-
store.upsert_node(n2)
|
|
46
|
-
|
|
47
|
-
store.upsert_edge(Edge(
|
|
48
|
-
source=n1.id, target=n2.id,
|
|
49
|
-
type=EdgeType.MENTIONS,
|
|
50
|
-
weight=0.82, confidence=0.91,
|
|
51
|
-
evidence=["chunk:01HX7K…#p3", "chunk:01HX7K…#p11"],
|
|
52
|
-
created_by="extractor:llm-gemma-3-12b",
|
|
53
|
-
))
|
|
38
|
+
store.init_schema() # nodes_v2 / edges_v2 생성 + 컬럼 drift self-heal
|
|
39
|
+
print(store.stats()) # {"nodes": ..., "by_node_type": {...}, ...}
|
|
54
40
|
```
|
|
55
41
|
"""
|
|
56
42
|
|
|
@@ -58,17 +44,11 @@ from __future__ import annotations
|
|
|
58
44
|
|
|
59
45
|
import json
|
|
60
46
|
import os
|
|
61
|
-
import re
|
|
62
47
|
import logging
|
|
63
48
|
import sqlite3
|
|
64
|
-
import struct
|
|
65
|
-
import time
|
|
66
|
-
import uuid
|
|
67
49
|
from contextlib import contextmanager
|
|
68
|
-
from dataclasses import dataclass, field, asdict
|
|
69
|
-
from datetime import datetime, timezone
|
|
70
50
|
from enum import Enum
|
|
71
|
-
from typing import Any, Dict,
|
|
51
|
+
from typing import Any, Dict, Optional
|
|
72
52
|
|
|
73
53
|
|
|
74
54
|
# ── Schema version ──────────────────────────────────────────────────────────
|
|
@@ -78,7 +58,15 @@ EMBED_DIM = int(os.getenv("LATTICEAI_EMBED_DIM", "1024"))
|
|
|
78
58
|
|
|
79
59
|
# ── Node / Edge taxonomy (PPT 슬라이드 20·21) ──────────────────────────────
|
|
80
60
|
class NodeType(str, Enum):
|
|
81
|
-
"""워크스페이스의 모든 ‘명사’.
|
|
61
|
+
"""워크스페이스의 모든 ‘명사’.
|
|
62
|
+
|
|
63
|
+
PPT 슬라이드 20 카탈로그(상단 그룹)에 더해, ``knowledge_graph.py`` 가 실제로
|
|
64
|
+
써오던 legacy 자유 문자열 타입을 **무손실 superset**(하단 그룹)으로 1급 enum 화
|
|
65
|
+
한다. 덕분에 ``from_legacy`` 정규화가 의미를 잃지 않고(예: ``Computer`` →
|
|
66
|
+
``COMPUTER``), 알 수 없는/동적(이벤트) 타입만 ``CONCEPT`` 로 폴백한다.
|
|
67
|
+
원본 문자열은 ``nodes_v2.legacy_type`` 에 그대로 보존되므로 정규화는 항상 무손실.
|
|
68
|
+
"""
|
|
69
|
+
# PPT 슬라이드 20 정식 카탈로그
|
|
82
70
|
CONVERSATION = "CONVERSATION" # 대화 세션 전체
|
|
83
71
|
MESSAGE = "MESSAGE" # 단일 발화
|
|
84
72
|
FILE = "FILE" # 업로드/연결된 파일
|
|
@@ -90,10 +78,35 @@ class NodeType(str, Enum):
|
|
|
90
78
|
MODEL = "MODEL" # 로컬/원격 LLM
|
|
91
79
|
TOOL = "TOOL" # MCP 서버·외부 도구
|
|
92
80
|
PROJECT = "PROJECT" # 주제별 작업 공간
|
|
81
|
+
# legacy superset — knowledge_graph.py 가 실제로 생성하던 노드 타입들
|
|
82
|
+
COMPUTER = "COMPUTER" # 내 컴퓨터 (로컬 스캔 루트)
|
|
83
|
+
DRIVE = "DRIVE" # 드라이브 / 볼륨
|
|
84
|
+
FOLDER = "FOLDER" # 폴더
|
|
85
|
+
CODE_FILE = "CODE_FILE" # 코드 파일 (.py/.ts 등)
|
|
86
|
+
SPREADSHEET = "SPREADSHEET" # 엑셀 / CSV
|
|
87
|
+
SLIDE_DECK = "SLIDE_DECK" # 프레젠테이션
|
|
88
|
+
IMAGE = "IMAGE" # 이미지 파일
|
|
89
|
+
IMAGE_TEXT = "IMAGE_TEXT" # OCR 텍스트
|
|
90
|
+
SLIDE = "SLIDE" # 슬라이드 (덱의 한 장)
|
|
91
|
+
PAGE = "PAGE" # 페이지 (문서의 한 면)
|
|
92
|
+
SHEET = "SHEET" # 시트 (스프레드시트의 한 탭)
|
|
93
|
+
SECTION = "SECTION" # 문서 섹션
|
|
94
|
+
CHAT = "CHAT" # 대화 세션(채팅 UI)
|
|
95
|
+
AI_RESPONSE = "AI_RESPONSE" # 어시스턴트 발화
|
|
96
|
+
TOPIC = "TOPIC" # 주제 / 토픽
|
|
97
|
+
FEATURE = "FEATURE" # 소프트웨어 기능
|
|
98
|
+
TASK = "TASK" # 할 일
|
|
99
|
+
DECISION = "DECISION" # 결정 사항
|
|
100
|
+
ERROR = "ERROR" # 오류 / 버그
|
|
101
|
+
EVENT = "EVENT" # 분석/시스템 이벤트(동적 타입 폴백)
|
|
93
102
|
|
|
94
103
|
@classmethod
|
|
95
104
|
def from_legacy(cls, label: str) -> "NodeType":
|
|
96
|
-
"""legacy ``knowledge_graph.py`` 의 자유 문자열을 정식 enum
|
|
105
|
+
"""legacy ``knowledge_graph.py`` 의 자유 문자열을 정식 enum 으로 정규화.
|
|
106
|
+
|
|
107
|
+
매핑이 없는(동적 이벤트 등) 타입은 ``CONCEPT`` 로 폴백하지만, 호출부는
|
|
108
|
+
원본 문자열을 ``legacy_type`` 칼럼에 별도 보존하므로 정보 손실은 없다.
|
|
109
|
+
"""
|
|
97
110
|
m = (label or "").strip().lower()
|
|
98
111
|
return _LEGACY_NODE_MAP.get(m, cls.CONCEPT)
|
|
99
112
|
|
|
@@ -116,28 +129,61 @@ class EdgeType(str, Enum):
|
|
|
116
129
|
INSPIRED_BY = "INSPIRED_BY" # DOCUMENT → DOCUMENT (영감/참조 관계)
|
|
117
130
|
CONTRADICTS = "CONTRADICTS" # DOCUMENT ↔ DOCUMENT (상충 관계)
|
|
118
131
|
EVOLVES_FROM = "EVOLVES_FROM" # DOCUMENT → DOCUMENT (발전/개정 관계)
|
|
132
|
+
# legacy superset — knowledge_graph.py 가 실제로 생성하던 엣지 타입들
|
|
133
|
+
UPLOADED_BY = "UPLOADED_BY" # PERSON → FILE (업로드함)
|
|
134
|
+
WROTE = "WROTE" # PERSON → CONVERSATION (작성함)
|
|
135
|
+
HAS_EVENT = "HAS_EVENT" # CONVERSATION → EVENT (has_event)
|
|
136
|
+
TRIGGERED = "TRIGGERED" # PERSON → EVENT (triggered)
|
|
137
|
+
HAS_SLIDE = "HAS_SLIDE" # SLIDE_DECK → SLIDE (has_slide)
|
|
138
|
+
HAS_PAGE = "HAS_PAGE" # DOCUMENT → PAGE (has_page)
|
|
139
|
+
HAS_SHEET = "HAS_SHEET" # SPREADSHEET → SHEET (has_sheet)
|
|
140
|
+
HAS_CHUNK = "HAS_CHUNK" # FILE → CHUNK (has_chunk)
|
|
141
|
+
CONTAINS_IMAGE = "CONTAINS_IMAGE" # FILE → IMAGE (contains_image)
|
|
142
|
+
CONTAINS_SIGNAL = "CONTAINS_SIGNAL" # NODE → CONCEPT (contains_signal)
|
|
143
|
+
DISCUSSES = "DISCUSSES" # SLIDE/PAGE → TOPIC (discusses)
|
|
144
|
+
IMPLIES = "IMPLIES" # NODE → NODE (implies)
|
|
145
|
+
RELATED_TO = "RELATED_TO" # ANY ↔ ANY (related_to)
|
|
119
146
|
|
|
120
147
|
@classmethod
|
|
121
148
|
def from_legacy(cls, label: str) -> "EdgeType":
|
|
149
|
+
"""legacy 자유 문자열/한글 동사를 정식 enum 으로 정규화.
|
|
150
|
+
|
|
151
|
+
매핑이 없는 동적 타입은 ``MENTIONS`` 로 폴백하지만, 호출부는 원본 문자열을
|
|
152
|
+
``edges_v2.legacy_type`` 에 보존하므로 정보 손실은 없다.
|
|
153
|
+
"""
|
|
122
154
|
m = (label or "").strip().lower()
|
|
123
155
|
return _LEGACY_EDGE_MAP.get(m, cls.MENTIONS)
|
|
124
156
|
|
|
125
157
|
|
|
126
|
-
# legacy(자유 문자열 / 한글 동사) → enum 매핑
|
|
158
|
+
# legacy(자유 문자열 / 한글 동사) → enum 매핑 표.
|
|
159
|
+
# superset 정규화: 알려진 legacy 타입은 1:1 의미 보존 매핑, 미지/동적 타입만 폴백.
|
|
127
160
|
_LEGACY_NODE_MAP: Dict[str, NodeType] = {
|
|
128
161
|
"conversation": NodeType.CONVERSATION,
|
|
162
|
+
"chat": NodeType.CHAT,
|
|
129
163
|
"message": NodeType.MESSAGE,
|
|
130
|
-
"airesponse": NodeType.
|
|
164
|
+
"airesponse": NodeType.AI_RESPONSE,
|
|
131
165
|
"file": NodeType.FILE,
|
|
132
|
-
"
|
|
133
|
-
"
|
|
134
|
-
"
|
|
135
|
-
"
|
|
166
|
+
"codefile": NodeType.CODE_FILE,
|
|
167
|
+
"spreadsheet": NodeType.SPREADSHEET,
|
|
168
|
+
"slidedeck": NodeType.SLIDE_DECK,
|
|
169
|
+
"image": NodeType.IMAGE,
|
|
170
|
+
"imagetext": NodeType.IMAGE_TEXT,
|
|
171
|
+
"computer": NodeType.COMPUTER,
|
|
172
|
+
"drive": NodeType.DRIVE,
|
|
173
|
+
"folder": NodeType.FOLDER,
|
|
174
|
+
"page": NodeType.PAGE,
|
|
175
|
+
"sheet": NodeType.SHEET,
|
|
176
|
+
"slide": NodeType.SLIDE,
|
|
177
|
+
"section": NodeType.SECTION,
|
|
136
178
|
"chunk": NodeType.CHUNK,
|
|
137
179
|
"code": NodeType.CODE_SYMBOL,
|
|
138
180
|
"concept": NodeType.CONCEPT,
|
|
139
|
-
"
|
|
140
|
-
"
|
|
181
|
+
"topic": NodeType.TOPIC,
|
|
182
|
+
"feature": NodeType.FEATURE,
|
|
183
|
+
"task": NodeType.TASK,
|
|
184
|
+
"decision": NodeType.DECISION,
|
|
185
|
+
"error": NodeType.ERROR,
|
|
186
|
+
"event": NodeType.EVENT,
|
|
141
187
|
"tag": NodeType.CONCEPT,
|
|
142
188
|
"person": NodeType.PERSON,
|
|
143
189
|
"user": NodeType.PERSON,
|
|
@@ -167,6 +213,8 @@ _LEGACY_EDGE_MAP: Dict[str, EdgeType] = {
|
|
|
167
213
|
"연결함": EdgeType.REFERENCES,
|
|
168
214
|
"확장함": EdgeType.DERIVED_FROM,
|
|
169
215
|
"생성함": EdgeType.AUTHORED_BY,
|
|
216
|
+
"작성함": EdgeType.WROTE,
|
|
217
|
+
"업로드함": EdgeType.UPLOADED_BY,
|
|
170
218
|
"대체함": EdgeType.VERSION_OF,
|
|
171
219
|
"지원함": EdgeType.USES,
|
|
172
220
|
"발생함": EdgeType.REFERENCES,
|
|
@@ -188,189 +236,26 @@ _LEGACY_EDGE_MAP: Dict[str, EdgeType] = {
|
|
|
188
236
|
"inspired_by": EdgeType.INSPIRED_BY,
|
|
189
237
|
"contradicts": EdgeType.CONTRADICTS,
|
|
190
238
|
"evolves_from": EdgeType.EVOLVES_FROM,
|
|
239
|
+
# legacy superset 별칭 (knowledge_graph.py 가 실제로 쓰던 엣지 타입)
|
|
240
|
+
"uploaded_by": EdgeType.UPLOADED_BY,
|
|
241
|
+
"wrote": EdgeType.WROTE,
|
|
242
|
+
"has_event": EdgeType.HAS_EVENT,
|
|
243
|
+
"triggered": EdgeType.TRIGGERED,
|
|
244
|
+
"has_slide": EdgeType.HAS_SLIDE,
|
|
245
|
+
"has_page": EdgeType.HAS_PAGE,
|
|
246
|
+
"has_sheet": EdgeType.HAS_SHEET,
|
|
247
|
+
"has_chunk": EdgeType.HAS_CHUNK,
|
|
248
|
+
"contains_image": EdgeType.CONTAINS_IMAGE,
|
|
249
|
+
"contains_signal": EdgeType.CONTAINS_SIGNAL,
|
|
250
|
+
"discusses": EdgeType.DISCUSSES,
|
|
251
|
+
"implies": EdgeType.IMPLIES,
|
|
252
|
+
"related_to": EdgeType.RELATED_TO,
|
|
191
253
|
"활용됨": EdgeType.USED_IN,
|
|
192
254
|
"영감받음": EdgeType.INSPIRED_BY,
|
|
193
255
|
"상충함": EdgeType.CONTRADICTS,
|
|
194
256
|
"발전함": EdgeType.EVOLVES_FROM,
|
|
195
257
|
}
|
|
196
258
|
|
|
197
|
-
# 노드 타입별로 허용되는 source / target 조합 (PPT 카탈로그 그대로)
|
|
198
|
-
# None == 모든 타입 허용
|
|
199
|
-
EDGE_ENDPOINT_RULES: Dict[EdgeType, Tuple[Optional[Sequence[NodeType]], Optional[Sequence[NodeType]]]] = {
|
|
200
|
-
EdgeType.CONTAINS: ((NodeType.FILE, NodeType.DOCUMENT),
|
|
201
|
-
(NodeType.CHUNK,)),
|
|
202
|
-
EdgeType.MENTIONS: ((NodeType.MESSAGE, NodeType.FILE, NodeType.CHUNK, NodeType.DOCUMENT),
|
|
203
|
-
(NodeType.CONCEPT, NodeType.PERSON, NodeType.MODEL, NodeType.TOOL)),
|
|
204
|
-
EdgeType.REFERENCES: ((NodeType.FILE, NodeType.MESSAGE, NodeType.CHUNK),
|
|
205
|
-
(NodeType.FILE, NodeType.MESSAGE, NodeType.CHUNK)),
|
|
206
|
-
EdgeType.REPLIES_TO: ((NodeType.MESSAGE,), (NodeType.MESSAGE,)),
|
|
207
|
-
EdgeType.AUTHORED_BY: ((NodeType.FILE, NodeType.MESSAGE, NodeType.CONVERSATION, NodeType.DOCUMENT),
|
|
208
|
-
(NodeType.PERSON,)),
|
|
209
|
-
EdgeType.USES: ((NodeType.PROJECT, NodeType.CONVERSATION),
|
|
210
|
-
(NodeType.TOOL, NodeType.MODEL)),
|
|
211
|
-
EdgeType.DERIVED_FROM: ((NodeType.CHUNK, NodeType.FILE),
|
|
212
|
-
(NodeType.CHUNK, NodeType.FILE)),
|
|
213
|
-
EdgeType.SIMILAR_TO: (None, None),
|
|
214
|
-
EdgeType.DEPENDS_ON: ((NodeType.CODE_SYMBOL,), (NodeType.CODE_SYMBOL,)),
|
|
215
|
-
EdgeType.TAGGED_AS: (None, (NodeType.CONCEPT,)),
|
|
216
|
-
EdgeType.VERSION_OF: ((NodeType.FILE,), (NodeType.FILE,)),
|
|
217
|
-
EdgeType.GRANTS_ACCESS: ((NodeType.PERSON,),
|
|
218
|
-
(NodeType.FILE, NodeType.CONVERSATION, NodeType.PROJECT)),
|
|
219
|
-
EdgeType.USED_IN: ((NodeType.CONCEPT,),
|
|
220
|
-
(NodeType.DOCUMENT, NodeType.FILE)),
|
|
221
|
-
EdgeType.INSPIRED_BY: ((NodeType.DOCUMENT, NodeType.FILE),
|
|
222
|
-
(NodeType.DOCUMENT, NodeType.FILE)),
|
|
223
|
-
EdgeType.CONTRADICTS: ((NodeType.DOCUMENT, NodeType.FILE),
|
|
224
|
-
(NodeType.DOCUMENT, NodeType.FILE)),
|
|
225
|
-
EdgeType.EVOLVES_FROM: ((NodeType.DOCUMENT, NodeType.FILE),
|
|
226
|
-
(NodeType.DOCUMENT, NodeType.FILE)),
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
# ── Models ──────────────────────────────────────────────────────────────────
|
|
231
|
-
class Visibility(str, Enum):
|
|
232
|
-
PRIVATE = "private" # 소유자만
|
|
233
|
-
INTERNAL = "internal" # 같은 조직
|
|
234
|
-
SHARED = "shared" # 명시 공유
|
|
235
|
-
PUBLIC = "public" # 누구나
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def _ulid() -> str:
|
|
239
|
-
"""간이 ULID (timestamp + uuid4 base32). 외부 의존성 없이."""
|
|
240
|
-
ts = int(time.time() * 1000)
|
|
241
|
-
rand = uuid.uuid4().int & ((1 << 80) - 1)
|
|
242
|
-
encoded = (ts << 80) | rand
|
|
243
|
-
chars = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" # Crockford
|
|
244
|
-
out: List[str] = []
|
|
245
|
-
for _ in range(26):
|
|
246
|
-
encoded, r = divmod(encoded, 32)
|
|
247
|
-
out.append(chars[r])
|
|
248
|
-
return "".join(reversed(out))
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
def _now_iso() -> str:
|
|
252
|
-
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
def encode_embedding(vec: Sequence[float]) -> Optional[bytes]:
|
|
256
|
-
"""list[float] → SQLite BLOB. ``None`` 입력은 None 반환."""
|
|
257
|
-
if vec is None:
|
|
258
|
-
return None
|
|
259
|
-
if len(vec) != EMBED_DIM:
|
|
260
|
-
raise ValueError(
|
|
261
|
-
f"embedding dim mismatch: got {len(vec)}, expected {EMBED_DIM} "
|
|
262
|
-
f"(set LATTICEAI_EMBED_DIM to override)"
|
|
263
|
-
)
|
|
264
|
-
return struct.pack(f"<{EMBED_DIM}f", *vec)
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
def decode_embedding(blob: Optional[bytes]) -> Optional[List[float]]:
|
|
268
|
-
if not blob:
|
|
269
|
-
return None
|
|
270
|
-
return list(struct.unpack(f"<{EMBED_DIM}f", blob))
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
def cosine(a: Sequence[float], b: Sequence[float]) -> float:
|
|
274
|
-
"""단순 코사인 유사도. numpy 없이."""
|
|
275
|
-
if not a or not b:
|
|
276
|
-
return 0.0
|
|
277
|
-
s = sum(x * y for x, y in zip(a, b))
|
|
278
|
-
na = sum(x * x for x in a) ** 0.5
|
|
279
|
-
nb = sum(y * y for y in b) ** 0.5
|
|
280
|
-
return s / (na * nb) if na and nb else 0.0
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
@dataclass
|
|
284
|
-
class Node:
|
|
285
|
-
"""PPT 슬라이드 20 의 노드 정의."""
|
|
286
|
-
type: NodeType
|
|
287
|
-
label: str
|
|
288
|
-
id: str = field(default_factory=lambda: f"node:{_ulid()}")
|
|
289
|
-
attrs: Dict[str, Any] = field(default_factory=dict)
|
|
290
|
-
embedding: Optional[List[float]] = None
|
|
291
|
-
owner_id: Optional[str] = None
|
|
292
|
-
visibility: Visibility = Visibility.PRIVATE
|
|
293
|
-
created_at: str = field(default_factory=_now_iso)
|
|
294
|
-
updated_at: str = field(default_factory=_now_iso)
|
|
295
|
-
style: Optional[str] = None
|
|
296
|
-
tone: Optional[str] = None
|
|
297
|
-
importance_score: float = 0.0
|
|
298
|
-
last_used: Optional[str] = None
|
|
299
|
-
|
|
300
|
-
def validate(self) -> None:
|
|
301
|
-
if not isinstance(self.type, NodeType):
|
|
302
|
-
raise TypeError(f"Node.type must be NodeType, got {type(self.type)!r}")
|
|
303
|
-
if not self.label or not self.label.strip():
|
|
304
|
-
raise ValueError("Node.label is required and non-empty")
|
|
305
|
-
if len(self.label) > 240:
|
|
306
|
-
raise ValueError("Node.label max length is 240 chars")
|
|
307
|
-
if not isinstance(self.attrs, dict):
|
|
308
|
-
raise TypeError("Node.attrs must be a dict")
|
|
309
|
-
if not isinstance(self.visibility, Visibility):
|
|
310
|
-
raise TypeError("Node.visibility must be Visibility enum")
|
|
311
|
-
if self.embedding is not None and len(self.embedding) != EMBED_DIM:
|
|
312
|
-
raise ValueError(f"Node.embedding dim must be {EMBED_DIM}")
|
|
313
|
-
|
|
314
|
-
def to_json(self) -> Dict[str, Any]:
|
|
315
|
-
d = asdict(self)
|
|
316
|
-
d["type"] = self.type.value
|
|
317
|
-
d["visibility"] = self.visibility.value
|
|
318
|
-
# embedding 은 JSON 직렬화시 length 만 노출 (가독성)
|
|
319
|
-
if self.embedding is not None:
|
|
320
|
-
d["embedding"] = f"[…{len(self.embedding)} dims]"
|
|
321
|
-
return d
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
@dataclass
|
|
325
|
-
class Edge:
|
|
326
|
-
"""PPT 슬라이드 21 의 엣지 정의."""
|
|
327
|
-
source: str
|
|
328
|
-
target: str
|
|
329
|
-
type: EdgeType
|
|
330
|
-
id: str = field(default_factory=lambda: f"edge:{_ulid()}")
|
|
331
|
-
weight: float = 1.0 # 강도 0..1
|
|
332
|
-
confidence: float = 1.0 # 추출 신뢰도 0..1
|
|
333
|
-
evidence: List[str] = field(default_factory=list) # 근거(노드/청크 ID)
|
|
334
|
-
created_by: str = "user" # extractor 식별자
|
|
335
|
-
created_at: str = field(default_factory=_now_iso)
|
|
336
|
-
|
|
337
|
-
def validate(self) -> None:
|
|
338
|
-
if not isinstance(self.type, EdgeType):
|
|
339
|
-
raise TypeError("Edge.type must be EdgeType")
|
|
340
|
-
if not self.source or not self.target:
|
|
341
|
-
raise ValueError("Edge.source and Edge.target are required")
|
|
342
|
-
if self.source == self.target and self.type is not EdgeType.SIMILAR_TO:
|
|
343
|
-
# SIMILAR_TO 외에는 자기참조 금지
|
|
344
|
-
raise ValueError(f"self-loop not allowed for {self.type.value}")
|
|
345
|
-
if not (0.0 <= self.weight <= 1.0):
|
|
346
|
-
raise ValueError("Edge.weight must be in [0, 1]")
|
|
347
|
-
if not (0.0 <= self.confidence <= 1.0):
|
|
348
|
-
raise ValueError("Edge.confidence must be in [0, 1]")
|
|
349
|
-
|
|
350
|
-
def to_json(self) -> Dict[str, Any]:
|
|
351
|
-
d = asdict(self)
|
|
352
|
-
d["type"] = self.type.value
|
|
353
|
-
return d
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
def validate_endpoints(edge_type: EdgeType, src_type: NodeType, tgt_type: NodeType) -> None:
|
|
357
|
-
"""엣지가 허용된 source/target 타입을 잇고 있는지 검증."""
|
|
358
|
-
rule = EDGE_ENDPOINT_RULES.get(edge_type)
|
|
359
|
-
if rule is None:
|
|
360
|
-
return
|
|
361
|
-
src_allowed, tgt_allowed = rule
|
|
362
|
-
if src_allowed is not None and src_type not in src_allowed:
|
|
363
|
-
raise ValueError(
|
|
364
|
-
f"{edge_type.value}: source must be one of "
|
|
365
|
-
f"{[t.value for t in src_allowed]}, got {src_type.value}"
|
|
366
|
-
)
|
|
367
|
-
if tgt_allowed is not None and tgt_type not in tgt_allowed:
|
|
368
|
-
raise ValueError(
|
|
369
|
-
f"{edge_type.value}: target must be one of "
|
|
370
|
-
f"{[t.value for t in tgt_allowed]}, got {tgt_type.value}"
|
|
371
|
-
)
|
|
372
|
-
|
|
373
|
-
|
|
374
259
|
# ── SQLite v2 store ─────────────────────────────────────────────────────────
|
|
375
260
|
SCHEMA_SQL = """
|
|
376
261
|
CREATE TABLE IF NOT EXISTS kg_meta (
|
|
@@ -381,7 +266,9 @@ CREATE TABLE IF NOT EXISTS kg_meta (
|
|
|
381
266
|
CREATE TABLE IF NOT EXISTS nodes_v2 (
|
|
382
267
|
id TEXT PRIMARY KEY,
|
|
383
268
|
type TEXT NOT NULL,
|
|
269
|
+
legacy_type TEXT,
|
|
384
270
|
label TEXT NOT NULL,
|
|
271
|
+
summary TEXT,
|
|
385
272
|
attrs TEXT NOT NULL DEFAULT '{}',
|
|
386
273
|
embedding BLOB,
|
|
387
274
|
owner_id TEXT,
|
|
@@ -399,31 +286,54 @@ CREATE TABLE IF NOT EXISTS edges_v2 (
|
|
|
399
286
|
source TEXT NOT NULL,
|
|
400
287
|
target TEXT NOT NULL,
|
|
401
288
|
type TEXT NOT NULL,
|
|
289
|
+
legacy_type TEXT NOT NULL DEFAULT '',
|
|
402
290
|
weight REAL NOT NULL DEFAULT 1.0,
|
|
403
291
|
confidence REAL NOT NULL DEFAULT 1.0,
|
|
404
292
|
evidence TEXT NOT NULL DEFAULT '[]',
|
|
293
|
+
metadata TEXT NOT NULL DEFAULT '{}',
|
|
405
294
|
created_by TEXT NOT NULL DEFAULT 'user',
|
|
406
295
|
created_at TEXT NOT NULL,
|
|
407
|
-
|
|
296
|
+
-- Edge identity follows the *raw* legacy type, not the normalized type:
|
|
297
|
+
-- two distinct legacy types between the same pair (e.g. "mentions" and
|
|
298
|
+
-- "관련됨") must stay distinct edges even though both normalize to MENTIONS.
|
|
299
|
+
UNIQUE(source, target, legacy_type),
|
|
408
300
|
FOREIGN KEY(source) REFERENCES nodes_v2(id) ON DELETE CASCADE,
|
|
409
301
|
FOREIGN KEY(target) REFERENCES nodes_v2(id) ON DELETE CASCADE
|
|
410
302
|
);
|
|
411
303
|
|
|
412
304
|
CREATE INDEX IF NOT EXISTS idx_nodes_v2_type ON nodes_v2(type);
|
|
305
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_v2_legacy ON nodes_v2(legacy_type);
|
|
413
306
|
CREATE INDEX IF NOT EXISTS idx_nodes_v2_owner ON nodes_v2(owner_id);
|
|
414
307
|
CREATE INDEX IF NOT EXISTS idx_edges_v2_source ON edges_v2(source);
|
|
415
308
|
CREATE INDEX IF NOT EXISTS idx_edges_v2_target ON edges_v2(target);
|
|
416
309
|
CREATE INDEX IF NOT EXISTS idx_edges_v2_type ON edges_v2(type);
|
|
310
|
+
CREATE INDEX IF NOT EXISTS idx_edges_v2_legacy ON edges_v2(legacy_type);
|
|
417
311
|
"""
|
|
418
312
|
|
|
419
313
|
|
|
314
|
+
def _exec_script(conn: sqlite3.Connection, script: str) -> None:
|
|
315
|
+
"""Run a multi-statement SQL script on ``conn`` statement-by-statement.
|
|
316
|
+
|
|
317
|
+
Unlike ``sqlite3.Connection.executescript``, this does NOT issue an implicit
|
|
318
|
+
COMMIT before running, so the statements join the caller's open transaction.
|
|
319
|
+
Safe for our schema/view DDL (no ``;`` inside string literals).
|
|
320
|
+
"""
|
|
321
|
+
for stmt in script.split(";"):
|
|
322
|
+
s = stmt.strip()
|
|
323
|
+
if s:
|
|
324
|
+
conn.execute(s)
|
|
325
|
+
|
|
326
|
+
|
|
420
327
|
class KGStoreV2:
|
|
421
|
-
"""가벼운 SQLite 기반 v2
|
|
422
|
-
|
|
328
|
+
"""가벼운 SQLite 기반 v2 스토어 — **스키마/초기화 지원 전용**.
|
|
329
|
+
|
|
330
|
+
``init_schema`` 으로 ``nodes_v2``/``edges_v2`` 를 생성·heal 하고 ``stats`` 로
|
|
331
|
+
집계를 노출한다. 데이터 read/write 는 ``knowledge_graph.KnowledgeGraphStore``
|
|
332
|
+
프로젝션이 담당하므로 native upsert/get/search API 는 두지 않는다.
|
|
333
|
+
"""
|
|
423
334
|
|
|
424
335
|
def __init__(self, db_path: str):
|
|
425
336
|
self.db_path = db_path
|
|
426
|
-
self._has_vec: Optional[bool] = None
|
|
427
337
|
|
|
428
338
|
@contextmanager
|
|
429
339
|
def _conn(self):
|
|
@@ -439,11 +349,11 @@ class KGStoreV2:
|
|
|
439
349
|
# Columns the current code writes; used to detect schema-evolution drift in
|
|
440
350
|
# v2 tables that an older ``CREATE TABLE IF NOT EXISTS`` left behind.
|
|
441
351
|
_V2_EXPECTED_COLUMNS = {
|
|
442
|
-
"edges_v2": {"id", "source", "target", "type", "
|
|
443
|
-
"evidence", "created_by", "created_at"},
|
|
444
|
-
"nodes_v2": {"id", "type", "
|
|
445
|
-
"
|
|
446
|
-
"importance_score", "last_used"},
|
|
352
|
+
"edges_v2": {"id", "source", "target", "type", "legacy_type", "weight",
|
|
353
|
+
"confidence", "evidence", "metadata", "created_by", "created_at"},
|
|
354
|
+
"nodes_v2": {"id", "type", "legacy_type", "label", "summary", "attrs",
|
|
355
|
+
"embedding", "owner_id", "visibility", "created_at",
|
|
356
|
+
"updated_at", "style", "tone", "importance_score", "last_used"},
|
|
447
357
|
}
|
|
448
358
|
|
|
449
359
|
def _drop_stale_empty_v2_tables(self, conn: sqlite3.Connection) -> None:
|
|
@@ -476,161 +386,31 @@ class KGStoreV2:
|
|
|
476
386
|
table, sorted(missing), count,
|
|
477
387
|
)
|
|
478
388
|
|
|
479
|
-
def init_schema(self) -> None:
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
def
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
type=excluded.type,
|
|
505
|
-
label=excluded.label,
|
|
506
|
-
attrs=excluded.attrs,
|
|
507
|
-
embedding=COALESCE(excluded.embedding, nodes_v2.embedding),
|
|
508
|
-
owner_id=excluded.owner_id,
|
|
509
|
-
visibility=excluded.visibility,
|
|
510
|
-
updated_at=excluded.updated_at,
|
|
511
|
-
style=COALESCE(excluded.style, nodes_v2.style),
|
|
512
|
-
tone=COALESCE(excluded.tone, nodes_v2.tone),
|
|
513
|
-
importance_score=MAX(excluded.importance_score, nodes_v2.importance_score),
|
|
514
|
-
last_used=COALESCE(excluded.last_used, nodes_v2.last_used)
|
|
515
|
-
""",
|
|
516
|
-
(
|
|
517
|
-
node.id, node.type.value, node.label,
|
|
518
|
-
json.dumps(node.attrs, ensure_ascii=False),
|
|
519
|
-
encode_embedding(node.embedding),
|
|
520
|
-
node.owner_id, node.visibility.value,
|
|
521
|
-
node.created_at, node.updated_at,
|
|
522
|
-
node.style, node.tone,
|
|
523
|
-
float(node.importance_score), node.last_used,
|
|
524
|
-
),
|
|
525
|
-
)
|
|
526
|
-
return node.id
|
|
527
|
-
|
|
528
|
-
def upsert_edge(self, edge: Edge, *, check_endpoints: bool = True) -> str:
|
|
529
|
-
edge.validate()
|
|
530
|
-
if check_endpoints:
|
|
531
|
-
src = self.get_node(edge.source)
|
|
532
|
-
tgt = self.get_node(edge.target)
|
|
533
|
-
if src is None or tgt is None:
|
|
534
|
-
raise ValueError("Edge endpoints must exist as nodes")
|
|
535
|
-
validate_endpoints(edge.type, src.type, tgt.type)
|
|
536
|
-
with self._conn() as conn:
|
|
537
|
-
conn.execute(
|
|
538
|
-
"""
|
|
539
|
-
INSERT INTO edges_v2(id, source, target, type, weight,
|
|
540
|
-
confidence, evidence, created_by, created_at)
|
|
541
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
542
|
-
ON CONFLICT(source, target, type) DO UPDATE SET
|
|
543
|
-
weight=excluded.weight,
|
|
544
|
-
confidence=excluded.confidence,
|
|
545
|
-
evidence=excluded.evidence,
|
|
546
|
-
created_by=excluded.created_by
|
|
547
|
-
""",
|
|
548
|
-
(
|
|
549
|
-
edge.id, edge.source, edge.target, edge.type.value,
|
|
550
|
-
float(edge.weight), float(edge.confidence),
|
|
551
|
-
json.dumps(edge.evidence, ensure_ascii=False),
|
|
552
|
-
edge.created_by, edge.created_at,
|
|
553
|
-
),
|
|
554
|
-
)
|
|
555
|
-
return edge.id
|
|
556
|
-
|
|
557
|
-
# ── Read ─────────────────────────────────────────────────
|
|
558
|
-
def get_node(self, node_id: str) -> Optional[Node]:
|
|
559
|
-
with self._conn() as conn:
|
|
560
|
-
row = conn.execute(
|
|
561
|
-
"SELECT * FROM nodes_v2 WHERE id = ?", (node_id,)
|
|
562
|
-
).fetchone()
|
|
563
|
-
return _row_to_node(row) if row else None
|
|
564
|
-
|
|
565
|
-
def list_nodes(self, *, type: Optional[NodeType] = None,
|
|
566
|
-
owner_id: Optional[str] = None,
|
|
567
|
-
limit: int = 100) -> List[Node]:
|
|
568
|
-
sql = "SELECT * FROM nodes_v2 WHERE 1=1"
|
|
569
|
-
args: List[Any] = []
|
|
570
|
-
if type is not None:
|
|
571
|
-
sql += " AND type = ?"
|
|
572
|
-
args.append(type.value)
|
|
573
|
-
if owner_id is not None:
|
|
574
|
-
sql += " AND owner_id = ?"
|
|
575
|
-
args.append(owner_id)
|
|
576
|
-
sql += " ORDER BY updated_at DESC LIMIT ?"
|
|
577
|
-
args.append(int(limit))
|
|
578
|
-
with self._conn() as conn:
|
|
579
|
-
rows = conn.execute(sql, args).fetchall()
|
|
580
|
-
return [_row_to_node(r) for r in rows]
|
|
581
|
-
|
|
582
|
-
def neighbors(self, node_id: str, *,
|
|
583
|
-
edge_type: Optional[EdgeType] = None,
|
|
584
|
-
direction: str = "both",
|
|
585
|
-
limit: int = 50) -> List[Tuple[Edge, Node]]:
|
|
586
|
-
"""node_id 에 인접한 (edge, other_node) 페어를 반환."""
|
|
587
|
-
if direction not in ("out", "in", "both"):
|
|
588
|
-
raise ValueError("direction must be 'out' | 'in' | 'both'")
|
|
589
|
-
clauses, args = [], []
|
|
590
|
-
if direction in ("out", "both"):
|
|
591
|
-
clauses.append("source = ?"); args.append(node_id)
|
|
592
|
-
if direction in ("in", "both"):
|
|
593
|
-
clauses.append("target = ?"); args.append(node_id)
|
|
594
|
-
sql = f"SELECT * FROM edges_v2 WHERE ({' OR '.join(clauses)})"
|
|
595
|
-
if edge_type:
|
|
596
|
-
sql += " AND type = ?"; args.append(edge_type.value)
|
|
597
|
-
sql += " ORDER BY weight DESC, confidence DESC LIMIT ?"
|
|
598
|
-
args.append(int(limit))
|
|
599
|
-
with self._conn() as conn:
|
|
600
|
-
edges = [_row_to_edge(r) for r in conn.execute(sql, args).fetchall()]
|
|
601
|
-
out: List[Tuple[Edge, Node]] = []
|
|
602
|
-
for e in edges:
|
|
603
|
-
other_id = e.target if e.source == node_id else e.source
|
|
604
|
-
row = conn.execute(
|
|
605
|
-
"SELECT * FROM nodes_v2 WHERE id = ?", (other_id,)
|
|
606
|
-
).fetchone()
|
|
607
|
-
if row:
|
|
608
|
-
out.append((e, _row_to_node(row)))
|
|
609
|
-
return out
|
|
610
|
-
|
|
611
|
-
def search_similar(self, vec: Sequence[float], *,
|
|
612
|
-
top_k: int = 8,
|
|
613
|
-
type: Optional[NodeType] = None,
|
|
614
|
-
owner_id: Optional[str] = None) -> List[Tuple[Node, float]]:
|
|
615
|
-
"""코사인 기반 semantic search. sqlite-vec 가 없을 때의 폴백 구현."""
|
|
616
|
-
if len(vec) != EMBED_DIM:
|
|
617
|
-
raise ValueError(f"query embedding dim must be {EMBED_DIM}")
|
|
618
|
-
sql = "SELECT * FROM nodes_v2 WHERE embedding IS NOT NULL"
|
|
619
|
-
args: List[Any] = []
|
|
620
|
-
if type is not None:
|
|
621
|
-
sql += " AND type = ?"; args.append(type.value)
|
|
622
|
-
if owner_id is not None:
|
|
623
|
-
sql += " AND owner_id = ?"; args.append(owner_id)
|
|
624
|
-
with self._conn() as conn:
|
|
625
|
-
rows = conn.execute(sql, args).fetchall()
|
|
626
|
-
scored = []
|
|
627
|
-
for r in rows:
|
|
628
|
-
emb = decode_embedding(r["embedding"])
|
|
629
|
-
if emb is None:
|
|
630
|
-
continue
|
|
631
|
-
scored.append((_row_to_node(r), cosine(vec, emb)))
|
|
632
|
-
scored.sort(key=lambda x: x[1], reverse=True)
|
|
633
|
-
return scored[:top_k]
|
|
389
|
+
def init_schema(self, conn: Optional[sqlite3.Connection] = None) -> None:
|
|
390
|
+
"""Create the v2 schema and record metadata.
|
|
391
|
+
|
|
392
|
+
Pass ``conn`` to run inside the caller's open transaction (used by the
|
|
393
|
+
atomic knowledge_graph migration); otherwise a private connection is
|
|
394
|
+
opened and committed. Uses ``_exec_script`` rather than
|
|
395
|
+
``executescript`` so it never force-commits the caller's transaction.
|
|
396
|
+
"""
|
|
397
|
+
if conn is not None:
|
|
398
|
+
self._init_schema_on(conn)
|
|
399
|
+
return
|
|
400
|
+
with self._conn() as own:
|
|
401
|
+
self._init_schema_on(own)
|
|
402
|
+
|
|
403
|
+
def _init_schema_on(self, conn: sqlite3.Connection) -> None:
|
|
404
|
+
self._drop_stale_empty_v2_tables(conn)
|
|
405
|
+
_exec_script(conn, SCHEMA_SQL)
|
|
406
|
+
conn.execute(
|
|
407
|
+
"INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
|
|
408
|
+
("schema_version", str(KG_SCHEMA_V2_VERSION)),
|
|
409
|
+
)
|
|
410
|
+
conn.execute(
|
|
411
|
+
"INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
|
|
412
|
+
("embed_dim", str(EMBED_DIM)),
|
|
413
|
+
)
|
|
634
414
|
|
|
635
415
|
# ── Maintenance ──────────────────────────────────────────
|
|
636
416
|
def stats(self) -> Dict[str, Any]:
|
|
@@ -659,123 +439,12 @@ class KGStoreV2:
|
|
|
659
439
|
}
|
|
660
440
|
|
|
661
441
|
|
|
662
|
-
#
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
label=row["label"],
|
|
669
|
-
attrs=json.loads(row["attrs"] or "{}"),
|
|
670
|
-
embedding=decode_embedding(row["embedding"]),
|
|
671
|
-
owner_id=row["owner_id"],
|
|
672
|
-
visibility=Visibility(row["visibility"]),
|
|
673
|
-
created_at=row["created_at"],
|
|
674
|
-
updated_at=row["updated_at"],
|
|
675
|
-
style=row["style"] if "style" in keys else None,
|
|
676
|
-
tone=row["tone"] if "tone" in keys else None,
|
|
677
|
-
importance_score=float(row["importance_score"]) if "importance_score" in keys else 0.0,
|
|
678
|
-
last_used=row["last_used"] if "last_used" in keys else None,
|
|
679
|
-
)
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
def _row_to_edge(row: sqlite3.Row) -> Edge:
    """Build an ``Edge`` from one ``edges_v2`` result row.

    ``type`` is converted to ``EdgeType``, ``weight`` and ``confidence`` to
    ``float``, and ``evidence`` is JSON-decoded (a NULL/empty column is read
    as ``"[]"``). All other columns are passed through unchanged.
    """
    # Columns are read and converted in the same order as the keyword
    # arguments so a malformed row fails at the same field as before.
    edge_id = row["id"]
    src = row["source"]
    dst = row["target"]
    edge_type = EdgeType(row["type"])
    weight = float(row["weight"])
    confidence = float(row["confidence"])
    evidence = json.loads(row["evidence"] or "[]")
    author = row["created_by"]
    created = row["created_at"]
    return Edge(
        id=edge_id,
        source=src,
        target=dst,
        type=edge_type,
        weight=weight,
        confidence=confidence,
        evidence=evidence,
        created_by=author,
        created_at=created,
    )
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
# ── Migration: legacy (nodes/edges) → v2 (nodes_v2/edges_v2) ───────────────
|
|
697
|
-
def migrate_legacy_to_v2(db_path: str, *, dry_run: bool = False) -> Dict[str, int]:
    """Copy the legacy ``nodes`` / ``edges`` tables into ``nodes_v2`` / ``edges_v2``.

    Args:
        db_path: Path of the SQLite database holding the legacy tables.
        dry_run: When true, only count the legacy rows. The v2 store is not
            created, the v2 schema is not initialized, and nothing is written.

    Returns:
        Counters keyed ``nodes_seen``, ``nodes_migrated``, ``edges_seen``,
        ``edges_migrated`` and ``edges_skipped``.

    A SELECT that raises ``sqlite3.OperationalError`` (e.g. the legacy table
    does not exist) is treated as an empty table. Rows whose upsert raises are
    reported through ``_log_skip`` and skipped.
    """
    # Local import keeps this block self-contained.
    from contextlib import closing

    counters = {
        "nodes_seen": 0,
        "nodes_migrated": 0,
        "edges_seen": 0,
        "edges_migrated": 0,
        "edges_skipped": 0,
    }

    # A dry run must not touch the database, so the store (and the schema
    # initialization it performs) is only set up for a real migration.
    store = None
    if not dry_run:
        store = KGStoreV2(db_path)
        store.init_schema()

    # ``with sqlite3.connect(...)`` only manages the transaction; ``closing``
    # is what actually releases the connection when the block exits.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row

        # nodes
        try:
            old_nodes = conn.execute(
                "SELECT id, type, title, summary, metadata_json, created_at, updated_at FROM nodes"
            ).fetchall()
        except sqlite3.OperationalError:
            old_nodes = []
        for r in old_nodes:
            counters["nodes_seen"] += 1
            if dry_run:
                continue
            node = Node(
                id=r["id"],
                type=NodeType.from_legacy(r["type"]),
                label=r["title"] or "(untitled)",
                attrs=_safe_json(r["metadata_json"]),
                created_at=r["created_at"] or _now_iso(),
                updated_at=r["updated_at"] or _now_iso(),
            )
            try:
                store.upsert_node(node)
                counters["nodes_migrated"] += 1
            except Exception as exc:
                # Skip malformed legacy rows instead of aborting the run.
                _log_skip("node", r["id"], exc)

        # edges
        try:
            old_edges = conn.execute(
                "SELECT id, from_node, to_node, type, weight, metadata_json, created_at FROM edges"
            ).fetchall()
        except sqlite3.OperationalError:
            old_edges = []
        for r in old_edges:
            counters["edges_seen"] += 1
            if dry_run:
                continue
            meta = _safe_json(r["metadata_json"])
            edge = Edge(
                id=r["id"],
                source=r["from_node"],
                target=r["to_node"],
                type=EdgeType.from_legacy(r["type"]),
                weight=float(r["weight"] or 1.0),
                confidence=float(meta.get("confidence", 1.0)),
                evidence=list(meta.get("evidence", []) or []),
                created_by=str(meta.get("created_by", "legacy")),
                created_at=r["created_at"] or _now_iso(),
            )
            try:
                # Endpoint checks are disabled for legacy edges.
                store.upsert_edge(edge, check_endpoints=False)
                counters["edges_migrated"] += 1
            except Exception as exc:
                counters["edges_skipped"] += 1
                _log_skip("edge", r["id"], exc)
    return counters
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
def _safe_json(raw: Optional[str]) -> Dict[str, Any]:
|
|
767
|
-
if not raw:
|
|
768
|
-
return {}
|
|
769
|
-
try:
|
|
770
|
-
v = json.loads(raw)
|
|
771
|
-
return v if isinstance(v, dict) else {"_raw": v}
|
|
772
|
-
except (ValueError, TypeError):
|
|
773
|
-
return {"_raw": raw}
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
def _log_skip(kind: str, ident: str, exc: Exception) -> None:
|
|
777
|
-
# 의도적으로 print: 마이그레이션은 일회성이라 로깅 인프라 의존 안 함
|
|
778
|
-
print(f"[migrate] skip {kind} {ident}: {exc}")
|
|
442
|
+
# NOTE: legacy → v2 reprojection lives in ``knowledge_graph.py``
|
|
443
|
+
# (``KnowledgeGraphStore._backfill_v2_if_needed`` / ``_v2_project_node`` / ``_v2_project_edge``),
|
|
444
|
+
# which is the single live, version-gated migration path. The old standalone
|
|
445
|
+
# ``migrate_legacy_to_v2()`` helper + CLI ``migrate`` subcommand were removed as
|
|
446
|
+
# dead code (no callers); the normalized projection now writes the first-class
|
|
447
|
+
# ``legacy_type``/``summary``/``metadata`` columns directly.
|
|
779
448
|
|
|
780
449
|
|
|
781
450
|
# ── CLI ────────────────────────────────────────────────────────────────────
|
|
@@ -788,10 +457,6 @@ def _cli() -> int:
|
|
|
788
457
|
sub_init = sub.add_parser("init", help="initialize v2 schema in a DB")
|
|
789
458
|
sub_init.add_argument("db", help="path to sqlite db")
|
|
790
459
|
|
|
791
|
-
sub_mig = sub.add_parser("migrate", help="migrate legacy nodes/edges → v2")
|
|
792
|
-
sub_mig.add_argument("db", help="path to sqlite db")
|
|
793
|
-
sub_mig.add_argument("--dry-run", action="store_true")
|
|
794
|
-
|
|
795
460
|
sub_stats = sub.add_parser("stats", help="print store statistics")
|
|
796
461
|
sub_stats.add_argument("db", help="path to sqlite db")
|
|
797
462
|
|
|
@@ -800,10 +465,6 @@ def _cli() -> int:
|
|
|
800
465
|
KGStoreV2(args.db).init_schema()
|
|
801
466
|
print(f"initialized v2 schema in {args.db}")
|
|
802
467
|
return 0
|
|
803
|
-
if args.cmd == "migrate":
|
|
804
|
-
out = migrate_legacy_to_v2(args.db, dry_run=args.dry_run)
|
|
805
|
-
print(json.dumps(out, indent=2, ensure_ascii=False))
|
|
806
|
-
return 0
|
|
807
468
|
if args.cmd == "stats":
|
|
808
469
|
print(json.dumps(KGStoreV2(args.db).stats(), indent=2, ensure_ascii=False))
|
|
809
470
|
return 0
|