ltcai 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/kg_schema.py CHANGED
@@ -6,51 +6,37 @@ Lattice AI — Knowledge Graph v2 schema (PPT spec aligned)
6
6
 
7
7
  목적
8
8
  ----
9
- 기존 ``knowledge_graph.py`` 의 자유 문자열 노드/엣지 타입을 **명시 enum + Pydantic
10
- 모델 + SQLite v2 스키마** 로 정식화한다. embedding · confidence · evidence ·
11
- owner/visibility · createdBy 필드를 1급 시민으로 승격해서, semantic search
12
- (SIMILAR_TO 엣지 추론) multi-tenant 권한 모델의 기반을 만든다.
9
+ 기존 ``knowledge_graph.py`` 의 자유 문자열 노드/엣지 타입을 **명시 enum + SQLite v2
10
+ 스키마** 로 정식화한다. 모듈은 **스키마/초기화/프로젝션 지원** 역할만 담당한다:
11
+ ``NodeType``/``EdgeType`` taxonomy + legacy 정규화 매핑, ``nodes_v2``/``edges_v2``
12
+ DDL(``SCHEMA_SQL``), 그리고 ``KGStoreV2``(스키마 init·heal·stats).
13
+
14
+ 실제 데이터 read/write 는 ``knowledge_graph.py`` 의 ``KnowledgeGraphStore`` 가
15
+ legacy 테이블에 대한 dual-write 프로젝션(raw SQL) + ``kgv2_*`` 재구성 뷰로 수행한다.
16
+ (과거의 native ``Node``/``Edge`` 모델과 ``KGStoreV2.upsert_*``/``get_node``/
17
+ ``search_*`` API 는 production 에서 쓰이지 않아 제거되었다.)
13
18
 
14
19
  설계 원칙
15
20
  ---------
16
21
  1. **기존 코드를 깨지 않는다**: 새 테이블 이름은 ``nodes_v2`` / ``edges_v2``
17
- 로 분리. 기존 ``nodes`` / ``edges`` 와 공존한다. 마이그레이션은 별도
18
- 유틸리티(`migrate_legacy_to_v2()`) 수행.
19
- 2. **표준 라이브러리만 사용**: Pydantic 없는 환경에서도 dataclass 로
20
- 동작하도록 ``from dataclasses import dataclass`` 사용한다.
21
- 타입 검증은 ``validate()`` 메서드에서 수동.
22
- 3. **embedding 옵셔널이지만 권장**: 차원은 환경 변수
23
- ``LATTICEAI_EMBED_DIM`` (기본 1024). bytes blob 으로 저장.
24
- 4. **마이그레이션 매핑은 명시적**: 한글 동사 → 영문 enum 표가 코드 안에 들어
25
- 있어서 어떤 옛 라벨이 어디로 매핑되는지 한눈에 보인다.
22
+ 로 분리. 기존 ``nodes`` / ``edges`` 와 공존한다. legacy → v2 reprojection 은
23
+ ``knowledge_graph.py`` 버전 게이트 백필 한 곳에서만 수행한다.
24
+ 2. **정규화 + 무손실**: legacy 자유 문자열 타입은 ``NodeType``/``EdgeType``
25
+ superset 으로 정규화해 ``type`` 칼럼에 저장하고, 원본 문자열은 ``legacy_type``
26
+ 칼럼에 그대로 보존한다. summary 와 metadata 는 ``attrs._kg`` 패스스루 blob 이
27
+ 아니라 전용 ``summary`` 칼럼 / ``attrs``·``metadata`` 칼럼에 1급으로 저장한다.
28
+ 3. **표준 라이브러리만 사용**: 외부 의존성 없이 ``sqlite3`` 만으로 동작한다.
29
+ 4. **정규화 매핑은 명시적**: 한글 동사/legacy 라벨 → 영문 enum 표가 코드 안에
30
+ 들어 있어서 어떤 옛 라벨이 어디로 매핑되는지 한눈에 보인다.
26
31
 
27
32
  사용 예
28
33
  -------
29
34
  ```python
30
- from kg_schema import (
31
- KGStoreV2, Node, Edge, NodeType, EdgeType,
32
- )
35
+ from kg_schema import KGStoreV2
33
36
 
34
37
  store = KGStoreV2("/Users/me/.ltcai/kg_v2.db")
35
- store.init_schema()
36
-
37
- n1 = Node(
38
- type=NodeType.FILE,
39
- label="LatticeAI_기획서.pdf",
40
- attrs={"mime": "application/pdf", "pageCount": 24, "lang": "ko"},
41
- owner_id="user_seoljun",
42
- )
43
- n2 = Node(type=NodeType.CONCEPT, label="MCP")
44
- store.upsert_node(n1)
45
- store.upsert_node(n2)
46
-
47
- store.upsert_edge(Edge(
48
- source=n1.id, target=n2.id,
49
- type=EdgeType.MENTIONS,
50
- weight=0.82, confidence=0.91,
51
- evidence=["chunk:01HX7K…#p3", "chunk:01HX7K…#p11"],
52
- created_by="extractor:llm-gemma-3-12b",
53
- ))
38
+ store.init_schema() # nodes_v2 / edges_v2 생성 + 컬럼 drift self-heal
39
+ print(store.stats()) # {"nodes": ..., "by_node_type": {...}, ...}
54
40
  ```
55
41
  """
56
42
 
@@ -58,17 +44,11 @@ from __future__ import annotations
58
44
 
59
45
  import json
60
46
  import os
61
- import re
62
47
  import logging
63
48
  import sqlite3
64
- import struct
65
- import time
66
- import uuid
67
49
  from contextlib import contextmanager
68
- from dataclasses import dataclass, field, asdict
69
- from datetime import datetime, timezone
70
50
  from enum import Enum
71
- from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
51
+ from typing import Any, Dict, Optional
72
52
 
73
53
 
74
54
  # ── Schema version ──────────────────────────────────────────────────────────
@@ -78,7 +58,15 @@ EMBED_DIM = int(os.getenv("LATTICEAI_EMBED_DIM", "1024"))
78
58
 
79
59
  # ── Node / Edge taxonomy (PPT 슬라이드 20·21) ──────────────────────────────
80
60
  class NodeType(str, Enum):
81
- """워크스페이스의 모든 ‘명사’. PPT 슬라이드 20 카탈로그."""
61
+ """워크스페이스의 모든 ‘명사’.
62
+
63
+ PPT 슬라이드 20 카탈로그(상단 그룹)에 더해, ``knowledge_graph.py`` 가 실제로
64
+ 써오던 legacy 자유 문자열 타입을 **무손실 superset**(하단 그룹)으로 1급 enum 화
65
+ 한다. 덕분에 ``from_legacy`` 정규화가 의미를 잃지 않고(예: ``Computer`` →
66
+ ``COMPUTER``), 알 수 없는/동적(이벤트) 타입만 ``CONCEPT`` 로 폴백한다.
67
+ 원본 문자열은 ``nodes_v2.legacy_type`` 에 그대로 보존되므로 정규화는 항상 무손실.
68
+ """
69
+ # PPT 슬라이드 20 정식 카탈로그
82
70
  CONVERSATION = "CONVERSATION" # 대화 세션 전체
83
71
  MESSAGE = "MESSAGE" # 단일 발화
84
72
  FILE = "FILE" # 업로드/연결된 파일
@@ -90,10 +78,35 @@ class NodeType(str, Enum):
90
78
  MODEL = "MODEL" # 로컬/원격 LLM
91
79
  TOOL = "TOOL" # MCP 서버·외부 도구
92
80
  PROJECT = "PROJECT" # 주제별 작업 공간
81
+ # legacy superset — knowledge_graph.py 가 실제로 생성하던 노드 타입들
82
+ COMPUTER = "COMPUTER" # 내 컴퓨터 (로컬 스캔 루트)
83
+ DRIVE = "DRIVE" # 드라이브 / 볼륨
84
+ FOLDER = "FOLDER" # 폴더
85
+ CODE_FILE = "CODE_FILE" # 코드 파일 (.py/.ts 등)
86
+ SPREADSHEET = "SPREADSHEET" # 엑셀 / CSV
87
+ SLIDE_DECK = "SLIDE_DECK" # 프레젠테이션
88
+ IMAGE = "IMAGE" # 이미지 파일
89
+ IMAGE_TEXT = "IMAGE_TEXT" # OCR 텍스트
90
+ SLIDE = "SLIDE" # 슬라이드 (덱의 한 장)
91
+ PAGE = "PAGE" # 페이지 (문서의 한 면)
92
+ SHEET = "SHEET" # 시트 (스프레드시트의 한 탭)
93
+ SECTION = "SECTION" # 문서 섹션
94
+ CHAT = "CHAT" # 대화 세션(채팅 UI)
95
+ AI_RESPONSE = "AI_RESPONSE" # 어시스턴트 발화
96
+ TOPIC = "TOPIC" # 주제 / 토픽
97
+ FEATURE = "FEATURE" # 소프트웨어 기능
98
+ TASK = "TASK" # 할 일
99
+ DECISION = "DECISION" # 결정 사항
100
+ ERROR = "ERROR" # 오류 / 버그
101
+ EVENT = "EVENT" # 분석/시스템 이벤트(동적 타입 폴백)
93
102
 
94
103
  @classmethod
95
104
  def from_legacy(cls, label: str) -> "NodeType":
96
- """legacy ``knowledge_graph.py`` 의 자유 문자열을 정식 enum 으로."""
105
+ """legacy ``knowledge_graph.py`` 의 자유 문자열을 정식 enum 으로 정규화.
106
+
107
+ 매핑이 없는(동적 이벤트 등) 타입은 ``CONCEPT`` 로 폴백하지만, 호출부는
108
+ 원본 문자열을 ``legacy_type`` 칼럼에 별도 보존하므로 정보 손실은 없다.
109
+ """
97
110
  m = (label or "").strip().lower()
98
111
  return _LEGACY_NODE_MAP.get(m, cls.CONCEPT)
99
112
 
@@ -116,28 +129,61 @@ class EdgeType(str, Enum):
116
129
  INSPIRED_BY = "INSPIRED_BY" # DOCUMENT → DOCUMENT (영감/참조 관계)
117
130
  CONTRADICTS = "CONTRADICTS" # DOCUMENT ↔ DOCUMENT (상충 관계)
118
131
  EVOLVES_FROM = "EVOLVES_FROM" # DOCUMENT → DOCUMENT (발전/개정 관계)
132
+ # legacy superset — knowledge_graph.py 가 실제로 생성하던 엣지 타입들
133
+ UPLOADED_BY = "UPLOADED_BY" # PERSON → FILE (업로드함)
134
+ WROTE = "WROTE" # PERSON → CONVERSATION (작성함)
135
+ HAS_EVENT = "HAS_EVENT" # CONVERSATION → EVENT (has_event)
136
+ TRIGGERED = "TRIGGERED" # PERSON → EVENT (triggered)
137
+ HAS_SLIDE = "HAS_SLIDE" # SLIDE_DECK → SLIDE (has_slide)
138
+ HAS_PAGE = "HAS_PAGE" # DOCUMENT → PAGE (has_page)
139
+ HAS_SHEET = "HAS_SHEET" # SPREADSHEET → SHEET (has_sheet)
140
+ HAS_CHUNK = "HAS_CHUNK" # FILE → CHUNK (has_chunk)
141
+ CONTAINS_IMAGE = "CONTAINS_IMAGE" # FILE → IMAGE (contains_image)
142
+ CONTAINS_SIGNAL = "CONTAINS_SIGNAL" # NODE → CONCEPT (contains_signal)
143
+ DISCUSSES = "DISCUSSES" # SLIDE/PAGE → TOPIC (discusses)
144
+ IMPLIES = "IMPLIES" # NODE → NODE (implies)
145
+ RELATED_TO = "RELATED_TO" # ANY ↔ ANY (related_to)
119
146
 
120
147
  @classmethod
121
148
  def from_legacy(cls, label: str) -> "EdgeType":
149
+ """legacy 자유 문자열/한글 동사를 정식 enum 으로 정규화.
150
+
151
+ 매핑이 없는 동적 타입은 ``MENTIONS`` 로 폴백하지만, 호출부는 원본 문자열을
152
+ ``edges_v2.legacy_type`` 에 보존하므로 정보 손실은 없다.
153
+ """
122
154
  m = (label or "").strip().lower()
123
155
  return _LEGACY_EDGE_MAP.get(m, cls.MENTIONS)
124
156
 
125
157
 
126
- # legacy(자유 문자열 / 한글 동사) → enum 매핑
158
+ # legacy(자유 문자열 / 한글 동사) → enum 매핑 표.
159
+ # superset 정규화: 알려진 legacy 타입은 1:1 의미 보존 매핑, 미지/동적 타입만 폴백.
127
160
  _LEGACY_NODE_MAP: Dict[str, NodeType] = {
128
161
  "conversation": NodeType.CONVERSATION,
162
+ "chat": NodeType.CHAT,
129
163
  "message": NodeType.MESSAGE,
130
- "airesponse": NodeType.MESSAGE,
164
+ "airesponse": NodeType.AI_RESPONSE,
131
165
  "file": NodeType.FILE,
132
- "document": NodeType.FILE,
133
- "page": NodeType.CHUNK,
134
- "sheet": NodeType.CHUNK,
135
- "slide": NodeType.CHUNK,
166
+ "codefile": NodeType.CODE_FILE,
167
+ "spreadsheet": NodeType.SPREADSHEET,
168
+ "slidedeck": NodeType.SLIDE_DECK,
169
+ "image": NodeType.IMAGE,
170
+ "imagetext": NodeType.IMAGE_TEXT,
171
+ "computer": NodeType.COMPUTER,
172
+ "drive": NodeType.DRIVE,
173
+ "folder": NodeType.FOLDER,
174
+ "page": NodeType.PAGE,
175
+ "sheet": NodeType.SHEET,
176
+ "slide": NodeType.SLIDE,
177
+ "section": NodeType.SECTION,
136
178
  "chunk": NodeType.CHUNK,
137
179
  "code": NodeType.CODE_SYMBOL,
138
180
  "concept": NodeType.CONCEPT,
139
- "feature": NodeType.CONCEPT,
140
- "error": NodeType.CONCEPT,
181
+ "topic": NodeType.TOPIC,
182
+ "feature": NodeType.FEATURE,
183
+ "task": NodeType.TASK,
184
+ "decision": NodeType.DECISION,
185
+ "error": NodeType.ERROR,
186
+ "event": NodeType.EVENT,
141
187
  "tag": NodeType.CONCEPT,
142
188
  "person": NodeType.PERSON,
143
189
  "user": NodeType.PERSON,
@@ -167,6 +213,8 @@ _LEGACY_EDGE_MAP: Dict[str, EdgeType] = {
167
213
  "연결함": EdgeType.REFERENCES,
168
214
  "확장함": EdgeType.DERIVED_FROM,
169
215
  "생성함": EdgeType.AUTHORED_BY,
216
+ "작성함": EdgeType.WROTE,
217
+ "업로드함": EdgeType.UPLOADED_BY,
170
218
  "대체함": EdgeType.VERSION_OF,
171
219
  "지원함": EdgeType.USES,
172
220
  "발생함": EdgeType.REFERENCES,
@@ -188,189 +236,26 @@ _LEGACY_EDGE_MAP: Dict[str, EdgeType] = {
188
236
  "inspired_by": EdgeType.INSPIRED_BY,
189
237
  "contradicts": EdgeType.CONTRADICTS,
190
238
  "evolves_from": EdgeType.EVOLVES_FROM,
239
+ # legacy superset 별칭 (knowledge_graph.py 가 실제로 쓰던 엣지 타입)
240
+ "uploaded_by": EdgeType.UPLOADED_BY,
241
+ "wrote": EdgeType.WROTE,
242
+ "has_event": EdgeType.HAS_EVENT,
243
+ "triggered": EdgeType.TRIGGERED,
244
+ "has_slide": EdgeType.HAS_SLIDE,
245
+ "has_page": EdgeType.HAS_PAGE,
246
+ "has_sheet": EdgeType.HAS_SHEET,
247
+ "has_chunk": EdgeType.HAS_CHUNK,
248
+ "contains_image": EdgeType.CONTAINS_IMAGE,
249
+ "contains_signal": EdgeType.CONTAINS_SIGNAL,
250
+ "discusses": EdgeType.DISCUSSES,
251
+ "implies": EdgeType.IMPLIES,
252
+ "related_to": EdgeType.RELATED_TO,
191
253
  "활용됨": EdgeType.USED_IN,
192
254
  "영감받음": EdgeType.INSPIRED_BY,
193
255
  "상충함": EdgeType.CONTRADICTS,
194
256
  "발전함": EdgeType.EVOLVES_FROM,
195
257
  }
196
258
 
197
- # 노드 타입별로 허용되는 source / target 조합 (PPT 카탈로그 그대로)
198
- # None == 모든 타입 허용
199
- EDGE_ENDPOINT_RULES: Dict[EdgeType, Tuple[Optional[Sequence[NodeType]], Optional[Sequence[NodeType]]]] = {
200
- EdgeType.CONTAINS: ((NodeType.FILE, NodeType.DOCUMENT),
201
- (NodeType.CHUNK,)),
202
- EdgeType.MENTIONS: ((NodeType.MESSAGE, NodeType.FILE, NodeType.CHUNK, NodeType.DOCUMENT),
203
- (NodeType.CONCEPT, NodeType.PERSON, NodeType.MODEL, NodeType.TOOL)),
204
- EdgeType.REFERENCES: ((NodeType.FILE, NodeType.MESSAGE, NodeType.CHUNK),
205
- (NodeType.FILE, NodeType.MESSAGE, NodeType.CHUNK)),
206
- EdgeType.REPLIES_TO: ((NodeType.MESSAGE,), (NodeType.MESSAGE,)),
207
- EdgeType.AUTHORED_BY: ((NodeType.FILE, NodeType.MESSAGE, NodeType.CONVERSATION, NodeType.DOCUMENT),
208
- (NodeType.PERSON,)),
209
- EdgeType.USES: ((NodeType.PROJECT, NodeType.CONVERSATION),
210
- (NodeType.TOOL, NodeType.MODEL)),
211
- EdgeType.DERIVED_FROM: ((NodeType.CHUNK, NodeType.FILE),
212
- (NodeType.CHUNK, NodeType.FILE)),
213
- EdgeType.SIMILAR_TO: (None, None),
214
- EdgeType.DEPENDS_ON: ((NodeType.CODE_SYMBOL,), (NodeType.CODE_SYMBOL,)),
215
- EdgeType.TAGGED_AS: (None, (NodeType.CONCEPT,)),
216
- EdgeType.VERSION_OF: ((NodeType.FILE,), (NodeType.FILE,)),
217
- EdgeType.GRANTS_ACCESS: ((NodeType.PERSON,),
218
- (NodeType.FILE, NodeType.CONVERSATION, NodeType.PROJECT)),
219
- EdgeType.USED_IN: ((NodeType.CONCEPT,),
220
- (NodeType.DOCUMENT, NodeType.FILE)),
221
- EdgeType.INSPIRED_BY: ((NodeType.DOCUMENT, NodeType.FILE),
222
- (NodeType.DOCUMENT, NodeType.FILE)),
223
- EdgeType.CONTRADICTS: ((NodeType.DOCUMENT, NodeType.FILE),
224
- (NodeType.DOCUMENT, NodeType.FILE)),
225
- EdgeType.EVOLVES_FROM: ((NodeType.DOCUMENT, NodeType.FILE),
226
- (NodeType.DOCUMENT, NodeType.FILE)),
227
- }
228
-
229
-
230
- # ── Models ──────────────────────────────────────────────────────────────────
231
- class Visibility(str, Enum):
232
- PRIVATE = "private" # 소유자만
233
- INTERNAL = "internal" # 같은 조직
234
- SHARED = "shared" # 명시 공유
235
- PUBLIC = "public" # 누구나
236
-
237
-
238
- def _ulid() -> str:
239
- """간이 ULID (timestamp + uuid4 base32). 외부 의존성 없이."""
240
- ts = int(time.time() * 1000)
241
- rand = uuid.uuid4().int & ((1 << 80) - 1)
242
- encoded = (ts << 80) | rand
243
- chars = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" # Crockford
244
- out: List[str] = []
245
- for _ in range(26):
246
- encoded, r = divmod(encoded, 32)
247
- out.append(chars[r])
248
- return "".join(reversed(out))
249
-
250
-
251
- def _now_iso() -> str:
252
- return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
253
-
254
-
255
- def encode_embedding(vec: Sequence[float]) -> Optional[bytes]:
256
- """list[float] → SQLite BLOB. ``None`` 입력은 None 반환."""
257
- if vec is None:
258
- return None
259
- if len(vec) != EMBED_DIM:
260
- raise ValueError(
261
- f"embedding dim mismatch: got {len(vec)}, expected {EMBED_DIM} "
262
- f"(set LATTICEAI_EMBED_DIM to override)"
263
- )
264
- return struct.pack(f"<{EMBED_DIM}f", *vec)
265
-
266
-
267
- def decode_embedding(blob: Optional[bytes]) -> Optional[List[float]]:
268
- if not blob:
269
- return None
270
- return list(struct.unpack(f"<{EMBED_DIM}f", blob))
271
-
272
-
273
- def cosine(a: Sequence[float], b: Sequence[float]) -> float:
274
- """단순 코사인 유사도. numpy 없이."""
275
- if not a or not b:
276
- return 0.0
277
- s = sum(x * y for x, y in zip(a, b))
278
- na = sum(x * x for x in a) ** 0.5
279
- nb = sum(y * y for y in b) ** 0.5
280
- return s / (na * nb) if na and nb else 0.0
281
-
282
-
283
- @dataclass
284
- class Node:
285
- """PPT 슬라이드 20 의 노드 정의."""
286
- type: NodeType
287
- label: str
288
- id: str = field(default_factory=lambda: f"node:{_ulid()}")
289
- attrs: Dict[str, Any] = field(default_factory=dict)
290
- embedding: Optional[List[float]] = None
291
- owner_id: Optional[str] = None
292
- visibility: Visibility = Visibility.PRIVATE
293
- created_at: str = field(default_factory=_now_iso)
294
- updated_at: str = field(default_factory=_now_iso)
295
- style: Optional[str] = None
296
- tone: Optional[str] = None
297
- importance_score: float = 0.0
298
- last_used: Optional[str] = None
299
-
300
- def validate(self) -> None:
301
- if not isinstance(self.type, NodeType):
302
- raise TypeError(f"Node.type must be NodeType, got {type(self.type)!r}")
303
- if not self.label or not self.label.strip():
304
- raise ValueError("Node.label is required and non-empty")
305
- if len(self.label) > 240:
306
- raise ValueError("Node.label max length is 240 chars")
307
- if not isinstance(self.attrs, dict):
308
- raise TypeError("Node.attrs must be a dict")
309
- if not isinstance(self.visibility, Visibility):
310
- raise TypeError("Node.visibility must be Visibility enum")
311
- if self.embedding is not None and len(self.embedding) != EMBED_DIM:
312
- raise ValueError(f"Node.embedding dim must be {EMBED_DIM}")
313
-
314
- def to_json(self) -> Dict[str, Any]:
315
- d = asdict(self)
316
- d["type"] = self.type.value
317
- d["visibility"] = self.visibility.value
318
- # embedding 은 JSON 직렬화시 length 만 노출 (가독성)
319
- if self.embedding is not None:
320
- d["embedding"] = f"[…{len(self.embedding)} dims]"
321
- return d
322
-
323
-
324
- @dataclass
325
- class Edge:
326
- """PPT 슬라이드 21 의 엣지 정의."""
327
- source: str
328
- target: str
329
- type: EdgeType
330
- id: str = field(default_factory=lambda: f"edge:{_ulid()}")
331
- weight: float = 1.0 # 강도 0..1
332
- confidence: float = 1.0 # 추출 신뢰도 0..1
333
- evidence: List[str] = field(default_factory=list) # 근거(노드/청크 ID)
334
- created_by: str = "user" # extractor 식별자
335
- created_at: str = field(default_factory=_now_iso)
336
-
337
- def validate(self) -> None:
338
- if not isinstance(self.type, EdgeType):
339
- raise TypeError("Edge.type must be EdgeType")
340
- if not self.source or not self.target:
341
- raise ValueError("Edge.source and Edge.target are required")
342
- if self.source == self.target and self.type is not EdgeType.SIMILAR_TO:
343
- # SIMILAR_TO 외에는 자기참조 금지
344
- raise ValueError(f"self-loop not allowed for {self.type.value}")
345
- if not (0.0 <= self.weight <= 1.0):
346
- raise ValueError("Edge.weight must be in [0, 1]")
347
- if not (0.0 <= self.confidence <= 1.0):
348
- raise ValueError("Edge.confidence must be in [0, 1]")
349
-
350
- def to_json(self) -> Dict[str, Any]:
351
- d = asdict(self)
352
- d["type"] = self.type.value
353
- return d
354
-
355
-
356
- def validate_endpoints(edge_type: EdgeType, src_type: NodeType, tgt_type: NodeType) -> None:
357
- """엣지가 허용된 source/target 타입을 잇고 있는지 검증."""
358
- rule = EDGE_ENDPOINT_RULES.get(edge_type)
359
- if rule is None:
360
- return
361
- src_allowed, tgt_allowed = rule
362
- if src_allowed is not None and src_type not in src_allowed:
363
- raise ValueError(
364
- f"{edge_type.value}: source must be one of "
365
- f"{[t.value for t in src_allowed]}, got {src_type.value}"
366
- )
367
- if tgt_allowed is not None and tgt_type not in tgt_allowed:
368
- raise ValueError(
369
- f"{edge_type.value}: target must be one of "
370
- f"{[t.value for t in tgt_allowed]}, got {tgt_type.value}"
371
- )
372
-
373
-
374
259
  # ── SQLite v2 store ─────────────────────────────────────────────────────────
375
260
  SCHEMA_SQL = """
376
261
  CREATE TABLE IF NOT EXISTS kg_meta (
@@ -381,7 +266,9 @@ CREATE TABLE IF NOT EXISTS kg_meta (
381
266
  CREATE TABLE IF NOT EXISTS nodes_v2 (
382
267
  id TEXT PRIMARY KEY,
383
268
  type TEXT NOT NULL,
269
+ legacy_type TEXT,
384
270
  label TEXT NOT NULL,
271
+ summary TEXT,
385
272
  attrs TEXT NOT NULL DEFAULT '{}',
386
273
  embedding BLOB,
387
274
  owner_id TEXT,
@@ -399,31 +286,54 @@ CREATE TABLE IF NOT EXISTS edges_v2 (
399
286
  source TEXT NOT NULL,
400
287
  target TEXT NOT NULL,
401
288
  type TEXT NOT NULL,
289
+ legacy_type TEXT NOT NULL DEFAULT '',
402
290
  weight REAL NOT NULL DEFAULT 1.0,
403
291
  confidence REAL NOT NULL DEFAULT 1.0,
404
292
  evidence TEXT NOT NULL DEFAULT '[]',
293
+ metadata TEXT NOT NULL DEFAULT '{}',
405
294
  created_by TEXT NOT NULL DEFAULT 'user',
406
295
  created_at TEXT NOT NULL,
407
- UNIQUE(source, target, type),
296
+ -- Edge identity follows the *raw* legacy type, not the normalized type:
297
+ -- two distinct legacy types between the same pair (e.g. "mentions" and
298
+ -- "관련됨") must stay distinct edges even though both normalize to MENTIONS.
299
+ UNIQUE(source, target, legacy_type),
408
300
  FOREIGN KEY(source) REFERENCES nodes_v2(id) ON DELETE CASCADE,
409
301
  FOREIGN KEY(target) REFERENCES nodes_v2(id) ON DELETE CASCADE
410
302
  );
411
303
 
412
304
  CREATE INDEX IF NOT EXISTS idx_nodes_v2_type ON nodes_v2(type);
305
+ CREATE INDEX IF NOT EXISTS idx_nodes_v2_legacy ON nodes_v2(legacy_type);
413
306
  CREATE INDEX IF NOT EXISTS idx_nodes_v2_owner ON nodes_v2(owner_id);
414
307
  CREATE INDEX IF NOT EXISTS idx_edges_v2_source ON edges_v2(source);
415
308
  CREATE INDEX IF NOT EXISTS idx_edges_v2_target ON edges_v2(target);
416
309
  CREATE INDEX IF NOT EXISTS idx_edges_v2_type ON edges_v2(type);
310
+ CREATE INDEX IF NOT EXISTS idx_edges_v2_legacy ON edges_v2(legacy_type);
417
311
  """
418
312
 
419
313
 
314
+ def _exec_script(conn: sqlite3.Connection, script: str) -> None:
315
+ """Run a multi-statement SQL script on ``conn`` statement-by-statement.
316
+
317
+ Unlike ``sqlite3.Connection.executescript``, this does NOT issue an implicit
318
+ COMMIT before running, so the statements join the caller's open transaction.
319
+ Safe for our schema/view DDL (no ``;`` inside string literals).
320
+ """
321
+ for stmt in script.split(";"):
322
+ s = stmt.strip()
323
+ if s:
324
+ conn.execute(s)
325
+
326
+
420
327
  class KGStoreV2:
421
- """가벼운 SQLite 기반 v2 스토어. sqlite-vec 있으면 벡터 인덱스도 활용,
422
- 없으면 Python cosine 으로 폴백."""
328
+ """가벼운 SQLite 기반 v2 스토어 **스키마/초기화 지원 전용**.
329
+
330
+ ``init_schema`` 으로 ``nodes_v2``/``edges_v2`` 를 생성·heal 하고 ``stats`` 로
331
+ 집계를 노출한다. 데이터 read/write 는 ``knowledge_graph.KnowledgeGraphStore``
332
+ 프로젝션이 담당하므로 native upsert/get/search API 는 두지 않는다.
333
+ """
423
334
 
424
335
  def __init__(self, db_path: str):
425
336
  self.db_path = db_path
426
- self._has_vec: Optional[bool] = None
427
337
 
428
338
  @contextmanager
429
339
  def _conn(self):
@@ -439,11 +349,11 @@ class KGStoreV2:
439
349
  # Columns the current code writes; used to detect schema-evolution drift in
440
350
  # v2 tables that an older ``CREATE TABLE IF NOT EXISTS`` left behind.
441
351
  _V2_EXPECTED_COLUMNS = {
442
- "edges_v2": {"id", "source", "target", "type", "weight", "confidence",
443
- "evidence", "created_by", "created_at"},
444
- "nodes_v2": {"id", "type", "label", "attrs", "embedding", "owner_id",
445
- "visibility", "created_at", "updated_at", "style", "tone",
446
- "importance_score", "last_used"},
352
+ "edges_v2": {"id", "source", "target", "type", "legacy_type", "weight",
353
+ "confidence", "evidence", "metadata", "created_by", "created_at"},
354
+ "nodes_v2": {"id", "type", "legacy_type", "label", "summary", "attrs",
355
+ "embedding", "owner_id", "visibility", "created_at",
356
+ "updated_at", "style", "tone", "importance_score", "last_used"},
447
357
  }
448
358
 
449
359
  def _drop_stale_empty_v2_tables(self, conn: sqlite3.Connection) -> None:
@@ -476,161 +386,31 @@ class KGStoreV2:
476
386
  table, sorted(missing), count,
477
387
  )
478
388
 
479
- def init_schema(self) -> None:
480
- with self._conn() as conn:
481
- self._drop_stale_empty_v2_tables(conn)
482
- conn.executescript(SCHEMA_SQL)
483
- conn.execute(
484
- "INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
485
- ("schema_version", str(KG_SCHEMA_V2_VERSION)),
486
- )
487
- conn.execute(
488
- "INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
489
- ("embed_dim", str(EMBED_DIM)),
490
- )
491
-
492
- # ── Upsert ───────────────────────────────────────────────
493
- def upsert_node(self, node: Node) -> str:
494
- node.validate()
495
- node.updated_at = _now_iso()
496
- with self._conn() as conn:
497
- conn.execute(
498
- """
499
- INSERT INTO nodes_v2(id, type, label, attrs, embedding,
500
- owner_id, visibility, created_at, updated_at,
501
- style, tone, importance_score, last_used)
502
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
503
- ON CONFLICT(id) DO UPDATE SET
504
- type=excluded.type,
505
- label=excluded.label,
506
- attrs=excluded.attrs,
507
- embedding=COALESCE(excluded.embedding, nodes_v2.embedding),
508
- owner_id=excluded.owner_id,
509
- visibility=excluded.visibility,
510
- updated_at=excluded.updated_at,
511
- style=COALESCE(excluded.style, nodes_v2.style),
512
- tone=COALESCE(excluded.tone, nodes_v2.tone),
513
- importance_score=MAX(excluded.importance_score, nodes_v2.importance_score),
514
- last_used=COALESCE(excluded.last_used, nodes_v2.last_used)
515
- """,
516
- (
517
- node.id, node.type.value, node.label,
518
- json.dumps(node.attrs, ensure_ascii=False),
519
- encode_embedding(node.embedding),
520
- node.owner_id, node.visibility.value,
521
- node.created_at, node.updated_at,
522
- node.style, node.tone,
523
- float(node.importance_score), node.last_used,
524
- ),
525
- )
526
- return node.id
527
-
528
- def upsert_edge(self, edge: Edge, *, check_endpoints: bool = True) -> str:
529
- edge.validate()
530
- if check_endpoints:
531
- src = self.get_node(edge.source)
532
- tgt = self.get_node(edge.target)
533
- if src is None or tgt is None:
534
- raise ValueError("Edge endpoints must exist as nodes")
535
- validate_endpoints(edge.type, src.type, tgt.type)
536
- with self._conn() as conn:
537
- conn.execute(
538
- """
539
- INSERT INTO edges_v2(id, source, target, type, weight,
540
- confidence, evidence, created_by, created_at)
541
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
542
- ON CONFLICT(source, target, type) DO UPDATE SET
543
- weight=excluded.weight,
544
- confidence=excluded.confidence,
545
- evidence=excluded.evidence,
546
- created_by=excluded.created_by
547
- """,
548
- (
549
- edge.id, edge.source, edge.target, edge.type.value,
550
- float(edge.weight), float(edge.confidence),
551
- json.dumps(edge.evidence, ensure_ascii=False),
552
- edge.created_by, edge.created_at,
553
- ),
554
- )
555
- return edge.id
556
-
557
- # ── Read ─────────────────────────────────────────────────
558
- def get_node(self, node_id: str) -> Optional[Node]:
559
- with self._conn() as conn:
560
- row = conn.execute(
561
- "SELECT * FROM nodes_v2 WHERE id = ?", (node_id,)
562
- ).fetchone()
563
- return _row_to_node(row) if row else None
564
-
565
- def list_nodes(self, *, type: Optional[NodeType] = None,
566
- owner_id: Optional[str] = None,
567
- limit: int = 100) -> List[Node]:
568
- sql = "SELECT * FROM nodes_v2 WHERE 1=1"
569
- args: List[Any] = []
570
- if type is not None:
571
- sql += " AND type = ?"
572
- args.append(type.value)
573
- if owner_id is not None:
574
- sql += " AND owner_id = ?"
575
- args.append(owner_id)
576
- sql += " ORDER BY updated_at DESC LIMIT ?"
577
- args.append(int(limit))
578
- with self._conn() as conn:
579
- rows = conn.execute(sql, args).fetchall()
580
- return [_row_to_node(r) for r in rows]
581
-
582
- def neighbors(self, node_id: str, *,
583
- edge_type: Optional[EdgeType] = None,
584
- direction: str = "both",
585
- limit: int = 50) -> List[Tuple[Edge, Node]]:
586
- """node_id 에 인접한 (edge, other_node) 페어를 반환."""
587
- if direction not in ("out", "in", "both"):
588
- raise ValueError("direction must be 'out' | 'in' | 'both'")
589
- clauses, args = [], []
590
- if direction in ("out", "both"):
591
- clauses.append("source = ?"); args.append(node_id)
592
- if direction in ("in", "both"):
593
- clauses.append("target = ?"); args.append(node_id)
594
- sql = f"SELECT * FROM edges_v2 WHERE ({' OR '.join(clauses)})"
595
- if edge_type:
596
- sql += " AND type = ?"; args.append(edge_type.value)
597
- sql += " ORDER BY weight DESC, confidence DESC LIMIT ?"
598
- args.append(int(limit))
599
- with self._conn() as conn:
600
- edges = [_row_to_edge(r) for r in conn.execute(sql, args).fetchall()]
601
- out: List[Tuple[Edge, Node]] = []
602
- for e in edges:
603
- other_id = e.target if e.source == node_id else e.source
604
- row = conn.execute(
605
- "SELECT * FROM nodes_v2 WHERE id = ?", (other_id,)
606
- ).fetchone()
607
- if row:
608
- out.append((e, _row_to_node(row)))
609
- return out
610
-
611
- def search_similar(self, vec: Sequence[float], *,
612
- top_k: int = 8,
613
- type: Optional[NodeType] = None,
614
- owner_id: Optional[str] = None) -> List[Tuple[Node, float]]:
615
- """코사인 기반 semantic search. sqlite-vec 가 없을 때의 폴백 구현."""
616
- if len(vec) != EMBED_DIM:
617
- raise ValueError(f"query embedding dim must be {EMBED_DIM}")
618
- sql = "SELECT * FROM nodes_v2 WHERE embedding IS NOT NULL"
619
- args: List[Any] = []
620
- if type is not None:
621
- sql += " AND type = ?"; args.append(type.value)
622
- if owner_id is not None:
623
- sql += " AND owner_id = ?"; args.append(owner_id)
624
- with self._conn() as conn:
625
- rows = conn.execute(sql, args).fetchall()
626
- scored = []
627
- for r in rows:
628
- emb = decode_embedding(r["embedding"])
629
- if emb is None:
630
- continue
631
- scored.append((_row_to_node(r), cosine(vec, emb)))
632
- scored.sort(key=lambda x: x[1], reverse=True)
633
- return scored[:top_k]
389
+ def init_schema(self, conn: Optional[sqlite3.Connection] = None) -> None:
390
+ """Create the v2 schema and record metadata.
391
+
392
+ Pass ``conn`` to run inside the caller's open transaction (used by the
393
+ atomic knowledge_graph migration); otherwise a private connection is
394
+ opened and committed. Uses ``_exec_script`` rather than
395
+ ``executescript`` so it never force-commits the caller's transaction.
396
+ """
397
+ if conn is not None:
398
+ self._init_schema_on(conn)
399
+ return
400
+ with self._conn() as own:
401
+ self._init_schema_on(own)
402
+
403
+ def _init_schema_on(self, conn: sqlite3.Connection) -> None:
404
+ self._drop_stale_empty_v2_tables(conn)
405
+ _exec_script(conn, SCHEMA_SQL)
406
+ conn.execute(
407
+ "INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
408
+ ("schema_version", str(KG_SCHEMA_V2_VERSION)),
409
+ )
410
+ conn.execute(
411
+ "INSERT OR REPLACE INTO kg_meta(key, value) VALUES (?, ?)",
412
+ ("embed_dim", str(EMBED_DIM)),
413
+ )
634
414
 
635
415
  # ── Maintenance ──────────────────────────────────────────
636
416
  def stats(self) -> Dict[str, Any]:
@@ -659,123 +439,12 @@ class KGStoreV2:
659
439
  }
660
440
 
661
441
 
662
- # ── Row → model helpers ────────────────────────────────────────────────────
663
- def _row_to_node(row: sqlite3.Row) -> Node:
664
- keys = row.keys() if hasattr(row, "keys") else []
665
- return Node(
666
- id=row["id"],
667
- type=NodeType(row["type"]),
668
- label=row["label"],
669
- attrs=json.loads(row["attrs"] or "{}"),
670
- embedding=decode_embedding(row["embedding"]),
671
- owner_id=row["owner_id"],
672
- visibility=Visibility(row["visibility"]),
673
- created_at=row["created_at"],
674
- updated_at=row["updated_at"],
675
- style=row["style"] if "style" in keys else None,
676
- tone=row["tone"] if "tone" in keys else None,
677
- importance_score=float(row["importance_score"]) if "importance_score" in keys else 0.0,
678
- last_used=row["last_used"] if "last_used" in keys else None,
679
- )
680
-
681
-
682
- def _row_to_edge(row: sqlite3.Row) -> Edge:
683
- return Edge(
684
- id=row["id"],
685
- source=row["source"],
686
- target=row["target"],
687
- type=EdgeType(row["type"]),
688
- weight=float(row["weight"]),
689
- confidence=float(row["confidence"]),
690
- evidence=json.loads(row["evidence"] or "[]"),
691
- created_by=row["created_by"],
692
- created_at=row["created_at"],
693
- )
694
-
695
-
696
- # ── Migration: legacy (nodes/edges) → v2 (nodes_v2/edges_v2) ───────────────
697
- def migrate_legacy_to_v2(db_path: str, *, dry_run: bool = False) -> Dict[str, int]:
698
- """기존 ``knowledge_graph.py`` 가 만든 ``nodes`` / ``edges`` 테이블을
699
- ``nodes_v2`` / ``edges_v2`` 로 복사. ``dry_run`` 이면 카운트만 반환.
700
- """
701
- counters = {"nodes_seen": 0, "nodes_migrated": 0,
702
- "edges_seen": 0, "edges_migrated": 0, "edges_skipped": 0}
703
- store = KGStoreV2(db_path)
704
- store.init_schema()
705
- with sqlite3.connect(db_path) as conn:
706
- conn.row_factory = sqlite3.Row
707
-
708
- # nodes
709
- try:
710
- old_nodes = conn.execute(
711
- "SELECT id, type, title, summary, metadata_json, created_at, updated_at FROM nodes"
712
- ).fetchall()
713
- except sqlite3.OperationalError:
714
- old_nodes = []
715
- for r in old_nodes:
716
- counters["nodes_seen"] += 1
717
- if dry_run:
718
- continue
719
- node = Node(
720
- id=r["id"],
721
- type=NodeType.from_legacy(r["type"]),
722
- label=r["title"] or "(untitled)",
723
- attrs=_safe_json(r["metadata_json"]),
724
- created_at=r["created_at"] or _now_iso(),
725
- updated_at=r["updated_at"] or _now_iso(),
726
- )
727
- try:
728
- store.upsert_node(node)
729
- counters["nodes_migrated"] += 1
730
- except Exception as exc:
731
- # 잘못된 legacy row 는 스킵
732
- _log_skip("node", r["id"], exc)
733
-
734
- # edges
735
- try:
736
- old_edges = conn.execute(
737
- "SELECT id, from_node, to_node, type, weight, metadata_json, created_at FROM edges"
738
- ).fetchall()
739
- except sqlite3.OperationalError:
740
- old_edges = []
741
- for r in old_edges:
742
- counters["edges_seen"] += 1
743
- if dry_run:
744
- continue
745
- meta = _safe_json(r["metadata_json"])
746
- edge = Edge(
747
- id=r["id"],
748
- source=r["from_node"],
749
- target=r["to_node"],
750
- type=EdgeType.from_legacy(r["type"]),
751
- weight=float(r["weight"] or 1.0),
752
- confidence=float(meta.get("confidence", 1.0)),
753
- evidence=list(meta.get("evidence", []) or []),
754
- created_by=str(meta.get("created_by", "legacy")),
755
- created_at=r["created_at"] or _now_iso(),
756
- )
757
- try:
758
- store.upsert_edge(edge, check_endpoints=False)
759
- counters["edges_migrated"] += 1
760
- except Exception as exc:
761
- counters["edges_skipped"] += 1
762
- _log_skip("edge", r["id"], exc)
763
- return counters
764
-
765
-
766
- def _safe_json(raw: Optional[str]) -> Dict[str, Any]:
767
- if not raw:
768
- return {}
769
- try:
770
- v = json.loads(raw)
771
- return v if isinstance(v, dict) else {"_raw": v}
772
- except (ValueError, TypeError):
773
- return {"_raw": raw}
774
-
775
-
776
- def _log_skip(kind: str, ident: str, exc: Exception) -> None:
777
- # 의도적으로 print: 마이그레이션은 일회성이라 로깅 인프라 의존 안 함
778
- print(f"[migrate] skip {kind} {ident}: {exc}")
442
+ # NOTE: legacy → v2 reprojection lives in ``knowledge_graph.py``
443
+ # (``KnowledgeGraphStore._backfill_v2_if_needed`` / ``_v2_project_node``/_edge),
444
+ # which is the single live, version-gated migration path. The old standalone
445
+ # ``migrate_legacy_to_v2()`` helper + CLI ``migrate`` subcommand were removed as
446
+ # dead code (no callers); the normalized projection now writes the first-class
447
+ # ``legacy_type``/``summary``/``metadata`` columns directly.
779
448
 
780
449
 
781
450
  # ── CLI ────────────────────────────────────────────────────────────────────
@@ -788,10 +457,6 @@ def _cli() -> int:
788
457
  sub_init = sub.add_parser("init", help="initialize v2 schema in a DB")
789
458
  sub_init.add_argument("db", help="path to sqlite db")
790
459
 
791
- sub_mig = sub.add_parser("migrate", help="migrate legacy nodes/edges → v2")
792
- sub_mig.add_argument("db", help="path to sqlite db")
793
- sub_mig.add_argument("--dry-run", action="store_true")
794
-
795
460
  sub_stats = sub.add_parser("stats", help="print store statistics")
796
461
  sub_stats.add_argument("db", help="path to sqlite db")
797
462
 
@@ -800,10 +465,6 @@ def _cli() -> int:
800
465
  KGStoreV2(args.db).init_schema()
801
466
  print(f"initialized v2 schema in {args.db}")
802
467
  return 0
803
- if args.cmd == "migrate":
804
- out = migrate_legacy_to_v2(args.db, dry_run=args.dry_run)
805
- print(json.dumps(out, indent=2, ensure_ascii=False))
806
- return 0
807
468
  if args.cmd == "stats":
808
469
  print(json.dumps(KGStoreV2(args.db).stats(), indent=2, ensure_ascii=False))
809
470
  return 0