ltcai 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,417 @@
1
+ """Lattice AI Auto Graph Curator.
2
+
3
+ 피드백 #4 (lattice_ai_auto_graph_direction.txt) 반영.
4
+
5
+ 핵심 방향:
6
+ - 사용자는 노드/엣지를 직접 만들지 않는다.
7
+ - 대화/파일/작업 로그 → topic candidate → cluster → promoted node
8
+ → derived thread edge → 자동 레이아웃.
9
+ - 너무 많은 노드를 만들지 않고, 알리아스를 자동 병합.
10
+ - secret/API key/private key 같은 원문은 그래프에 들어가면 안 된다.
11
+
12
+ 이 모듈은 텍스트 단위 토픽 후보 추출, 클러스터링/병합, 노드 승격 판정,
13
+ 파생 이야기 엣지 생성, 큐레이션(중요도 점수)을 담당하는 가벼운 헬퍼다.
14
+ 무거운 의존성 없이 동작하므로 기존 knowledge_graph.py 위에 얹어 쓸 수 있다.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import math
21
+ import re
22
+ import time
23
+ from dataclasses import dataclass, field, asdict
24
+ from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ── Secret / sensitive patterns to NEVER include in graph ─────────────────────
30
+
31
# Compiled patterns for credentials that must never reach the graph.
# Every pattern is searched/substituted independently, so adding a pattern can
# only widen what gets redacted.
SECRET_PATTERNS: List[re.Pattern] = [
    # "key = value" / "key: value" assignments (api_key=..., password: ...).
    re.compile(r"(?i)\b(?:api[_-]?key|secret|access[_-]?token|password|passwd|pwd|bearer)\s*[:=]\s*\S+"),
    # HTTP header form "Authorization: Bearer <token>": no ':' or '=' follows
    # "Bearer", so the assignment pattern above cannot catch it. The length
    # floor keeps prose such as "bearer token expired" untouched.
    re.compile(r"(?i)\bbearer\s+[A-Za-z0-9._~+/=-]{20,}"),
    # Classic "sk-" API keys made of one alphanumeric run.
    re.compile(r"sk-[A-Za-z0-9]{20,}"),
    # Segmented "sk-<tag>-<body>" keys whose body may contain '-' and '_'.
    # The leading \b avoids matching inside words such as "risk-...".
    re.compile(r"\bsk-[A-Za-z0-9]+-[A-Za-z0-9_-]{20,}"),
    # PEM private keys. "[A-Z ]*" (not "+") so the unlabelled PKCS#8 header
    # "-----BEGIN PRIVATE KEY-----" is matched as well as "RSA PRIVATE KEY" etc.
    re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]+?-----END [A-Z ]*PRIVATE KEY-----"),
    re.compile(r"AKIA[0-9A-Z]{16}"),  # AWS access key
    re.compile(r"gh[pousr]_[A-Za-z0-9]{30,}"),  # GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_)
    re.compile(r"xox[baprs]-[A-Za-z0-9-]{10,}"),  # Slack token
]
39
+
40
+
41
def contains_secret(text: str) -> bool:
    """Return True when *text* matches at least one entry of ``SECRET_PATTERNS``.

    Falsy input (empty string, ``None``) is reported as containing no secret.
    """
    return bool(text) and any(pattern.search(text) for pattern in SECRET_PATTERNS)
48
+
49
+
50
def mask_secrets(text: str) -> str:
    """Replace every secret found in *text* with the ``[REDACTED]`` placeholder.

    Meant to be applied once more right before anything is stored in the
    graph. Falsy input is returned unchanged.
    """
    if text:
        # Apply each pattern in turn to the progressively masked string.
        for pattern in SECRET_PATTERNS:
            text = pattern.sub("[REDACTED]", text)
    return text
58
+
59
+
60
+ # ── Stopwords (KO + EN) ───────────────────────────────────────────────────────
61
+
62
# Lower-cased tokens dropped by the tokenizer (Korean + English).
_STOPWORDS: Set[str] = {
    # Korean
    "그리고", "그러나", "또한", "하지만", "그런데", "그래서", "이것", "저것",
    "이번", "저번", "지금", "어제", "오늘", "내일", "에서", "에게", "에는",
    "되어", "있다", "없다", "있는", "없는", "같은", "처럼", "위해", "통해",
    "에서의", "에서는", "라고", "이라고", "이다", "이며", "이고", "되는",
    # English
    "the", "and", "for", "are", "but", "not", "you", "can", "with", "this",
    "that", "from", "into", "have", "has", "your", "any", "all", "one", "out",
    "use", "using", "used", "about", "via", "per", "let", "let's", "we'll",
    "i'll", "as", "be", "is", "it", "an", "or", "to", "of", "in", "on",
    # The tokenizer turns apostrophes into spaces before the stopword lookup,
    # so the contractions above never reach it intact. These are the two-letter
    # fragments "we'll" / "i'll" actually leave behind ("let" is listed above;
    # one-letter fragments are dropped by the length filter).
    "we", "ll",
}
74
+
75
+
76
def _tokenize(text: str) -> List[str]:
    """Split *text* into lower-cased tokens suitable for topic counting.

    Every character other than digits, ASCII letters, Hangul syllables and
    whitespace is replaced by a space. Tokens shorter than two characters and
    tokens listed in ``_STOPWORDS`` are discarded. Falsy input yields ``[]``.
    """
    if not text:
        return []
    stripped = re.sub(r"[^0-9A-Za-z가-힣\s]", " ", text)
    lowered = (word.lower() for word in stripped.split())
    return [word for word in lowered if len(word) >= 2 and word not in _STOPWORDS]
91
+
92
+
93
def _ngrams(tokens: Sequence[str], n: int = 2) -> List[str]:
    """Return the space-joined contiguous *n*-grams of *tokens*, in order.

    Returns an empty list when *tokens* holds fewer than *n* items or when
    *n* is not positive (a non-positive window would otherwise produce
    empty-string or overlapping garbage "n-grams").
    """
    if n < 1 or len(tokens) < n:
        return []
    return [" ".join(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
97
+
98
+
99
+ # ── Topic candidates ──────────────────────────────────────────────────────────
100
+
101
+
102
@dataclass
class TopicCandidate:
    """A topic label together with its score, source ids and alias labels."""

    label: str
    score: float
    # Ids of the documents the label was seen in.
    sources: List[str] = field(default_factory=list)
    # Alternative spellings folded into this candidate.
    aliases: Set[str] = field(default_factory=set)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view; sources are copied and aliases sorted."""
        payload: Dict[str, Any] = {"label": self.label, "score": self.score}
        payload["sources"] = [*self.sources]
        payload["aliases"] = sorted(self.aliases)
        return payload
116
+
117
+
118
def extract_topic_candidates(
    documents: Iterable[Dict[str, Any]],
    *,
    min_score: float = 1.5,
    top_k: int = 50,
) -> List[TopicCandidate]:
    """Extract topic candidates from chat / file / task-log documents.

    Each document is a dict shaped like
    ``{"id": str, "text": str, "kind": "chat|file|task", "weight": float}``.
    A missing or falsy ``weight`` counts as 1.0 and a missing ``kind`` as
    ``"chat"``. ``"file"`` documents are weighted x1.5 and ``"task"`` x1.2.

    Every distinct unigram and bigram contributes the document weight once
    per document. Terms whose accumulated weight is below *min_score* are
    dropped; the rest are scored as ``log(1 + weight)`` with a 5% bonus per
    word in the term. The *top_k* highest-scoring candidates are returned,
    each carrying at most 20 source ids.

    Terms are collected in first-occurrence order, so ties in score (and hence
    the *top_k* cut) are reproducible across interpreter runs.
    """
    # Literal emitted by mask_secrets for every redacted span.
    redaction_mark = "[REDACTED]"

    counts: Dict[str, float] = {}
    sources: Dict[str, List[str]] = {}

    for doc in documents:
        # Strip secrets before anything derived from the text is kept.
        masked = mask_secrets(str(doc.get("text") or ""))
        weight = float(doc.get("weight") or 1.0)
        kind = str(doc.get("kind") or "chat")
        if kind == "file":
            weight *= 1.5  # files are a stronger signal
        elif kind == "task":
            weight *= 1.2

        # Tokenize around the redaction placeholders so the placeholder itself
        # never becomes a topic and no bigram bridges a removed secret.
        segments = [_tokenize(part) for part in masked.split(redaction_mark)]
        unigrams = [token for segment in segments for token in segment]
        if not unigrams:
            continue
        bigrams = [gram for segment in segments for gram in _ngrams(segment, 2)]

        doc_id = str(doc.get("id") or "")
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # unlike a set whose iteration order varies with hash randomisation.
        for term in dict.fromkeys(unigrams + bigrams):
            counts[term] = counts.get(term, 0.0) + weight
            sources.setdefault(term, []).append(doc_id)

    # Log-normalize and filter.
    candidates: List[TopicCandidate] = []
    for term, score in counts.items():
        if score < min_score:
            continue
        normalized = math.log(1.0 + score) * (1.0 + 0.05 * len(term.split()))
        candidates.append(
            TopicCandidate(
                label=term,
                score=round(normalized, 4),
                sources=sources.get(term, [])[:20],
            )
        )

    # list.sort is stable, so equal scores keep their first-seen order.
    candidates.sort(key=lambda c: c.score, reverse=True)
    return candidates[:top_k]
172
+
173
+
174
+ # ── Alias normalization / merging ─────────────────────────────────────────────
175
+
176
# Built-in alias groups; the first entry of each group is the canonical label.
DEFAULT_ALIAS_GROUPS: List[List[str]] = [
    [
        "lattice ai",
        "latticeai",
        "래티스 ai",
        "래티스ai",
        "내 앱",
        "내 ai",
    ],
    [
        "gpt-oss",
        "gpt oss",
        "openai gpt-oss",
    ],
    [
        "gemma 4",
        "gemma4",
        "google gemma 4",
    ],
    [
        "llama 3",
        "llama3",
        "meta llama 3",
    ],
]
182
+
183
+
184
def build_alias_index(groups: Optional[List[List[str]]] = None) -> Dict[str, str]:
    """Map every alias (lower-cased, stripped) to its group's canonical label.

    The canonical label is the first entry of each group, normalised the same
    way. Empty groups are skipped. When *groups* is ``None`` or empty,
    ``DEFAULT_ALIAS_GROUPS`` is used. If an alias appears in several groups,
    the last group wins.
    """
    index: Dict[str, str] = {}
    for group in (groups or DEFAULT_ALIAS_GROUPS):
        if not group:
            continue
        canonical = group[0].lower().strip()
        index.update({alias.lower().strip(): canonical for alias in group})
    return index
194
+
195
+
196
def cluster_candidates(
    candidates: List[TopicCandidate],
    alias_index: Optional[Dict[str, str]] = None,
) -> List[TopicCandidate]:
    """Merge candidates whose labels normalise to the same canonical key.

    A label is mapped through *alias_index* when present there; otherwise runs
    of ``-``/``_`` become single spaces and whitespace is collapsed. A falsy
    *alias_index* falls back to ``build_alias_index()``.

    The first candidate seen for a key seeds a new ``TopicCandidate`` (the
    inputs are never mutated). Each later duplicate adds 60% of its score,
    contributes its label as an alias, and has its sources merged in
    first-seen order, capped at 50. The result is sorted by score, descending.
    """
    alias_index = alias_index or build_alias_index()
    merged: Dict[str, TopicCandidate] = {}

    def canon_of(label: str) -> str:
        low = label.lower().strip()
        if low in alias_index:
            return alias_index[low]
        # Simple normalisation: unify hyphens/underscores and whitespace.
        norm = re.sub(r"[-_]+", " ", low)
        return re.sub(r"\s+", " ", norm).strip()

    for c in candidates:
        key = canon_of(c.label)
        existing = merged.get(key)
        if existing is None:
            existing = TopicCandidate(
                label=key,
                score=c.score,
                sources=list(c.sources),
                aliases=set(),
            )
            merged[key] = existing
        else:
            existing.score += c.score * 0.6  # duplicates add a partial bonus
            # Order-preserving de-duplication keeps the 50-item cap
            # deterministic; a set would truncate an arbitrary subset.
            existing.sources = list(dict.fromkeys([*existing.sources, *c.sources]))[:50]
        # The canonical label must never be listed as an alias of itself,
        # whichever branch the candidate arrived through.
        if c.label.lower() != key:
            existing.aliases.add(c.label)

    return sorted(merged.values(), key=lambda x: x.score, reverse=True)
230
+
231
+
232
+ # ── Promotion rules ───────────────────────────────────────────────────────────
233
+
234
+
235
@dataclass
class PromotionDecision:
    """Result of evaluating whether one candidate should be promoted to a node."""

    candidate: TopicCandidate  # the candidate that was evaluated
    promote: bool  # True when the candidate should be promoted
    reason: str  # short explanation of the verdict
    importance: float  # importance value attached to the verdict
241
+
242
+
243
def should_promote(
    candidate: TopicCandidate,
    *,
    existing_node_labels: Optional[Set[str]] = None,
    min_sources: int = 2,
    min_importance: float = 1.0,
) -> PromotionDecision:
    """Decide whether *candidate* should be promoted to a graph node.

    Checks run in this order and the first failure wins:

    1. the label contains a secret (importance is reported as 0.0);
    2. the label is already in *existing_node_labels*;
    3. fewer than *min_sources* distinct sources;
    4. the label is shorter than two characters;
    5. the score is below *min_importance*.

    Otherwise the candidate is promoted with its score as importance.
    """
    known_labels = existing_node_labels or set()
    label = candidate.label
    score = candidate.score

    def rejected(reason: str, importance: float) -> PromotionDecision:
        return PromotionDecision(candidate, False, reason, importance)

    # A label that looks like a secret is never promoted.
    if contains_secret(label):
        return rejected("contains secret", 0.0)
    # A node with this exact label already exists.
    if label in known_labels:
        return rejected("duplicate of existing node", score)
    # Too few distinct sources is treated as noise.
    if len(set(candidate.sources)) < min_sources:
        return rejected("too few sources", score)
    # Single-character labels are rejected.
    if len(label) < 2:
        return rejected("label too short", score)
    if score < min_importance:
        return rejected("importance below threshold", score)

    return PromotionDecision(candidate, True, "promoted", score)
269
+
270
+
271
+ # ── Thread edges (파생 이야기) ────────────────────────────────────────────────
272
+
273
+
274
@dataclass
class ThreadEdge:
    """A derived "story" edge between two node labels."""

    source: str
    target: str
    story: str  # short narrative attached to the edge
    evidence: List[str] = field(default_factory=list)
    created_at: float = field(default_factory=time.time)  # defaults to time.time() at construction

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view with a copied evidence list."""
        payload: Dict[str, Any] = {
            "source": self.source,
            "target": self.target,
            "story": self.story,
        }
        payload["evidence"] = [*self.evidence]
        payload["created_at"] = self.created_at
        return payload
290
+
291
+
292
def derive_thread_story(
    source_label: str,
    target_label: str,
    *,
    snippets: Iterable[str],
    max_len: int = 220,
) -> str:
    """Build a short (one or two sentence) story for an edge, deterministically.

    From each non-empty snippet, after secret masking, the first sentence
    whose stripped length lies in ``[8, max_len]`` is taken. Collection stops
    once two sentences are found. They are joined with ``". "`` and cut to
    *max_len* characters. When nothing usable is found, a fixed fallback
    sentence naming both labels is returned.
    """
    picked: List[str] = []
    for snippet in snippets:
        if not snippet:
            continue
        masked = mask_secrets(str(snippet))
        pieces = (piece.strip() for piece in re.split(r"[.!?\n]+", masked))
        # Only the first sentence of acceptable length is taken per snippet.
        sentence = next((p for p in pieces if 8 <= len(p) <= max_len), None)
        if sentence is not None:
            picked.append(sentence)
        if len(picked) >= 2:
            break

    if picked:
        return ". ".join(picked[:2])[:max_len]
    return f"{source_label}에서 {target_label}로 이어지는 흐름이 발견되었습니다."
318
+
319
+
320
+ # ── Curation (중요도 기반 hide/show) ──────────────────────────────────────────
321
+
322
+
323
def curate_nodes(
    nodes: List[Dict[str, Any]],
    *,
    max_visible: int = 20,
    behavior_signals: Optional[Dict[str, Dict[str, float]]] = None,
    decay_seconds: float = 60 * 60 * 24 * 14,  # two weeks
    now: Optional[float] = None,
) -> List[Dict[str, Any]]:
    """Attach ``curated_score`` and ``visible`` to copies of *nodes*.

    nodes: ``[{"id": str, "label": str, "importance": float, "updated_at": float}]``
    behavior_signals: ``{node_id: {"clicks": int, "searches": int}}``

    The score is ``importance * exp(-age / decay_seconds)`` plus a boost of
    ``0.4 * log(1 + clicks) + 0.6 * log(1 + searches)``, rounded to 4 places.
    Decay is disabled when *decay_seconds* is not positive. A missing or falsy
    ``updated_at`` is treated as *now*. The returned list holds new dicts
    sorted by score, descending; the first *max_visible* are ``visible``.

    *now* defaults to ``time.time()`` only when it is ``None``, so an explicit
    ``0.0`` is honoured. Negative or missing signal values count as zero
    rather than raising from ``math.log``.
    """
    if now is None:
        now = time.time()
    signals = behavior_signals or {}
    enriched: List[Dict[str, Any]] = []

    for node in nodes:
        importance = float(node.get("importance") or 0.0)
        updated_at = float(node.get("updated_at") or now)
        age = max(0.0, now - updated_at)  # timestamps in the future do not boost
        decay = math.exp(-age / decay_seconds) if decay_seconds > 0 else 1.0

        # Tolerate a None entry for a node id as well as a missing one.
        sig = signals.get(str(node.get("id") or "")) or {}
        # Clamp at zero: log(1 + x) is undefined for x <= -1.
        clicks = max(0.0, float(sig.get("clicks") or 0))
        searches = max(0.0, float(sig.get("searches") or 0))
        boost = 0.4 * math.log(1.0 + clicks) + 0.6 * math.log(1.0 + searches)

        enriched.append({**node, "curated_score": round(importance * decay + boost, 4)})

    enriched.sort(key=lambda item: item.get("curated_score", 0.0), reverse=True)
    for rank, item in enumerate(enriched):
        item["visible"] = rank < max_visible
    return enriched
357
+
358
+
359
+ # ── End-to-end helper ─────────────────────────────────────────────────────────
360
+
361
+
362
def auto_build_graph_overlay(
    documents: List[Dict[str, Any]],
    *,
    existing_node_labels: Optional[Set[str]] = None,
    alias_index: Optional[Dict[str, str]] = None,
    max_new_nodes: int = 8,
) -> Dict[str, Any]:
    """Run topic extraction, clustering and promotion decisions in one call.

    Nothing is written to a graph store here; persisting the result is left
    to the caller. The returned dict holds ``promotions`` (label, importance,
    sorted aliases, sources), ``skipped`` (label and reason),
    ``candidates_total`` and ``clustered_total``. Once *max_new_nodes*
    promotions exist, every remaining candidate is skipped without being
    evaluated.
    """
    candidates = extract_topic_candidates(documents)
    clustered = cluster_candidates(candidates, alias_index=alias_index)

    promotions: List[Dict[str, Any]] = []
    skipped: List[Dict[str, Any]] = []

    for cand in clustered:
        # The number of promotions so far doubles as the promoted counter.
        if len(promotions) >= max_new_nodes:
            skipped.append({"label": cand.label, "reason": "max_new_nodes reached"})
            continue

        decision = should_promote(cand, existing_node_labels=existing_node_labels)
        if not decision.promote:
            skipped.append({"label": cand.label, "reason": decision.reason})
            continue

        promotions.append({
            "label": cand.label,
            "importance": decision.importance,
            "aliases": sorted(cand.aliases),
            "sources": cand.sources,
        })

    return {
        "promotions": promotions,
        "skipped": skipped,
        "candidates_total": len(candidates),
        "clustered_total": len(clustered),
    }
401
+
402
+
403
# Public API of this module.
__all__ = [
    # data classes
    "TopicCandidate", "PromotionDecision", "ThreadEdge",
    # secret handling
    "contains_secret", "mask_secrets",
    # pipeline steps
    "extract_topic_candidates", "cluster_candidates", "should_promote",
    "derive_thread_story", "curate_nodes", "auto_build_graph_overlay",
    # alias configuration
    "build_alias_index", "DEFAULT_ALIAS_GROUPS",
]