knowledge-graph-kit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ """knowledge-graph-kit: 知识图谱构建管线。
2
+
3
+ 提供从文本分块、LLM 抽取、实体解析到 Neo4j 写入的完整管线。
4
+ """
5
+
6
+ __version__ = "0.1.0"
7
+
8
+ from .chunker import Chunk, SectionChunker, chunk_file
9
+ from .entity_resolver import (
10
+ clean_properties,
11
+ dedup_exact,
12
+ fuzzy_align,
13
+ resolve,
14
+ )
15
+ from .extractor import (
16
+ Entity,
17
+ ExtractionResult,
18
+ Relation,
19
+ build_system_prompt,
20
+ extract_from_chunk,
21
+ merge_results,
22
+ parse_schema,
23
+ )
24
+ from .neo4j_writer import Neo4jWriter
25
+
26
+ __all__ = [
27
+ "Chunk",
28
+ "SectionChunker",
29
+ "chunk_file",
30
+ "Entity",
31
+ "Relation",
32
+ "ExtractionResult",
33
+ "parse_schema",
34
+ "build_system_prompt",
35
+ "extract_from_chunk",
36
+ "merge_results",
37
+ "dedup_exact",
38
+ "clean_properties",
39
+ "fuzzy_align",
40
+ "resolve",
41
+ "Neo4jWriter",
42
+ ]
@@ -0,0 +1,252 @@
1
+ """
2
+ chunker.py — Section-Aware Chunking
3
+
4
+ 从教材 txt 文件读取内容,按多级标题(2.1 / 2.1.1 / ...)拆分为语义块,
5
+ 每个 Chunk 保留层级元数据,供后续 LLM 抽取实体和关系。
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import re
12
+ import sys
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
# Windows terminal UTF-8 support: on win32, try to switch stdout to UTF-8.
if sys.platform == "win32":
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except Exception:
        # Best effort: any failure to reconfigure stdout is deliberately ignored.
        pass
23
+
24
+
25
# ── Regular expressions ─────────────────────────────────────
# Matches "2.1 基本语法" and "2.1.1标识符" (whitespace before the title is optional).
# Groups: (major, minor, optional sub, title).
SECTION_RE = re.compile(r"^(\d+)\.(\d+)(?:\.(\d+))?\s*(.+)$")
# Matches "第2章 ES基础". Groups: (chapter number, title).
CHAPTER_RE = re.compile(r"^第(\d+)章\s+(.+)$")
# Matches "【示例2-1】" or "【示例2-10】". Groups: (chapter number, example number).
EXAMPLE_RE = re.compile(r"【示例(\d+)-(\d+)】")
# Matches "表2-1" or "表2-10". Groups: (chapter number, table number).
TABLE_RE = re.compile(r"表(\d+)-(\d+)")
34
+
35
+
36
+ # ── 数据结构 ────────────────────────────────────────────────
37
+
38
@dataclass
class Chunk:
    """One semantic block cut out of the source text.

    Fields (in positional order):
        chunk_id:  section identifier such as "2.1" or "2.1.1".
        title:     heading text such as "基本语法" or "标识符".
        level:     0 = chapter, 1 = section, 2 = subsection.
        parent_id: identifier of the enclosing chunk, or None.
        text:      chunk body, excluding the heading line itself.
        lines:     (start_line, end_line), 1-indexed.
        examples:  example references found in the body; intended form "2-1".
        tables:    table references found in the body; intended form "2-1".
        has_code:  whether the body contains a code sample.
    """

    chunk_id: str
    title: str
    level: int
    parent_id: Optional[str]
    text: str
    lines: tuple[int, int]
    # Mutable defaults go through default_factory so instances never share a list.
    examples: list[str] = field(default_factory=list)
    tables: list[str] = field(default_factory=list)
    has_code: bool = field(default=False)
50
+
51
+
52
+ # ── 分块器 ──────────────────────────────────────────────────
53
+
54
class SectionChunker:
    """Split textbook text into semantic chunks along its heading hierarchy.

    A line is a header when it matches CHAPTER_RE or SECTION_RE. Each chunk
    runs from its header line up to (not including) the next header, or to
    the end of the text. Lines before the first header belong to no chunk.
    """

    def __init__(self, text: str):
        self.lines = text.splitlines()
        self._chunks: list[Chunk] = []

    def chunk(self) -> list[Chunk]:
        """Run the chunking pass and return the resulting list of Chunk objects."""
        self._chunks = []
        boundaries = self._find_section_boundaries()

        for start, end in boundaries:
            chunk = self._build_chunk(self.lines[start], start, end, boundaries)

            # Chapter chunks use a dedicated "ch<N>" identifier.
            if chunk.level == 0:
                chunk.chunk_id = f"ch{chunk.chunk_id}"

            # Collect example/table references from the body. findall() yields
            # (chapter, number) tuples; join them into the "2-1" string form
            # that the Chunk field annotations (list[str]) describe.
            body = chunk.text
            chunk.examples = [f"{ch}-{no}" for ch, no in EXAMPLE_RE.findall(body)]
            chunk.tables = [f"{ch}-{no}" for ch, no in TABLE_RE.findall(body)]
            chunk.has_code = "```" in body or "<script>" in body

            self._chunks.append(chunk)

        return self._chunks

    # ── Internal helpers ────────────────────────────────────

    def _find_section_boundaries(self) -> list[tuple[int, int]]:
        """Return one (start, end) pair of 0-based line indices per header.

        ``start`` is the header line; ``end`` is the index of the next header
        (exclusive), or ``len(self.lines)`` for the last header.
        """
        header_rows = [
            idx
            for idx, line in enumerate(self.lines)
            if CHAPTER_RE.match(line) or SECTION_RE.match(line)
        ]
        ends = header_rows[1:] + [len(self.lines)]
        return list(zip(header_rows, ends))

    def _build_chunk(
        self,
        header_line: str,
        start: int,
        end: int,
        boundaries: list[tuple[int, int]],
    ) -> Chunk:
        """Build the Chunk for the header at ``start`` covering lines up to ``end``.

        ``boundaries`` is accepted for interface compatibility and is not used.
        """
        text = "\n".join(self.lines[start + 1 : end])
        # 1-indexed span: start + 1 is the header line, end is the last body line.
        span = (start + 1, end)

        chapter = CHAPTER_RE.match(header_line)
        if chapter:
            # A chapter is the root of its hierarchy. The previous lookup could
            # only ever assign the chapter's own number as its parent, which
            # matches no chunk id (chapter ids become "ch<N>").
            return Chunk(
                chunk_id=chapter.group(1),
                title=chapter.group(2),
                level=0,
                parent_id=None,
                text=text,
                lines=span,
            )

        section = SECTION_RE.match(header_line)
        if section:
            major, minor, sub, raw_title = section.groups()
            if sub is None:
                # Level 1: "2.1 基本语法" -> child of chapter "ch2"
                chunk_id = f"{major}.{minor}"
                parent_id = f"ch{major}"
                level = 1
            else:
                # Level 2: "2.1.1 标识符" -> child of section "2.1"
                chunk_id = f"{major}.{minor}.{sub}"
                parent_id = f"{major}.{minor}"
                level = 2
            return Chunk(
                chunk_id=chunk_id,
                title=raw_title.strip(),
                level=level,
                parent_id=parent_id,
                text=text,
                lines=span,
            )

        # Fallback; not reached for headers produced by _find_section_boundaries.
        return Chunk(
            chunk_id="unknown",
            title=header_line.strip(),
            level=-1,
            parent_id=None,
            text=text,
            lines=span,
        )
181
+
182
+
183
+ # ── 入口 ────────────────────────────────────────────────────
184
+
185
def chunk_file(txt_path: str | Path) -> list[Chunk]:
    """Read a UTF-8 text file and split it into chunks.

    Raises:
        FileNotFoundError: if ``txt_path`` does not exist.
    """
    source = Path(txt_path)
    if source.exists():
        return SectionChunker(source.read_text(encoding="utf-8")).chunk()
    raise FileNotFoundError(f"文件未找到: {source}")
194
+
195
+
196
def safe_print(text: str = "", **kwargs):
    """Print ``text``, degrading gracefully when the stream cannot encode it.

    ``kwargs`` are forwarded to ``print``. If the first attempt raises
    UnicodeEncodeError (e.g. a GBK console on Windows), the text is
    round-tripped through the target stream's own encoding with
    ``errors="replace"`` so unencodable characters become "?" and the retry
    cannot fail for the same reason.
    """
    try:
        print(text, **kwargs)
    except UnicodeEncodeError:
        # A UTF-8 encode/decode round trip would return the text unchanged and
        # fail again; the stream's own encoding is the one that matters.
        stream = kwargs.get("file") or sys.stdout
        encoding = getattr(stream, "encoding", None) or "ascii"
        printable = text.encode(encoding, errors="replace").decode(encoding, errors="replace")
        print(printable, **kwargs)
202
+
203
+
204
def print_summary(chunks: list[Chunk]) -> None:
    """Print an overview of ``chunks`` (used for manual verification).

    For each chunk this prints its id, level, line span, parent (if any),
    title, and a preview of the first 60 characters of its text followed by
    tags for examples, tables and code.
    """
    rule = "=" * 60
    safe_print(f"\n{rule}")
    safe_print(f"共 {len(chunks)} 个 Chunk")
    safe_print(f"{rule}\n")

    for item in chunks:
        span = f"L{item.lines[0]}-{item.lines[1]}"
        origin = f" ← {item.parent_id}" if item.parent_id else ""

        tags = []
        if item.examples:
            tags += [f"示例:{len(item.examples)}"]
        if item.tables:
            tags += [f"表格:{len(item.tables)}"]
        if item.has_code:
            tags += ["含代码"]
        suffix = f" [{', '.join(tags)}]" if tags else ""

        # Newlines are flattened so the preview stays on one line.
        preview = item.text[:60].replace("\n", " ")

        safe_print(f" [{item.chunk_id}] (level={item.level}) {span}{origin}")
        safe_print(f" 标题: {item.title}")
        safe_print(f" 预览: {preview}...{suffix}")
        safe_print()
225
+
226
+
227
+ # ── CLI ─────────────────────────────────────────────────────
228
+
229
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: chunk a text file and print the summary.

    Usage: kg-chunker <txt_path>
           kg-chunker            (reads the KG_CHUNKER_INPUT environment variable)

    Exits with status 1 when neither an argument nor the variable is given.
    """
    args = sys.argv[1:] if argv is None else argv

    if args:
        txt_path = args[0]
    else:
        txt_path = os.environ.get("KG_CHUNKER_INPUT")
        if not txt_path:
            print("用法: kg-chunker <txt_path>", file=sys.stderr)
            print(" 或设置 KG_CHUNKER_INPUT 环境变量", file=sys.stderr)
            sys.exit(1)

    print_summary(chunk_file(txt_path))


if __name__ == "__main__":
    main()
@@ -0,0 +1,393 @@
1
+ """
2
+ entity_resolver.py — 实体对齐去重
3
+
4
+ 策略:
5
+ 1. 精确去重:合并 (name, type) 完全相同的实体
6
+ 2. 模糊对齐:用 LLM 判断名称近似但字面不同的实体是否同一概念
7
+ 3. 关系更新:去重后更新关系中引用的实体名称
8
+ 4. 干净输出:清理 null 属性值
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import os
15
+ import re
16
+ import sys
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from openai import OpenAI
21
+
22
# On Windows, try to switch stdout to UTF-8 so the messages printed by this
# module can be written to the console.
if sys.platform == "win32":
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except Exception:
        # Best effort: any failure to reconfigure stdout is deliberately ignored.
        pass
27
+
28
# ── Lazily created client state ─────────────────────────────
# Populated by configure() or by the first _get_client() call.
# NOTE(review): `httpx` is only imported inside those functions; the annotation
# on _http_client_instance is not evaluated at runtime because this module uses
# `from __future__ import annotations`.
_client_instance: OpenAI | None = None
_http_client_instance: httpx.Client | None = None
_MODEL: str = "gpt-4o-mini"
32
+
33
+
34
def configure(api_key: str, base_url: str | None = None, model: str = "gpt-4o-mini") -> None:
    """Explicitly set up the module-level OpenAI client (for programmatic use).

    Replaces the module's HTTP client, OpenAI client and model name with
    instances built from the given arguments.
    """
    global _client_instance, _http_client_instance, _MODEL
    import httpx

    transport = httpx.Client(verify=True, follow_redirects=True)
    _http_client_instance = transport
    _client_instance = OpenAI(
        api_key=api_key,
        base_url=base_url,
        http_client=transport,
        timeout=30,
    )
    _MODEL = model
41
+
42
+
43
def _get_client() -> OpenAI:
    """Return the shared OpenAI client, creating it from the environment on first use.

    Reads OPENAI_API_KEY (required), OPENAI_BASE_URL and LLM_MODEL_NAME
    (default "gpt-4o-mini") when no client has been created yet.

    Raises:
        RuntimeError: if a client must be created and OPENAI_API_KEY is empty or unset.
    """
    global _client_instance, _http_client_instance, _MODEL

    if _client_instance is not None:
        return _client_instance

    api_key = os.getenv("OPENAI_API_KEY", "")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY 未设置。请在 .env 中配置或调用 configure()。"
        )
    base_url = os.getenv("OPENAI_BASE_URL")
    model = os.getenv("LLM_MODEL_NAME", "gpt-4o-mini")

    import httpx

    transport = httpx.Client(verify=True, follow_redirects=True)
    _http_client_instance = transport
    _client_instance = OpenAI(
        api_key=api_key,
        base_url=base_url,
        http_client=transport,
        timeout=30,
    )
    _MODEL = model
    return _client_instance
59
+
60
+
61
+ # ═══════════════════════════════════════════════════════════
62
+ # 1. 精确去重
63
+ # ═══════════════════════════════════════════════════════════
64
+
65
def dedup_exact(entities: list[dict]) -> list[dict]:
    """Merge entities whose (name, type) are identical after stripping whitespace.

    The first entity seen for a key is kept (as a copy); for every later
    duplicate, each non-null property value overwrites the kept entity's value
    for that key. Null values never overwrite anything.

    The input entities and their ``properties`` dicts are not modified. A
    missing or null ``properties`` entry is treated as empty.

    Returns:
        One entity per distinct key, in order of first appearance.
    """
    merged: dict[tuple[str, str], dict] = {}
    for entity in entities:
        key = (entity["name"].strip(), entity["type"].strip())
        kept = merged.get(key)

        if kept is None:
            record = dict(entity)
            # Copy the nested dict too, otherwise later merges would write
            # into the caller's entity.
            if isinstance(record.get("properties"), dict):
                record["properties"] = dict(record["properties"])
            merged[key] = record
            continue

        for prop, value in (entity.get("properties") or {}).items():
            if value is None:
                continue
            if not isinstance(kept.get("properties"), dict):
                kept["properties"] = {}
            kept["properties"][prop] = value

    return list(merged.values())
84
+
85
+
86
+ # ═══════════════════════════════════════════════════════════
87
+ # 2. 属性清理
88
+ # ═══════════════════════════════════════════════════════════
89
+
90
def clean_properties(entities: list[dict]) -> list[dict]:
    """Drop null-valued properties from every entity, in place.

    Entities without a ``properties`` entry, or with an empty one, are left
    untouched. Returns the same list object that was passed in.
    """
    for entity in entities:
        props = entity.get("properties")
        if not props:
            continue
        entity["properties"] = {
            key: value for key, value in props.items() if value is not None
        }
    return entities
96
+
97
+
98
+ # ═══════════════════════════════════════════════════════════
99
+ # 3. 模糊对齐(LLM 判断)
100
+ # ═══════════════════════════════════════════════════════════
101
+
102
def _find_fuzzy_candidates(entities: list[dict]) -> list[tuple[int, int, str]]:
    """Find index pairs of entities whose names look similar.

    A pair (a_idx, b_idx, label) with a_idx < b_idx is reported when
    ``_name_similar`` of the two names exceeds 0.5 and either

    * both entities have the same type (except "CodeExample", which is never
      compared) -- label is that type; or
    * the types differ but both belong to one cross-type group
      (Topic / Concept / Skill) -- label is "<type_a>/<type_b>".

    Same-type pairs come first, then cross-type pairs; both are ordered by
    entity index so the result does not depend on set iteration order.
    """
    by_type: dict[str, list[tuple[int, str]]] = {}
    for idx, entity in enumerate(entities):
        by_type.setdefault(entity["type"], []).append((idx, entity["name"]))

    # Types whose entities may be compared with each other
    # (Topic / Concept / Skill are considered interchangeable).
    cross_groups = [{"Topic", "Concept", "Skill"}]

    candidates: list[tuple[int, int, str]] = []

    # Same-type comparison. Each per-type list is already in index order, so
    # pairing every item with the ones after it yields a_idx < b_idx.
    for etype, items in by_type.items():
        if etype == "CodeExample":
            continue
        for pos, (a_idx, a_name) in enumerate(items):
            for b_idx, b_name in items[pos + 1:]:
                if _name_similar(a_name, b_name) > 0.5:
                    candidates.append((a_idx, b_idx, etype))

    # Cross-type comparison inside each group.
    for group in cross_groups:
        # Sort by entity index: iterating the set directly would make the
        # candidate order vary between interpreter runs.
        pooled = sorted(
            (item for etype in group for item in by_type.get(etype, [])),
            key=lambda item: item[0],
        )
        for pos, (a_idx, a_name) in enumerate(pooled):
            type_a = entities[a_idx]["type"]
            for b_idx, b_name in pooled[pos + 1:]:
                type_b = entities[b_idx]["type"]
                if type_a == type_b:
                    continue  # already covered by the same-type pass
                if _name_similar(a_name, b_name) > 0.5:
                    candidates.append((a_idx, b_idx, f"{type_a}/{type_b}"))

    return candidates
139
+
140
+
141
def _name_similar(a: str, b: str) -> float:
    """Return a simple similarity score based on the two names' character sets.

    The score is ``2 * |A ∩ B| / (|A| + |B|)`` where A and B are the sets of
    distinct characters in each name; it is 0.0 when either name is empty.
    """
    if not (a and b):
        return 0.0

    left, right = set(a), set(b)
    if not (left and right):
        return 0.0

    shared = len(left & right)
    return 2 * shared / (len(left) + len(right))
150
+
151
+
152
def fuzzy_align(entities: list[dict], client: OpenAI | None = None) -> list[dict]:
    """Merge entities that the LLM judges to be the same concept.

    Candidate pairs come from ``_find_fuzzy_candidates``; each is passed to
    ``_judge_same`` and pairs judged "same" are joined into groups. From each
    group one survivor is kept -- lowest type priority value first
    (Concept, Skill, Topic, CodeExample, TextSegment, Question, then anything
    else), shorter name as tie-breaker -- and it receives every non-null
    property of the other members that it lacks or holds as null.

    The input entities and their ``properties`` dicts are not modified.

    Returns:
        The original list when there are no candidates or nothing is merged;
        otherwise a new list with one entity per group, ordered by each
        group's first member.
    """
    candidates = _find_fuzzy_candidates(entities)
    if not candidates:
        print(" ℹ️ 无可模糊对齐的候选对")
        return entities

    print(f" 🔍 发现 {len(candidates)} 个模糊候选对,正在用 LLM 判断...")

    # Build the merge graph: union every pair the LLM considers synonymous.
    uf = _UnionFind(len(entities))
    for a_idx, b_idx, etype in candidates:
        if _judge_same(entities[a_idx]["name"], entities[b_idx]["name"], etype, client=client) == "same":
            uf.union(a_idx, b_idx)
            print(f" ✅ 同义: '{entities[a_idx]['name']}' ≈ '{entities[b_idx]['name']}'")

    # Collect members per group root.
    groups: dict[int, list[int]] = {}
    for i in range(len(entities)):
        groups.setdefault(uf.find(i), []).append(i)

    merged_count = sum(len(indices) - 1 for indices in groups.values())
    if merged_count == 0:
        print(" ℹ️ 无实际合并")
        return entities

    type_rank = {"Concept": 0, "Skill": 1, "Topic": 2, "CodeExample": 3, "TextSegment": 4, "Question": 5}

    def _rank(i: int) -> tuple[int, int]:
        return type_rank.get(entities[i]["type"], 9), len(entities[i]["name"])

    # Keep one entity per group and fold the others' properties into it.
    result = []
    for indices in groups.values():
        if len(indices) == 1:
            result.append(dict(entities[indices[0]]))
            continue

        best = min(indices, key=_rank)
        survivor = dict(entities[best])
        # Work on a copy of the properties so the caller's entity is untouched;
        # a missing or null "properties" counts as empty.
        merged_props = dict(survivor.get("properties") or {})

        for i in indices:
            if i == best:
                continue
            print(f" → 合并 '{entities[i]['name']}'({entities[i]['type']}) 到 '{entities[best]['name']}'")
            for k, v in (entities[i].get("properties") or {}).items():
                if v is not None and merged_props.get(k) is None:
                    merged_props[k] = v

        # Only (re)attach properties if there is something to attach or the
        # survivor already carried a properties dict.
        if merged_props or isinstance(survivor.get("properties"), dict):
            survivor["properties"] = merged_props
        result.append(survivor)

    print(f" 📊 合并完成: {len(entities)} → {len(result)}")
    return result
204
+
205
+
206
class _UnionFind:
    """Disjoint-set structure used to group synonymous entities by index."""

    def __init__(self, n: int):
        # Every element starts as the root of its own set.
        self.parent = list(range(n))

    def find(self, x: int) -> int:
        """Return the root of ``x``'s set, shortening the path while walking up."""
        links = self.parent
        node = x
        while links[node] != node:
            # Point the node at its grandparent, then continue from there.
            grandparent = links[links[node]]
            links[node] = grandparent
            node = grandparent
        return node

    def union(self, a: int, b: int) -> None:
        """Join the sets of ``a`` and ``b``; ``a``'s root is attached under ``b``'s root."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a == root_b:
            return
        self.parent[root_a] = root_b
219
+
220
+
221
def _judge_same(name_a: str, name_b: str, etype: str, client: OpenAI | None = None) -> str:
    """Ask the LLM whether two entity names refer to the same concept.

    Args:
        name_a, name_b: the two names to compare.
        etype: entity type label inserted into the prompt.
        client: client to use; falls back to ``_get_client()`` when omitted.

    Returns:
        "same" only when the reply, ignoring case, surrounding whitespace,
        quotes and trailing punctuation, is exactly "same"; otherwise
        "different". Any error during the request or while reading the
        response is reported on stderr and treated as "different".
    """
    llm_client = client or _get_client()
    prompt = f"""判断以下两个实体名称在教材知识图谱中是否指向**同一个概念**。

实体类型: {etype}
名称 A: {name_a}
名称 B: {name_b}

规则:
- 如果 A 和 B 是同一概念的不同表述(如 "箭头函数" 和 "arrow function"),回答 "same"
- 如果是不同的概念,回答 "different"
- 只回答 same 或 different,不要解释"""

    try:
        resp = llm_client.chat.completions.create(
            model=_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=10,
        )
        # content may be None (no text in the reply); treat that as empty.
        reply = resp.choices[0].message.content or ""
    except Exception as exc:
        # Best effort: a failed call must not abort alignment, but it should
        # not be invisible either (e.g. a bad API key fails every pair).
        print(f" ⚠️ LLM 判断失败,按 different 处理: {exc}", file=sys.stderr)
        return "different"

    # Models often answer `Same.` or `"same"`; strip such decoration before comparing.
    answer = reply.strip().strip("\"'`.。!! ").lower()
    return "same" if answer == "same" else "different"
246
+
247
+
248
def _pick_canonical(a_idx: int, b_idx: int, entities: list[dict]) -> tuple[int, int]:
    """Choose which of two entities should supply the canonical name.

    The entity with the lower type priority value wins
    (Concept < Skill < Topic < CodeExample < any other type); for equal
    priority the shorter name wins, and ``a_idx`` wins a complete tie.

    Returns:
        (winner_index, loser_index).
    """
    type_order = {"Concept": 0, "Skill": 1, "Topic": 2, "CodeExample": 3}

    first, second = entities[a_idx], entities[b_idx]
    # Compare (type priority, name length) lexicographically; "<=" keeps
    # ``a_idx`` as the winner when both parts are equal.
    first_key = (type_order.get(first["type"], 5), len(first["name"]))
    second_key = (type_order.get(second["type"], 5), len(second["name"]))

    if first_key <= second_key:
        return a_idx, b_idx
    return b_idx, a_idx
266
+
267
+
268
+ # ═══════════════════════════════════════════════════════════
269
+ # 4. 关系更新
270
+ # ═══════════════════════════════════════════════════════════
271
+
272
def update_relations(
    relations: list[dict],
    entities: list[dict],
    old_entities: list[dict],
) -> list[dict]:
    """Rewrite relation endpoints whose entity disappeared during deduplication.

    An old entity counts as merged away when its (name, type) is absent from
    ``entities``. Its replacement is the first entity in ``entities`` with the
    same type whose name contains, or is contained in, the old name; when no
    such entity exists the old name is left alone.

    Returns:
        ``relations`` itself when no name needs rewriting; otherwise a new
        list of shallow copies with ``source`` / ``target`` replaced.
    """
    surviving = {(e["name"], e["type"]) for e in entities}

    # old name -> new name, only for entities that no longer exist as-is
    name_map: dict[str, str] = {}
    for old in old_entities:
        if (old["name"], old["type"]) in surviving:
            continue
        replacement = next(
            (
                new["name"]
                for new in entities
                if new["type"] == old["type"]
                and (new["name"] in old["name"] or old["name"] in new["name"])
            ),
            None,
        )
        if replacement is not None:
            name_map[old["name"]] = replacement

    if not name_map:
        return relations

    updated = []
    for rel in relations:
        patched = dict(rel)
        patched["source"] = name_map.get(patched["source"], patched["source"])
        patched["target"] = name_map.get(patched["target"], patched["target"])
        updated.append(patched)
    return updated
307
+
308
+
309
+ # ═══════════════════════════════════════════════════════════
310
+ # 5. 完整去重管线
311
+ # ═══════════════════════════════════════════════════════════
312
+
313
def resolve(entities: list[dict], relations: list[dict], skip_fuzzy: bool = True) -> tuple[list[dict], list[dict]]:
    """Run the full entity resolution pipeline.

    Steps: exact deduplication, null-property cleanup, optional LLM-based
    fuzzy alignment (skipped unless ``skip_fuzzy`` is False), then rewriting
    of relation endpoints against the pre-deduplication entity list.

    Returns:
        (entities, relations) after resolution.
    """
    original = list(entities)

    # Step 1: exact deduplication
    deduped = dedup_exact(entities)
    print(f" 📊 精确去重: {len(original)} → {len(deduped)}")

    # Step 2: property cleanup
    resolved = clean_properties(deduped)
    print(" 🧹 属性清理完成")

    # Step 3: fuzzy alignment (skipped by default to avoid many LLM calls)
    if skip_fuzzy:
        print(" ⏭️ 跳过模糊对齐(设置 SKIP_FUZZY=false 可启用)")
    else:
        resolved = fuzzy_align(resolved)
        print(f" 🎯 模糊对齐后: {len(resolved)} 个实体")

    # Step 4: relation update
    remapped = update_relations(relations, resolved, original)
    print(f" 🔗 关系更新: {len(remapped)} 条")

    return resolved, remapped
337
+
338
+
339
+ # ═══════════════════════════════════════════════════════════
340
+ # 6. CLI
341
+ # ═══════════════════════════════════════════════════════════
342
+
343
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: entity alignment and deduplication.

    Usage: kg-resolver <input.json>
           kg-resolver           (reads KG_RESOLVER_INPUT / KG_RESOLVER_OUTPUT)

    Loads ``entities`` and ``relations`` from the input JSON, runs
    ``resolve`` (fuzzy alignment is skipped unless SKIP_FUZZY is set to
    something other than true/1/yes) and writes the result to
    KG_RESOLVER_OUTPUT, defaulting to "extraction_result_clean.json".
    Exits with status 1 when no input is given or the input file is missing.
    """
    args = sys.argv[1:] if argv is None else argv

    # Load .env before any environment variable is consulted.
    from dotenv import load_dotenv
    load_dotenv()

    if args:
        src_path = Path(args[0])
    else:
        env_input = os.environ.get("KG_RESOLVER_INPUT")
        if not env_input:
            print("用法: kg-resolver <input.json>", file=sys.stderr)
            print(" 或设置 KG_RESOLVER_INPUT 环境变量", file=sys.stderr)
            sys.exit(1)
        src_path = Path(env_input)

    if not src_path.exists():
        print(f"❌ 未找到提取结果: {src_path}")
        sys.exit(1)

    data = json.loads(src_path.read_text(encoding="utf-8"))
    entities: list[dict] = data["entities"]
    relations: list[dict] = data["relations"]

    print(f"📥 加载: {len(entities)} 实体, {len(relations)} 关系")

    skip_flag = os.environ.get("SKIP_FUZZY", "true").lower()
    entities, relations = resolve(
        entities,
        relations,
        skip_fuzzy=skip_flag in ("true", "1", "yes"),
    )

    # Save the result
    out_path = Path(os.environ.get("KG_RESOLVER_OUTPUT", "extraction_result_clean.json"))
    payload = json.dumps(
        {"entities": entities, "relations": relations},
        indent=2,
        ensure_ascii=False,
    )
    out_path.write_text(payload, encoding="utf-8")

    print(f"\n✅ 已保存: {out_path}")
    print(f" 实体: {len(entities)} 个, 关系: {len(relations)} 条")


if __name__ == "__main__":
    main()