knowledge-graph-kit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_graph_kit/__init__.py +42 -0
- knowledge_graph_kit/chunker.py +252 -0
- knowledge_graph_kit/entity_resolver.py +393 -0
- knowledge_graph_kit/extractor.py +419 -0
- knowledge_graph_kit/neo4j_writer.py +273 -0
- knowledge_graph_kit/schema.txt +38 -0
- knowledge_graph_kit-0.1.0.dist-info/METADATA +120 -0
- knowledge_graph_kit-0.1.0.dist-info/RECORD +12 -0
- knowledge_graph_kit-0.1.0.dist-info/WHEEL +5 -0
- knowledge_graph_kit-0.1.0.dist-info/entry_points.txt +5 -0
- knowledge_graph_kit-0.1.0.dist-info/licenses/LICENSE +21 -0
- knowledge_graph_kit-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""knowledge-graph-kit: 知识图谱构建管线。
|
|
2
|
+
|
|
3
|
+
提供从文本分块、LLM 抽取、实体解析到 Neo4j 写入的完整管线。
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# Version string of this package release.
__version__ = "0.1.0"

from .chunker import Chunk, SectionChunker, chunk_file
from .entity_resolver import (
    clean_properties,
    dedup_exact,
    fuzzy_align,
    resolve,
)
from .extractor import (
    Entity,
    ExtractionResult,
    Relation,
    build_system_prompt,
    extract_from_chunk,
    merge_results,
    parse_schema,
)
from .neo4j_writer import Neo4jWriter

# Names exported by ``from knowledge_graph_kit import *``.
__all__ = [
    "Chunk",
    "SectionChunker",
    "chunk_file",
    "Entity",
    "Relation",
    "ExtractionResult",
    "parse_schema",
    "build_system_prompt",
    "extract_from_chunk",
    "merge_results",
    "dedup_exact",
    "clean_properties",
    "fuzzy_align",
    "resolve",
    "Neo4jWriter",
]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
chunker.py — Section-Aware Chunking
|
|
3
|
+
|
|
4
|
+
从教材 txt 文件读取内容,按多级标题(2.1 / 2.1.1 / ...)拆分为语义块,
|
|
5
|
+
每个 Chunk 保留层级元数据,供后续 LLM 抽取实体和关系。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import sys
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
# Windows terminal UTF-8 support: switch stdout to UTF-8 at import time.
if sys.platform == "win32":
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except Exception:
        # Best effort: if stdout cannot be reconfigured, keep it as it is.
        pass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ── Regular expressions ─────────────────────────────────────
# Matches "2.1 基本语法" and "2.1.1标识符" (whitespace before the title is optional).
# Groups: major, minor, optional sub-number, title text.
SECTION_RE = re.compile(r"^(\d+)\.(\d+)(?:\.(\d+))?\s*(.+)$")
# Matches "第2章 ES基础". Groups: chapter number, title text.
CHAPTER_RE = re.compile(r"^第(\d+)章\s+(.+)$")
# Matches "【示例2-1】" or "【示例2-10】". Groups: the two numbers.
EXAMPLE_RE = re.compile(r"【示例(\d+)-(\d+)】")
# Matches "表2-1" or "表2-10". Groups: the two numbers.
TABLE_RE = re.compile(r"表(\d+)-(\d+)")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ── 数据结构 ────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
@dataclass
class Chunk:
    """One semantic chunk of the source text, with its place in the hierarchy.

    Attributes:
        chunk_id: Section identifier, e.g. "2.1" or "2.1.1".
        title: Heading text, e.g. "基本语法" or "标识符".
        level: 0 for a chapter, 1 for a section, 2 for a subsection.
        parent_id: Identifier of the enclosing chunk, e.g. "2" or "2.1".
        text: Chunk content, not including the heading line itself.
        lines: ``(start_line, end_line)``, 1-indexed.
        examples: Example references found in the chunk, e.g. ["2-1", "2-2"].
        tables: Table references found in the chunk, e.g. ["2-1", "2-2"].
        has_code: Whether the chunk contains a code sample.
    """

    chunk_id: str
    title: str
    level: int
    parent_id: Optional[str]
    text: str
    lines: tuple[int, int]
    examples: list[str] = field(default_factory=list)
    tables: list[str] = field(default_factory=list)
    has_code: bool = False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ── 分块器 ──────────────────────────────────────────────────
|
|
53
|
+
|
|
54
|
+
class SectionChunker:
    """Split textbook text into chunks along its chapter and section headings.

    A line matching ``CHAPTER_RE`` starts a level-0 chunk; a line matching
    ``SECTION_RE`` starts a level-1 ("2.1") or level-2 ("2.1.1") chunk. Every
    chunk runs until the next heading of any level, or to the end of the text.
    Lines before the first heading are not part of any chunk.
    """

    def __init__(self, text: str):
        """Store *text* split into lines; parsing happens in ``chunk()``."""
        self.lines = text.splitlines()
        self._chunks: list[Chunk] = []

    def chunk(self) -> list[Chunk]:
        """Run the chunking and return the resulting ``Chunk`` list.

        Returns:
            One ``Chunk`` per heading, in document order. Chapter chunks get a
            ``"ch"`` prefix on their id (e.g. ``"ch2"``).
        """
        self._chunks = []
        boundaries = self._find_section_boundaries()

        for start, end in boundaries:
            chunk = self._build_chunk(self.lines[start], start, end, boundaries)

            # Chapters use a dedicated id namespace.
            if chunk.level == 0:
                chunk.chunk_id = f"ch{chunk.chunk_id}"

            # Collect example/table references from the chunk body. Both
            # patterns have two groups, so findall() yields (a, b) tuples;
            # join them into the "a-b" strings the Chunk fields hold.
            body = chunk.text
            chunk.examples = [f"{a}-{b}" for a, b in EXAMPLE_RE.findall(body)]
            chunk.tables = [f"{a}-{b}" for a, b in TABLE_RE.findall(body)]
            chunk.has_code = "```" in body or "<script>" in body

            self._chunks.append(chunk)

        return self._chunks

    # ── Internal helpers ────────────────────────────────────

    def _find_section_boundaries(self) -> list[tuple[int, int]]:
        """Locate every heading line and the line range it governs.

        Returns:
            ``(start, end)`` pairs of 0-based line indices, where ``start`` is
            the heading line and ``end`` is the index of the next heading (or
            the total line count for the last heading).
        """
        header_rows = [
            idx
            for idx, line in enumerate(self.lines)
            if CHAPTER_RE.match(line) or SECTION_RE.match(line)
        ]
        ends = header_rows[1:] + [len(self.lines)]
        return list(zip(header_rows, ends))

    def _build_chunk(
        self,
        header_line: str,
        start: int,
        end: int,
        boundaries: list[tuple[int, int]],
    ) -> Chunk:
        """Build the ``Chunk`` for one heading line and its line range.

        Args:
            header_line: The heading line as it appears in the text.
            start: 0-based index of the heading line.
            end: 0-based index one past the last body line.
            boundaries: All ``(start, end)`` pairs found in the text.

        Returns:
            The chunk with id, title, level, parent id, body text and line
            span filled in; examples/tables/has_code keep their defaults.
        """
        text = "\n".join(self.lines[start + 1 : end])
        span = (start + 1, end)

        chapter = CHAPTER_RE.match(header_line)
        if chapter:
            ch_num = chapter.group(1)
            # NOTE(review): when the text contains a level-1 section of this
            # chapter, the chapter's parent_id becomes its own number (e.g.
            # "2"); otherwise it is None. Kept as-is because consumers of
            # parent_id are not visible here — confirm this is intended.
            parent_id = None
            for sec_start, _ in boundaries:
                sec = SECTION_RE.match(self.lines[sec_start])
                if sec and sec.group(1) == ch_num and sec.group(3) is None:
                    parent_id = ch_num
                    break
            return Chunk(
                chunk_id=ch_num,
                # Strip like section titles so trailing whitespace is dropped.
                title=chapter.group(2).strip(),
                level=0,
                parent_id=parent_id,
                text=text,
                lines=span,
            )

        section = SECTION_RE.match(header_line)
        if section:
            major, minor, sub = section.group(1), section.group(2), section.group(3)
            if sub is None:
                # Level 1, e.g. "2.1 基本语法": parent is the chapter.
                chunk_id, parent_id, level = f"{major}.{minor}", f"ch{major}", 1
            else:
                # Level 2, e.g. "2.1.1 标识符": parent is the section.
                chunk_id, parent_id, level = f"{major}.{minor}.{sub}", f"{major}.{minor}", 2
            return Chunk(
                chunk_id=chunk_id,
                title=section.group(4).strip(),
                level=level,
                parent_id=parent_id,
                text=text,
                lines=span,
            )

        # Fallback for a line that matches neither pattern; not reached for
        # the boundaries produced by _find_section_boundaries().
        return Chunk(
            chunk_id="unknown",
            title=header_line.strip(),
            level=-1,
            parent_id=None,
            text=text,
            lines=span,
        )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ── 入口 ────────────────────────────────────────────────────
|
|
184
|
+
|
|
185
|
+
def chunk_file(txt_path: str | Path) -> list[Chunk]:
    """Read a UTF-8 text file and split it into chunks.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The chunks produced by ``SectionChunker`` for the file content.

    Raises:
        FileNotFoundError: If *txt_path* does not exist.
    """
    source = Path(txt_path)
    if source.exists():
        return SectionChunker(source.read_text(encoding="utf-8")).chunk()
    raise FileNotFoundError(f"文件未找到: {source}")
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def safe_print(text: str = "", **kwargs):
    """Print *text*, degrading gracefully on streams that cannot encode it.

    Args:
        text: The text to print.
        **kwargs: Passed through to ``print`` (``file``, ``end``, ...).

    If the target stream raises ``UnicodeEncodeError`` (e.g. a GBK Windows
    console), the text is printed again with every unencodable character
    replaced by the encoding's replacement marker.
    """
    try:
        print(text, **kwargs)
    except UnicodeEncodeError:
        # Round-tripping through UTF-8 would return the same text and fail
        # again; the lossy round-trip must use the *stream's* encoding.
        stream = kwargs.get("file") or sys.stdout
        encoding = getattr(stream, "encoding", None) or "utf-8"
        printable = text.encode(encoding, errors="replace").decode(encoding, errors="replace")
        print(printable, **kwargs)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def print_summary(chunks: list[Chunk]) -> None:
    """Print a human-readable overview of *chunks* (for manual verification).

    For each chunk this prints its id, level, line span and parent, then its
    title, then the first 60 characters of its text along with tags for
    example count, table count and code presence.
    """
    rule = "=" * 60
    safe_print(f"\n{rule}")
    safe_print(f"共 {len(chunks)} 个 Chunk")
    safe_print(f"{rule}\n")

    for item in chunks:
        span = f"L{item.lines[0]}-{item.lines[1]}"
        parent = f" ← {item.parent_id}" if item.parent_id else ""

        # Empty strings stand for "tag not applicable" and are filtered out.
        tag_parts = (
            f"示例:{len(item.examples)}" if item.examples else "",
            f"表格:{len(item.tables)}" if item.tables else "",
            "含代码" if item.has_code else "",
        )
        tags = [part for part in tag_parts if part]
        tag_str = f" [{', '.join(tags)}]" if tags else ""

        preview = item.text[:60].replace("\n", " ")
        safe_print(f" [{item.chunk_id}] (level={item.level}) {span}{parent}")
        safe_print(f" 标题: {item.title}")
        safe_print(f" 预览: {preview}...{tag_str}")
        safe_print()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ── CLI ─────────────────────────────────────────────────────
|
|
228
|
+
|
|
229
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: chunk a text file and print a summary.

    Usage: kg-chunker <txt_path>
           kg-chunker   (reads the path from the KG_CHUNKER_INPUT environment variable)

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Exits with status 1 when no input path is given either way.
    """
    args = sys.argv[1:] if argv is None else argv

    if len(args) >= 1:
        txt_path = args[0]
    else:
        txt_path = os.environ.get("KG_CHUNKER_INPUT")
        if not txt_path:
            print("用法: kg-chunker <txt_path>", file=sys.stderr)
            print(" 或设置 KG_CHUNKER_INPUT 环境变量", file=sys.stderr)
            sys.exit(1)

    print_summary(chunk_file(txt_path))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
"""
|
|
2
|
+
entity_resolver.py — 实体对齐去重
|
|
3
|
+
|
|
4
|
+
策略:
|
|
5
|
+
1. 精确去重:合并 (name, type) 完全相同的实体
|
|
6
|
+
2. 模糊对齐:用 LLM 判断名称近似但字面不同的实体是否同一概念
|
|
7
|
+
3. 关系更新:去重后更新关系中引用的实体名称
|
|
8
|
+
4. 干净输出:清理 null 属性值
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from openai import OpenAI
|
|
21
|
+
|
|
22
|
+
# Windows terminal UTF-8 support: switch stdout to UTF-8 at import time.
if sys.platform == "win32":
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except Exception:
        # Best effort: if stdout cannot be reconfigured, keep it as it is.
        pass

# ── Lazily created client ───────────────────────────────────
# Shared OpenAI client; None until configure() or _get_client() creates it.
_client_instance: OpenAI | None = None
# httpx client that is handed to the OpenAI client when it is created.
_http_client_instance: httpx.Client | None = None
# Model name passed to chat-completion requests.
_MODEL: str = "gpt-4o-mini"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def configure(api_key: str, base_url: str | None = None, model: str = "gpt-4o-mini") -> None:
    """Explicitly configure the module-level OpenAI client (for programmatic use).

    Args:
        api_key: API key passed to the OpenAI client.
        base_url: Optional API base URL; ``None`` leaves the client default.
        model: Model name used for subsequent chat-completion requests.

    The module globals are only replaced once the new client has been built,
    so a failing call leaves the previous configuration intact. The HTTP
    client of a previous configuration is closed instead of being leaked.
    """
    global _client_instance, _http_client_instance, _MODEL
    import httpx

    http_client = httpx.Client(verify=True, follow_redirects=True)
    try:
        client = OpenAI(api_key=api_key, base_url=base_url, http_client=http_client, timeout=30)
    except Exception:
        # Do not leak the freshly opened connection pool on failure.
        http_client.close()
        raise

    previous_http = _http_client_instance
    _http_client_instance = http_client
    _client_instance = client
    _MODEL = model

    # The replaced HTTP client was created by this module and is no longer
    # reachable through it, so release its connections.
    if previous_http is not None:
        previous_http.close()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _get_client() -> OpenAI:
    """Return the shared OpenAI client, creating it from the environment on first use.

    Reads ``OPENAI_API_KEY`` (required), ``OPENAI_BASE_URL`` (optional) and
    ``LLM_MODEL_NAME`` (default ``"gpt-4o-mini"``).

    Raises:
        RuntimeError: If no client exists yet and ``OPENAI_API_KEY`` is unset or empty.
    """
    global _client_instance, _http_client_instance, _MODEL

    # Already configured (either lazily or through configure()).
    if _client_instance is not None:
        return _client_instance

    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY 未设置。请在 .env 中配置或调用 configure()。")
    base_url = os.environ.get("OPENAI_BASE_URL")
    model = os.environ.get("LLM_MODEL_NAME", "gpt-4o-mini")

    import httpx

    _http_client_instance = httpx.Client(verify=True, follow_redirects=True)
    _client_instance = OpenAI(
        api_key=api_key,
        base_url=base_url,
        http_client=_http_client_instance,
        timeout=30,
    )
    _MODEL = model
    return _client_instance
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ═══════════════════════════════════════════════════════════
|
|
62
|
+
# 1. 精确去重
|
|
63
|
+
# ═══════════════════════════════════════════════════════════
|
|
64
|
+
|
|
65
|
+
def dedup_exact(entities: list[dict]) -> list[dict]:
    """Merge entities whose stripped ``(name, type)`` pair is identical.

    The first entity seen for a key is kept (as a copy). Every later duplicate
    contributes its non-``None`` property values, which overwrite the values
    already stored for the same property.

    Args:
        entities: Entity dicts with string ``"name"`` and ``"type"`` values and
            an optional ``"properties"`` mapping (``None`` is tolerated).

    Returns:
        A new list with one entity per key, in first-seen order. Neither the
        input dicts nor their ``"properties"`` mappings are modified.
    """
    merged: dict[tuple[str, str], dict] = {}
    for entity in entities:
        key = (entity["name"].strip(), entity["type"].strip())
        kept = merged.get(key)

        if kept is None:
            kept = dict(entity)
            # Copy the nested mapping as well: with only a shallow copy, later
            # merges would write into the caller's entity.
            if isinstance(kept.get("properties"), dict):
                kept["properties"] = dict(kept["properties"])
            merged[key] = kept
            continue

        # ``or {}`` also covers an explicit ``"properties": None``.
        for prop, value in (entity.get("properties") or {}).items():
            if value is None:
                continue
            if kept.get("properties") is None:
                kept["properties"] = {}
            kept["properties"][prop] = value

    return list(merged.values())
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ═══════════════════════════════════════════════════════════
|
|
87
|
+
# 2. 属性清理
|
|
88
|
+
# ═══════════════════════════════════════════════════════════
|
|
89
|
+
|
|
90
|
+
def clean_properties(entities: list[dict]) -> list[dict]:
    """Drop every property whose value is ``None``.

    Entities are updated in place: a non-empty ``"properties"`` mapping is
    replaced by a new dict without the ``None`` values. Entities without
    properties, or with an empty mapping, are left untouched.

    Returns:
        The same *entities* list that was passed in.
    """
    for entity in entities:
        props = entity.get("properties")
        if not props:
            continue
        entity["properties"] = {
            name: value for name, value in props.items() if value is not None
        }
    return entities
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ═══════════════════════════════════════════════════════════
|
|
99
|
+
# 3. 模糊对齐(LLM 判断)
|
|
100
|
+
# ═══════════════════════════════════════════════════════════
|
|
101
|
+
|
|
102
|
+
def _find_fuzzy_candidates(entities: list[dict]) -> list[tuple[int, int, str]]:
    """Find pairs of entities whose names look similar.

    Pairs are formed within each type (except ``CodeExample``) and across the
    types ``Topic`` / ``Concept`` / ``Skill``. A pair qualifies when
    ``_name_similar`` scores it above 0.5.

    Args:
        entities: Entity dicts with ``"name"`` and ``"type"`` keys.

    Returns:
        ``(a_idx, b_idx, label)`` triples with ``a_idx < b_idx``. ``label`` is
        the shared type for same-type pairs and ``"TypeA/TypeB"`` for
        cross-type pairs. Same-type pairs come first; within each part the
        pairs are ordered by entity index.
    """
    # Types whose entities may be compared with one another.
    cross_groups = [{"Topic", "Concept", "Skill"}]

    by_type: dict[str, list[tuple[int, str]]] = {}
    for idx, entity in enumerate(entities):
        by_type.setdefault(entity["type"], []).append((idx, entity["name"]))

    candidates: list[tuple[int, int, str]] = []

    # Same-type pairs. Each list is already in index order, so pairing every
    # item with the items after it yields exactly the a_idx < b_idx pairs.
    for etype, items in by_type.items():
        if etype == "CodeExample":
            continue  # decided once per type instead of once per pair
        for pos, (a_idx, a_name) in enumerate(items):
            for b_idx, b_name in items[pos + 1:]:
                if _name_similar(a_name, b_name) > 0.5:
                    candidates.append((a_idx, b_idx, etype))

    # Cross-type pairs. The group is a set, whose iteration order for strings
    # varies between interpreter runs; sorting the pooled items by index
    # makes the candidate order reproducible.
    for group in cross_groups:
        pooled = sorted(
            (item for etype in group for item in by_type.get(etype, [])),
            key=lambda item: item[0],
        )
        for pos, (a_idx, a_name) in enumerate(pooled):
            a_type = entities[a_idx]["type"]
            for b_idx, b_name in pooled[pos + 1:]:
                b_type = entities[b_idx]["type"]
                if a_type == b_type:
                    continue  # already handled by the same-type pass
                if _name_similar(a_name, b_name) > 0.5:
                    candidates.append((a_idx, b_idx, f"{a_type}/{b_type}"))

    return candidates
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _name_similar(a: str, b: str) -> float:
|
|
142
|
+
"""基于字符集合的简单名称相似度"""
|
|
143
|
+
if not a or not b:
|
|
144
|
+
return 0.0
|
|
145
|
+
a_chars, b_chars = set(a), set(b)
|
|
146
|
+
if not a_chars or not b_chars:
|
|
147
|
+
return 0.0
|
|
148
|
+
intersection = a_chars & b_chars
|
|
149
|
+
return 2 * len(intersection) / (len(a_chars) + len(b_chars))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def fuzzy_align(entities: list[dict], client: OpenAI | None = None) -> list[dict]:
    """Merge entities that an LLM judges to be the same concept.

    Candidate pairs come from ``_find_fuzzy_candidates``; each pair is judged
    with ``_judge_same`` and confirmed pairs are grouped transitively. Every
    group is collapsed into one surviving entity, which absorbs the non-``None``
    properties of the others for properties it lacks (or holds as ``None``).

    Args:
        entities: Entity dicts with ``"name"``, ``"type"`` and optional
            ``"properties"``.
        client: OpenAI client forwarded to ``_judge_same``; ``None`` lets it
            use the module's shared client.

    Returns:
        *entities* itself when there is nothing to merge; otherwise a new list
        of entity copies. The input entities are not modified.
    """
    candidates = _find_fuzzy_candidates(entities)
    if not candidates:
        print(" ℹ️ 无可模糊对齐的候选对")
        return entities

    print(f" 🔍 发现 {len(candidates)} 个模糊候选对,正在用 LLM 判断...")

    # Build the merge graph: union every pair the LLM confirms.
    uf = _UnionFind(len(entities))
    for a_idx, b_idx, etype in candidates:
        # A pair that is already in one group cannot change the grouping,
        # so skip the LLM call for it.
        if uf.find(a_idx) == uf.find(b_idx):
            continue
        if _judge_same(entities[a_idx]["name"], entities[b_idx]["name"], etype, client=client) == "same":
            uf.union(a_idx, b_idx)
            print(f" ✅ 同义: '{entities[a_idx]['name']}' ≈ '{entities[b_idx]['name']}'")

    # Collect the members of each group, keyed by the group's root.
    groups: dict[int, list[int]] = {}
    for idx in range(len(entities)):
        groups.setdefault(uf.find(idx), []).append(idx)

    if all(len(members) == 1 for members in groups.values()):
        print(" ℹ️ 无实际合并")
        return entities

    # Survivor preference: lower type rank first, then the shorter name.
    type_rank = {"Concept": 0, "Skill": 1, "Topic": 2, "CodeExample": 3, "TextSegment": 4, "Question": 5}

    def _rank(idx: int) -> tuple[int, int]:
        return type_rank.get(entities[idx]["type"], 9), len(entities[idx]["name"])

    result = []
    for members in groups.values():
        if len(members) == 1:
            result.append(dict(entities[members[0]]))
            continue

        best = min(members, key=_rank)
        survivor = dict(entities[best])
        # Copy the nested mapping so merging never writes into the input.
        if isinstance(survivor.get("properties"), dict):
            survivor["properties"] = dict(survivor["properties"])

        for idx in members:
            if idx == best:
                continue
            print(f" → 合并 '{entities[idx]['name']}'({entities[idx]['type']}) 到 '{entities[best]['name']}'")
            # ``or {}`` also covers an explicit ``"properties": None``.
            for key, value in (entities[idx].get("properties") or {}).items():
                if value is None:
                    continue
                if survivor.get("properties") is None:
                    survivor["properties"] = {}
                # Only fill gaps; the survivor's own values win.
                if survivor["properties"].get(key) is None:
                    survivor["properties"][key] = value
        result.append(survivor)

    print(f" 📊 合并完成: {len(entities)} → {len(result)}")
    return result
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class _UnionFind:
|
|
207
|
+
"""并查集,用于合并同义实体组"""
|
|
208
|
+
def __init__(self, n: int):
|
|
209
|
+
self.parent = list(range(n))
|
|
210
|
+
def find(self, x: int) -> int:
|
|
211
|
+
while self.parent[x] != x:
|
|
212
|
+
self.parent[x] = self.parent[self.parent[x]]
|
|
213
|
+
x = self.parent[x]
|
|
214
|
+
return x
|
|
215
|
+
def union(self, a: int, b: int) -> None:
|
|
216
|
+
ra, rb = self.find(a), self.find(b)
|
|
217
|
+
if ra != rb:
|
|
218
|
+
self.parent[ra] = rb
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _judge_same(name_a: str, name_b: str, etype: str, client: OpenAI | None = None) -> str:
    """Ask the LLM whether two names refer to the same entity.

    Args:
        name_a: First entity name.
        name_b: Second entity name.
        etype: Entity type label included in the prompt.
        client: OpenAI client to use; ``None`` falls back to ``_get_client()``.

    Returns:
        ``"same"`` only if the model's reply is exactly that word (ignoring
        case and surrounding quotes/punctuation); ``"different"`` otherwise,
        including when the request fails.
    """
    llm_client = client or _get_client()
    prompt = f"""判断以下两个实体名称在教材知识图谱中是否指向**同一个概念**。

实体类型: {etype}
名称 A: {name_a}
名称 B: {name_b}

规则:
- 如果 A 和 B 是同一概念的不同表述(如 "箭头函数" 和 "arrow function"),回答 "same"
- 如果是不同的概念,回答 "different"
- 只回答 same 或 different,不要解释"""

    try:
        resp = llm_client.chat.completions.create(
            model=_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=10,
        )
        # The message content may be None; treat that as an empty reply.
        answer = resp.choices[0].message.content or ""
    except Exception as exc:
        # Best effort: a failed request must not abort the whole alignment,
        # but it should be visible instead of silently reading as "different".
        print(f" ⚠️ LLM 判断失败 ('{name_a}' / '{name_b}'): {exc}", file=sys.stderr)
        return "different"

    # The prompt shows the expected answers in quotes, so the reply may come
    # back quoted or with trailing punctuation.
    verdict = answer.strip().strip("\"'`.。!!").strip().lower()
    return "same" if verdict == "same" else "different"
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _pick_canonical(a_idx: int, b_idx: int, entities: list[dict]) -> tuple[int, int]:
|
|
249
|
+
"""选择更规范的名称(更具体的类型优先,其次短名)"""
|
|
250
|
+
a = entities[a_idx]
|
|
251
|
+
b = entities[b_idx]
|
|
252
|
+
# 优先级:Concept > Skill > Topic(越具体越好)
|
|
253
|
+
type_order = {"Concept": 0, "Skill": 1, "Topic": 2, "CodeExample": 3}
|
|
254
|
+
a_score = type_order.get(a["type"], 5)
|
|
255
|
+
b_score = type_order.get(b["type"], 5)
|
|
256
|
+
|
|
257
|
+
if a_score < b_score:
|
|
258
|
+
return a_idx, b_idx
|
|
259
|
+
if b_score < a_score:
|
|
260
|
+
return b_idx, a_idx
|
|
261
|
+
|
|
262
|
+
# 同类型:保留短名(更可能是规范术语)
|
|
263
|
+
if len(a["name"]) <= len(b["name"]):
|
|
264
|
+
return a_idx, b_idx
|
|
265
|
+
return b_idx, a_idx
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# ═══════════════════════════════════════════════════════════
|
|
269
|
+
# 4. 关系更新
|
|
270
|
+
# ═══════════════════════════════════════════════════════════
|
|
271
|
+
|
|
272
|
+
def update_relations(
    relations: list[dict],
    entities: list[dict],
    old_entities: list[dict],
) -> list[dict]:
    """Re-point relations at the names that survived deduplication.

    An old entity whose ``(name, type)`` no longer appears in *entities* is
    mapped to the first entity of the same type whose name contains, or is
    contained in, the old name. Relation ``"source"`` / ``"target"`` values
    found in that mapping are replaced.

    Args:
        relations: Relation dicts with ``"source"`` and ``"target"`` names.
        entities: Entities after deduplication.
        old_entities: Entities before deduplication.

    Returns:
        *relations* itself when no name changed; otherwise a new list of
        relation copies with the renamed endpoints.
    """
    surviving = {(ent["name"], ent["type"]) for ent in entities}

    def _replacement(old: dict):
        """Return the first same-type name overlapping *old*'s name, or None."""
        for cand in entities:
            if cand["type"] != old["type"]:
                continue
            if cand["name"] in old["name"] or old["name"] in cand["name"]:
                return cand["name"]
        return None

    # old name -> new name, only for entities that disappeared in the merge.
    renames: dict[str, str] = {}
    for old in old_entities:
        if (old["name"], old["type"]) in surviving:
            continue
        new_name = _replacement(old)
        if new_name is not None:
            renames[old["name"]] = new_name

    if not renames:
        return relations

    return [
        {
            **rel,
            "source": renames.get(rel["source"], rel["source"]),
            "target": renames.get(rel["target"], rel["target"]),
        }
        for rel in relations
    ]
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# ═══════════════════════════════════════════════════════════
|
|
310
|
+
# 5. 完整去重管线
|
|
311
|
+
# ═══════════════════════════════════════════════════════════
|
|
312
|
+
|
|
313
|
+
def resolve(entities: list[dict], relations: list[dict], skip_fuzzy: bool = True) -> tuple[list[dict], list[dict]]:
    """Run the full entity-resolution pipeline.

    Steps: exact deduplication, property cleanup, optional LLM-based fuzzy
    alignment, then updating the entity names referenced by relations.

    Args:
        entities: Extracted entity dicts.
        relations: Extracted relation dicts.
        skip_fuzzy: When true (the default), the fuzzy alignment step and its
            LLM calls are skipped.

    Returns:
        ``(entities, relations)`` after resolution.
    """
    original = list(entities)

    # Step 1: exact deduplication.
    deduped = dedup_exact(entities)
    print(f" 📊 精确去重: {len(original)} → {len(deduped)}")

    # Step 2: property cleanup.
    cleaned = clean_properties(deduped)
    print(" 🧹 属性清理完成")

    # Step 3: fuzzy alignment (skipped by default to avoid many LLM calls).
    if skip_fuzzy:
        aligned = cleaned
        print(" ⏭️ 跳过模糊对齐(设置 SKIP_FUZZY=false 可启用)")
    else:
        aligned = fuzzy_align(cleaned)
        print(f" 🎯 模糊对齐后: {len(aligned)} 个实体")

    # Step 4: update relation endpoints.
    remapped = update_relations(relations, aligned, original)
    print(f" 🔗 关系更新: {len(remapped)} 条")

    return aligned, remapped
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ═══════════════════════════════════════════════════════════
|
|
340
|
+
# 6. CLI
|
|
341
|
+
# ═══════════════════════════════════════════════════════════
|
|
342
|
+
|
|
343
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: entity alignment and deduplication.

    Usage: kg-resolver <input.json>
           kg-resolver   (uses the KG_RESOLVER_INPUT / KG_RESOLVER_OUTPUT environment variables)

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    The input JSON must hold ``"entities"`` and ``"relations"`` lists. The
    result is written to ``KG_RESOLVER_OUTPUT`` (default
    ``extraction_result_clean.json``). Fuzzy alignment runs only when the
    ``SKIP_FUZZY`` environment variable is set to something other than
    ``true`` / ``1`` / ``yes``. Exits with status 1 when no input path is
    given or the input file does not exist.
    """
    if argv is None:
        argv = sys.argv[1:]

    # Load .env before any environment variable is read.
    from dotenv import load_dotenv
    load_dotenv()

    if len(argv) >= 1:
        src_path = Path(argv[0])
    elif os.environ.get("KG_RESOLVER_INPUT"):
        src_path = Path(os.environ["KG_RESOLVER_INPUT"])
    else:
        print("用法: kg-resolver <input.json>", file=sys.stderr)
        print(" 或设置 KG_RESOLVER_INPUT 环境变量", file=sys.stderr)
        sys.exit(1)

    if not src_path.exists():
        # Errors go to stderr, like the usage message above.
        print(f"❌ 未找到提取结果: {src_path}", file=sys.stderr)
        sys.exit(1)

    data = json.loads(src_path.read_text(encoding="utf-8"))
    entities: list[dict] = data["entities"]
    relations: list[dict] = data["relations"]

    print(f"📥 加载: {len(entities)} 实体, {len(relations)} 关系")

    skip_fuzzy = os.environ.get("SKIP_FUZZY", "true").lower() in ("true", "1", "yes")
    entities, relations = resolve(entities, relations, skip_fuzzy=skip_fuzzy)

    # Save the result.
    output = {
        "entities": entities,
        "relations": relations,
    }
    out_path = Path(os.environ.get("KG_RESOLVER_OUTPUT", "extraction_result_clean.json"))
    out_path.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\n✅ 已保存: {out_path}")
    print(f" 实体: {len(entities)} 个, 关系: {len(relations)} 条")


if __name__ == "__main__":
    main()
|