knowledge-graph-kit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_graph_kit-0.1.0/LICENSE +21 -0
- knowledge_graph_kit-0.1.0/PKG-INFO +120 -0
- knowledge_graph_kit-0.1.0/README.md +93 -0
- knowledge_graph_kit-0.1.0/pyproject.toml +49 -0
- knowledge_graph_kit-0.1.0/setup.cfg +4 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit/__init__.py +42 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit/chunker.py +252 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit/entity_resolver.py +393 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit/extractor.py +419 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit/neo4j_writer.py +273 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit/schema.txt +38 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit.egg-info/PKG-INFO +120 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit.egg-info/SOURCES.txt +15 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit.egg-info/dependency_links.txt +1 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit.egg-info/entry_points.txt +5 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit.egg-info/requires.txt +5 -0
- knowledge_graph_kit-0.1.0/src/knowledge_graph_kit.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: knowledge-graph-kit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 知识图谱构建管线:文本分块、LLM抽取、实体解析、Neo4j写入
|
|
5
|
+
Author-email: hbue_jerry <lovecpp@foxmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jerryhbue/property-graph
|
|
8
|
+
Project-URL: Source, https://github.com/jerryhbue/property-graph
|
|
9
|
+
Project-URL: Tracker, https://github.com/jerryhbue/property-graph/issues
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
22
|
+
Requires-Dist: openai>=1.0.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: neo4j>=5.0.0
|
|
25
|
+
Requires-Dist: httpx>=0.28.0
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# knowledge-graph-kit
|
|
29
|
+
|
|
30
|
+
知识图谱构建管线:从教材文本中抽取实体和关系,写入 Neo4j 图数据库。
|
|
31
|
+
|
|
32
|
+
## 安装
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install knowledge-graph-kit
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## 环境变量
|
|
39
|
+
|
|
40
|
+
| 变量 | 说明 | 默认值 |
|
|
41
|
+
|---|---|---|
|
|
42
|
+
| `OPENAI_API_KEY` | OpenAI API 密钥 | — |
|
|
43
|
+
| `OPENAI_BASE_URL` | OpenAI API 地址(可换兼容 API) | — |
|
|
44
|
+
| `LLM_MODEL_NAME` | 模型名称 | `gpt-4o-mini` |
|
|
45
|
+
| `NEO4J_URI` | Neo4j 连接地址 | `bolt://localhost:7687` |
|
|
46
|
+
| `NEO4J_USERNAME` | Neo4j 用户名 | `neo4j` |
|
|
47
|
+
| `NEO4J_PASSWORD` | Neo4j 密码 | `12345678` |
|
|
48
|
+
|
|
49
|
+
在项目根目录创建 `.env` 文件即可自动加载,或直接在 shell 中设置。
|
|
50
|
+
|
|
51
|
+
## CLI 命令
|
|
52
|
+
|
|
53
|
+
安装后提供 4 个命令行工具:
|
|
54
|
+
|
|
55
|
+
### 1. 文本分块
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
kg-chunker <txt_path>
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
按章节标题将教材文本拆分为语义块。
|
|
62
|
+
|
|
63
|
+
### 2. 实体关系抽取
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
kg-extractor <txt_path>
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
基于本体 schema 引导 LLM 抽取实体和关系,输出 `extraction_result.json`。
|
|
70
|
+
|
|
71
|
+
可通过 `KG_SCHEMA_PATH` 环境变量指定自定义 ontology 文件。
|
|
72
|
+
|
|
73
|
+
### 3. 实体解析去重
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
kg-resolver <input.json>
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
精确/模糊去重实体,清理属性,更新关系引用,输出 `extraction_result_clean.json`。
|
|
80
|
+
|
|
81
|
+
### 4. 写入 Neo4j
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
kg-neo4j-writer [input.json]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
将清洗后的结果写入 Neo4j 图数据库。默认尝试读取 `extraction_result_clean.json` 或 `extraction_result.json`。
|
|
88
|
+
|
|
89
|
+
## 程序化使用
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from knowledge_graph_kit import chunk_file, Neo4jWriter
|
|
93
|
+
|
|
94
|
+
# 文本分块
|
|
95
|
+
chunks = chunk_file("教材.txt")
|
|
96
|
+
|
|
97
|
+
# 配置 OpenAI
|
|
98
|
+
from knowledge_graph_kit.extractor import configure
|
|
99
|
+
configure(api_key="sk-xxx", model="gpt-4o")
|
|
100
|
+
|
|
101
|
+
# 写入 Neo4j
|
|
102
|
+
writer = Neo4jWriter(
|
|
103
|
+
uri="bolt://localhost:7687",
|
|
104
|
+
user="neo4j",
|
|
105
|
+
password="your-password",
|
|
106
|
+
)
|
|
107
|
+
writer.write_entities(entities)
|
|
108
|
+
writer.write_relations(relations)
|
|
109
|
+
writer.close()
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## 管线流程
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
txt 文件 → 分块(chunker) → LLM抽取(extractor) → 实体解析(resolver) → Neo4j写入(writer)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
MIT
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# knowledge-graph-kit
|
|
2
|
+
|
|
3
|
+
知识图谱构建管线:从教材文本中抽取实体和关系,写入 Neo4j 图数据库。
|
|
4
|
+
|
|
5
|
+
## 安装
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install knowledge-graph-kit
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 环境变量
|
|
12
|
+
|
|
13
|
+
| 变量 | 说明 | 默认值 |
|
|
14
|
+
|---|---|---|
|
|
15
|
+
| `OPENAI_API_KEY` | OpenAI API 密钥 | — |
|
|
16
|
+
| `OPENAI_BASE_URL` | OpenAI API 地址(可换兼容 API) | — |
|
|
17
|
+
| `LLM_MODEL_NAME` | 模型名称 | `gpt-4o-mini` |
|
|
18
|
+
| `NEO4J_URI` | Neo4j 连接地址 | `bolt://localhost:7687` |
|
|
19
|
+
| `NEO4J_USERNAME` | Neo4j 用户名 | `neo4j` |
|
|
20
|
+
| `NEO4J_PASSWORD` | Neo4j 密码 | `12345678` |
|
|
21
|
+
|
|
22
|
+
在项目根目录创建 `.env` 文件即可自动加载,或直接在 shell 中设置。
|
|
23
|
+
|
|
24
|
+
## CLI 命令
|
|
25
|
+
|
|
26
|
+
安装后提供 4 个命令行工具:
|
|
27
|
+
|
|
28
|
+
### 1. 文本分块
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
kg-chunker <txt_path>
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
按章节标题将教材文本拆分为语义块。
|
|
35
|
+
|
|
36
|
+
### 2. 实体关系抽取
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
kg-extractor <txt_path>
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
基于本体 schema 引导 LLM 抽取实体和关系,输出 `extraction_result.json`。
|
|
43
|
+
|
|
44
|
+
可通过 `KG_SCHEMA_PATH` 环境变量指定自定义 ontology 文件。
|
|
45
|
+
|
|
46
|
+
### 3. 实体解析去重
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
kg-resolver <input.json>
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
精确/模糊去重实体,清理属性,更新关系引用,输出 `extraction_result_clean.json`。
|
|
53
|
+
|
|
54
|
+
### 4. 写入 Neo4j
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
kg-neo4j-writer [input.json]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
将清洗后的结果写入 Neo4j 图数据库。默认尝试读取 `extraction_result_clean.json` 或 `extraction_result.json`。
|
|
61
|
+
|
|
62
|
+
## 程序化使用
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from knowledge_graph_kit import chunk_file, Neo4jWriter
|
|
66
|
+
|
|
67
|
+
# 文本分块
|
|
68
|
+
chunks = chunk_file("教材.txt")
|
|
69
|
+
|
|
70
|
+
# 配置 OpenAI
|
|
71
|
+
from knowledge_graph_kit.extractor import configure
|
|
72
|
+
configure(api_key="sk-xxx", model="gpt-4o")
|
|
73
|
+
|
|
74
|
+
# 写入 Neo4j
|
|
75
|
+
writer = Neo4jWriter(
|
|
76
|
+
uri="bolt://localhost:7687",
|
|
77
|
+
user="neo4j",
|
|
78
|
+
password="your-password",
|
|
79
|
+
)
|
|
80
|
+
writer.write_entities(entities)
|
|
81
|
+
writer.write_relations(relations)
|
|
82
|
+
writer.close()
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## 管线流程
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
txt 文件 → 分块(chunker) → LLM抽取(extractor) → 实体解析(resolver) → Neo4j写入(writer)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
MIT
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "knowledge-graph-kit"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "知识图谱构建管线:文本分块、LLM抽取、实体解析、Neo4j写入"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "hbue_jerry", email = "lovecpp@foxmail.com" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"python-dotenv>=1.0.0",
|
|
27
|
+
"openai>=1.0.0",
|
|
28
|
+
"pydantic>=2.0.0",
|
|
29
|
+
"neo4j>=5.0.0",
|
|
30
|
+
"httpx>=0.28.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/jerryhbue/property-graph"
|
|
35
|
+
Source = "https://github.com/jerryhbue/property-graph"
|
|
36
|
+
Tracker = "https://github.com/jerryhbue/property-graph/issues"
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
kg-chunker = "knowledge_graph_kit.chunker:main"
|
|
40
|
+
kg-extractor = "knowledge_graph_kit.extractor:main"
|
|
41
|
+
kg-resolver = "knowledge_graph_kit.entity_resolver:main"
|
|
42
|
+
kg-neo4j-writer = "knowledge_graph_kit.neo4j_writer:main"
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.packages.find]
|
|
45
|
+
where = ["src"]
|
|
46
|
+
include = ["knowledge_graph_kit*"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.package-data]
|
|
49
|
+
knowledge_graph_kit = ["schema.txt"]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""knowledge-graph-kit: 知识图谱构建管线。
|
|
2
|
+
|
|
3
|
+
提供从文本分块、LLM 抽取、实体解析到 Neo4j 写入的完整管线。
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
|
|
8
|
+
from .chunker import Chunk, SectionChunker, chunk_file
|
|
9
|
+
from .entity_resolver import (
|
|
10
|
+
clean_properties,
|
|
11
|
+
dedup_exact,
|
|
12
|
+
fuzzy_align,
|
|
13
|
+
resolve,
|
|
14
|
+
)
|
|
15
|
+
from .extractor import (
|
|
16
|
+
Entity,
|
|
17
|
+
ExtractionResult,
|
|
18
|
+
Relation,
|
|
19
|
+
build_system_prompt,
|
|
20
|
+
extract_from_chunk,
|
|
21
|
+
merge_results,
|
|
22
|
+
parse_schema,
|
|
23
|
+
)
|
|
24
|
+
from .neo4j_writer import Neo4jWriter
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"Chunk",
|
|
28
|
+
"SectionChunker",
|
|
29
|
+
"chunk_file",
|
|
30
|
+
"Entity",
|
|
31
|
+
"Relation",
|
|
32
|
+
"ExtractionResult",
|
|
33
|
+
"parse_schema",
|
|
34
|
+
"build_system_prompt",
|
|
35
|
+
"extract_from_chunk",
|
|
36
|
+
"merge_results",
|
|
37
|
+
"dedup_exact",
|
|
38
|
+
"clean_properties",
|
|
39
|
+
"fuzzy_align",
|
|
40
|
+
"resolve",
|
|
41
|
+
"Neo4jWriter",
|
|
42
|
+
]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
chunker.py — Section-Aware Chunking
|
|
3
|
+
|
|
4
|
+
从教材 txt 文件读取内容,按多级标题(2.1 / 2.1.1 / ...)拆分为语义块,
|
|
5
|
+
每个 Chunk 保留层级元数据,供后续 LLM 抽取实体和关系。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import sys
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
# Windows terminal UTF-8 support: at import time, try to switch stdout to UTF-8.
if sys.platform == "win32":
    try:
        sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[attr-defined]
    except Exception:
        # Best effort: if stdout cannot be reconfigured, keep its current encoding.
        pass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ── Regular expressions ─────────────────────────────────────
# Section / subsection headings such as "2.1 基本语法" or "2.1.1标识符"
# (whitespace between the number and the title is optional).
# Groups: 1 = first number, 2 = second number, 3 = optional third number, 4 = title.
SECTION_RE = re.compile(r"^(\d+)\.(\d+)(?:\.(\d+))?\s*(.+)$")
# Chapter headings such as "第2章 ES基础".  Groups: 1 = chapter number, 2 = title.
CHAPTER_RE = re.compile(r"^第(\d+)章\s+(.+)$")
# Example labels such as "【示例2-1】" or "【示例2-10】" (two numeric groups).
EXAMPLE_RE = re.compile(r"【示例(\d+)-(\d+)】")
# Table labels such as "表2-1" or "表2-10" (two numeric groups).
TABLE_RE = re.compile(r"表(\d+)-(\d+)")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ── 数据结构 ────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
@dataclass
class Chunk:
    """One semantic block of the source text, delimited by headings.

    Attributes:
        chunk_id: Identifier of the block, e.g. "2.1" or "2.1.1".
        title: Heading text without its number, e.g. "基本语法".
        level: Heading depth: 0 = chapter, 1 = section, 2 = subsection.
        parent_id: Identifier of the parent block, or ``None`` when absent.
        text: Body of the block, excluding the heading line itself.
        lines: ``(start_line, end_line)`` of the block, 1-indexed.
        examples: Example labels referenced in the body, e.g. "2-1".
        tables: Table labels referenced in the body, e.g. "2-1".
        has_code: Whether the body contains a code sample.
    """

    chunk_id: str
    title: str
    level: int
    parent_id: Optional[str]
    text: str
    lines: tuple[int, int]
    examples: list[str] = field(default_factory=list)
    tables: list[str] = field(default_factory=list)
    has_code: bool = False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ── 分块器 ──────────────────────────────────────────────────
|
|
53
|
+
|
|
54
|
+
class SectionChunker:
    """Split textbook text into chunks along its chapter/section headings.

    A line is a heading when it matches ``CHAPTER_RE`` (level 0) or
    ``SECTION_RE`` (level 1 for ``N.M``, level 2 for ``N.M.K``).  Each chunk
    runs from its heading up to, but not including, the next heading of any
    level.  Lines before the first heading do not belong to any chunk.
    """

    def __init__(self, text: str):
        """Store *text* split into lines; chunking happens in ``chunk()``."""
        self.lines = text.splitlines()
        self._chunks: list[Chunk] = []

    def chunk(self) -> list[Chunk]:
        """Run the chunker and return the chunks in document order.

        Chapter chunks get their id prefixed with ``"ch"`` (e.g. ``"ch2"``) so
        that it matches the ``parent_id`` assigned to level-1 sections.
        """
        self._chunks = []
        boundaries = self._find_section_boundaries()

        for start, end in boundaries:
            chunk = self._build_chunk(self.lines[start], start, end, boundaries)

            # Chapter headings use a dedicated id namespace.
            if chunk.level == 0:
                chunk.chunk_id = f"ch{chunk.chunk_id}"

            body = chunk.text
            # Both patterns have two capture groups, so findall() yields
            # (chapter, index) tuples; join them into "2-1"-style labels to
            # honour the list[str] contract of Chunk.examples / Chunk.tables.
            chunk.examples = [f"{ch}-{no}" for ch, no in EXAMPLE_RE.findall(body)]
            chunk.tables = [f"{ch}-{no}" for ch, no in TABLE_RE.findall(body)]
            chunk.has_code = "```" in body or "<script>" in body

            self._chunks.append(chunk)

        return self._chunks

    # ── Internal helpers ────────────────────────────────────

    def _find_section_boundaries(self) -> list[tuple[int, int]]:
        """Return ``(start, end)`` index pairs, one per heading line.

        ``start`` is the 0-based index of the heading line and ``end`` is the
        0-based index of the next heading (or ``len(self.lines)`` for the last
        one), i.e. an exclusive upper bound.
        """
        starts = [
            idx
            for idx, line in enumerate(self.lines)
            if CHAPTER_RE.match(line) or SECTION_RE.match(line)
        ]
        ends = starts[1:] + [len(self.lines)]
        return list(zip(starts, ends))

    def _build_chunk(
        self,
        header_line: str,
        start: int,
        end: int,
        boundaries: list[tuple[int, int]],
    ) -> Chunk:
        """Build the Chunk for the heading at index *start*.

        Args:
            header_line: The raw heading line (``self.lines[start]``).
            start: 0-based index of the heading line.
            end: 0-based exclusive end index of the chunk.
            boundaries: All heading boundaries; accepted for signature
                compatibility and not used.

        Returns:
            A Chunk whose ``text`` is the lines after the heading and whose
            ``lines`` is ``(start + 1, end)``: the 1-indexed heading line and
            the 1-indexed last line of the chunk.
        """
        body = "\n".join(self.lines[start + 1 : end])
        span = (start + 1, end)

        chapter = CHAPTER_RE.match(header_line)
        if chapter:
            # A chapter is the top of the hierarchy, so it has no parent.
            # (Previously it was given its own bare number, e.g. "2", which
            # does not correspond to any chunk id.)
            return Chunk(
                chunk_id=chapter.group(1),
                title=chapter.group(2),
                level=0,
                parent_id=None,
                text=body,
                lines=span,
            )

        section = SECTION_RE.match(header_line)
        if section:
            major, minor, sub, raw_title = section.groups()

            if sub is None:
                # Level 1, e.g. "2.1 基本语法": parent is the chapter chunk.
                chunk_id = f"{major}.{minor}"
                parent_id = f"ch{major}"
                level = 1
            else:
                # Level 2, e.g. "2.1.1 标识符": parent is the enclosing section.
                chunk_id = f"{major}.{minor}.{sub}"
                parent_id = f"{major}.{minor}"
                level = 2

            return Chunk(
                chunk_id=chunk_id,
                title=raw_title.strip(),
                level=level,
                parent_id=parent_id,
                text=body,
                lines=span,
            )

        # Fallback for a line that matches neither pattern; not reached for
        # boundaries produced by _find_section_boundaries().
        return Chunk(
            chunk_id="unknown",
            title=header_line.strip(),
            level=-1,
            parent_id=None,
            text=body,
            lines=span,
        )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ── 入口 ────────────────────────────────────────────────────
|
|
184
|
+
|
|
185
|
+
def chunk_file(txt_path: str | Path) -> list[Chunk]:
    """Read a UTF-8 text file and split it into chunks.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The chunks produced by ``SectionChunker`` for the file's content.

    Raises:
        FileNotFoundError: If *txt_path* does not exist.
    """
    txt_path = Path(txt_path)
    if not txt_path.exists():
        raise FileNotFoundError(f"文件未找到: {txt_path}")

    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM.
    # With plain "utf-8" the BOM stays glued to the first line, so a heading
    # on line 1 would never match the ^-anchored heading patterns.
    text = txt_path.read_text(encoding="utf-8-sig")
    return SectionChunker(text).chunk()
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def safe_print(text: str = "", **kwargs):
    """Print *text*, degrading gracefully when the stream cannot encode it.

    Args:
        text: The text to print.
        **kwargs: Passed through to ``print`` (``file``, ``end``, ...).

    If printing raises ``UnicodeEncodeError`` (e.g. a Windows GBK console),
    the text is printed again with every unencodable character replaced by
    ``?``.
    """
    try:
        print(text, **kwargs)
    except UnicodeEncodeError:
        # Re-encode with the *target stream's* encoding.  A UTF-8 round trip
        # would hand back the same text and fail again in exactly the same way.
        stream = kwargs.get("file")
        if stream is None:
            stream = sys.stdout
        encoding = getattr(stream, "encoding", None) or "ascii"
        printable = text.encode(encoding, errors="replace").decode(encoding)
        print(printable, **kwargs)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def print_summary(chunks: list[Chunk]) -> None:
    """Print an overview of *chunks* (intended for manual verification).

    For every chunk this prints its id, level, line span and parent, then its
    title, then the first 60 characters of its text together with tags for
    referenced examples, referenced tables and code.
    """
    rule = "=" * 60
    safe_print(f"\n{rule}")
    safe_print(f"共 {len(chunks)} 个 Chunk")
    safe_print(f"{rule}\n")

    for item in chunks:
        span = f"L{item.lines[0]}-{item.lines[1]}"

        parent_suffix = ""
        if item.parent_id:
            parent_suffix = f" ← {item.parent_id}"

        labels = []
        if item.examples:
            labels.append(f"示例:{len(item.examples)}")
        if item.tables:
            labels.append(f"表格:{len(item.tables)}")
        if item.has_code:
            labels.append("含代码")

        label_suffix = ""
        if labels:
            label_suffix = f" [{', '.join(labels)}]"

        # Flatten newlines so the preview stays on a single output line.
        preview = item.text[:60].replace("\n", " ")

        safe_print(f" [{item.chunk_id}] (level={item.level}) {span}{parent_suffix}")
        safe_print(f" 标题: {item.title}")
        safe_print(f" 预览: {preview}...{label_suffix}")
        safe_print()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ── CLI ─────────────────────────────────────────────────────
|
|
228
|
+
|
|
229
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: chunk a text file and print a summary.

    Usage: kg-chunker <txt_path>
           kg-chunker              (reads the KG_CHUNKER_INPUT environment variable)

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Exits with status 1 when neither an argument nor a non-empty
    KG_CHUNKER_INPUT is available.
    """
    args = sys.argv[1:] if argv is None else argv
    env_path = os.environ.get("KG_CHUNKER_INPUT")

    # The positional argument wins over the environment variable.
    if len(args) >= 1:
        txt_path = args[0]
    elif env_path:
        txt_path = env_path
    else:
        print("用法: kg-chunker <txt_path>", file=sys.stderr)
        print(" 或设置 KG_CHUNKER_INPUT 环境变量", file=sys.stderr)
        sys.exit(1)

    print_summary(chunk_file(txt_path))
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
|