omni-split 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omni_split/__init__.py +16 -0
- omni_split/base/__init__.py +0 -0
- omni_split/base/chonkie_base.py +139 -0
- omni_split/base/chonkie_tokenizer.py +285 -0
- omni_split/base/chonkie_types.py +519 -0
- omni_split/base/md2json_list.py +303 -0
- omni_split/base/md_json_list2chunk.py +348 -0
- omni_split/main.py +73 -0
- omni_split/model/text_chunker_tokenizer/qwen_tokenizer.json +303282 -0
- omni_split/omni_split.py +93 -0
- omni_split/sub_chunker/__init__.py +0 -0
- omni_split/sub_chunker/document_split.py +32 -0
- omni_split/sub_chunker/markdown_split.py +47 -0
- omni_split/sub_chunker/text_split.py +343 -0
- omni_split/test.py +80 -0
- omni_split/utils/__init__.py +0 -0
- omni_split/utils/base_utils.py +181 -0
- omni_split/utils/download_test_doc.py +61 -0
- omni_split-0.0.3.dist-info/METADATA +147 -0
- omni_split-0.0.3.dist-info/RECORD +23 -0
- omni_split-0.0.3.dist-info/WHEEL +5 -0
- omni_split-0.0.3.dist-info/licenses/LICENSE +21 -0
- omni_split-0.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
from mistletoe import Document
|
|
2
|
+
from mistletoe.block_token import Heading, Paragraph, BlockCode, List, ListItem, Quote, Table, TableRow, TableCell, ThematicBreak, CodeFence
|
|
3
|
+
from mistletoe.span_token import Strong, Emphasis, Link, Image, RawText
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
# # {
|
|
8
|
+
# "type": "text",
|
|
9
|
+
# "text": "Game-theoretic Workflow in this Paper In this paper, we integrate game-theoretic principles into. the reasoning processes of LLMs prior to decision-making. By guiding the models to derive rational. strategies and make decisions based on these strategies, we aim to enhance their ability to perform effectively in strategic settings. ",
|
|
10
|
+
# "page_idx": 4
|
|
11
|
+
# },
|
|
12
|
+
# {
|
|
13
|
+
# "type": "text",
|
|
14
|
+
# "text": "Game-theoretic LLM: Agent Workflow for Negotiation Games ",
|
|
15
|
+
# "text_level": 1,
|
|
16
|
+
# "page_idx": 0
|
|
17
|
+
# },
|
|
18
|
+
# {
|
|
19
|
+
# "type": "image",
|
|
20
|
+
# "img_path": "images/9d516d401a6bdc168c3000baaf9f12a86aac5e2b07edb16344678c15119aebb1.jpg",
|
|
21
|
+
# "img_caption": [
|
|
22
|
+
# "Figure 1: Game-theoretic Landscape Investigated in this Paper. "
|
|
23
|
+
# ],
|
|
24
|
+
# "img_footnote": [],
|
|
25
|
+
# "page_idx": 3
|
|
26
|
+
# },
|
|
27
|
+
# {
|
|
28
|
+
# "type": "table",
|
|
29
|
+
# "img_path": "images/f20e0b1b16500e94a492a40b553e7e3ae8ffe66d69de3c9252ee11b32982775b.jpg",
|
|
30
|
+
# "table_caption": [
|
|
31
|
+
# "Table 3b: Payoff matrix for Wait-Go Game "
|
|
32
|
+
# ],
|
|
33
|
+
# "table_footnote": [],
|
|
34
|
+
# "table_body": "\n\n<html><body><table><tr><td></td><td>Wait</td><td>Go</td></tr><tr><td>Wait</td><td>0,0</td><td>0, 2</td></tr><tr><td>Go</td><td>2,0</td><td>-4,-4</td></tr></table></body></html>\n\n",
|
|
35
|
+
# "page_idx": 7
|
|
36
|
+
# },
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def is_markdown_equal(md_str):
    """Return True when *md_str* is a single stand-alone display-math block.

    A display-math block starts with ``$$``, ends with ``$$``, and contains
    no further ``$$`` delimiter in between, so ``"$$a$$ text $$b$$"`` (two
    separate equations) is rejected.
    """
    md_str = md_str.strip()  # ignore surrounding whitespace
    # Tempered dot: every interior character must not begin another "$$".
    # This fixes the original lazy pattern `\$\$(.*?)\$\$`, which (under
    # fullmatch) still accepted strings with embedded "$$" delimiters.
    return bool(re.fullmatch(r"\$\$(?:(?!\$\$).)*\$\$", md_str, re.DOTALL))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def is_markdown_table(md_str):
    """Return True when *md_str* looks like a table.

    Accepts either the ``<html><body><table>...`` wrapper emitted by some
    PDF extractors, or a multi-line Markdown pipe table whose second line
    is a valid alignment/separator row (``---``, ``:--``, ``--:``...).
    """
    stripped = md_str.strip()

    # HTML-wrapped tables are recognised verbatim.
    if stripped.startswith("<html><body><table>") and stripped.endswith("</table></body></html>"):
        return True

    rows = [ln.strip() for ln in stripped.split("\n") if ln.strip()]
    # Empty input, or any row lacking a pipe, disqualifies the candidate.
    if not rows or any("|" not in ln for ln in rows):
        return False
    # A single line is never treated as a table (avoids LaTeX false positives).
    if len(rows) < 2:
        return False

    # The second row must consist solely of separator cells.
    cells = (c.strip() for c in rows[1].split("|"))
    return all(re.fullmatch(r"^:?-+:?$", c) for c in cells if c)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def split_image_url_func(text):
    """Split *text* into alternating plain-text and ``![](url)`` image segments.

    The capturing group keeps the image tokens in the result; empty
    fragments produced by the split are discarded.
    """
    fragments = re.split(r"(!\[\]\([^)]+\))", text.strip('"'))
    return [frag for frag in fragments if frag]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def md2json_list_func(md_content):
    """Convert a Markdown string into a flat list of typed dicts.

    Each top-level block of the document becomes one (or more) entries in the
    shape used by MinerU-style extractors: text items carry "text" /
    "text_level" / "page_idx"; images carry "img_path" / "img_caption" /
    "img_footnote"; tables carry "table_body" / "table_caption" /
    "table_footnote"; display equations carry "text" / "text_format".
    "page_idx" is always None here because a plain Markdown source has no
    page information.
    """
    # Parse the Markdown into a mistletoe AST.
    doc = Document(md_content)

    result = []

    # Walk the top-level block tokens.
    for child in doc.children:
        # Headings -> text items with a text_level.
        if isinstance(child, Heading):
            level = child.level
            content = get_inline_md(child.children) if hasattr(child, "children") and child.children else ""
            result.append({"text": content, "type": "text", "text_level": level, "page_idx": None})

        # Paragraphs: may be a bare image, or text possibly containing
        # embedded images / tables / display equations.
        elif isinstance(child, Paragraph):
            # A paragraph whose only child is an Image becomes an image item.
            if hasattr(child, "children") and child.children and len(child.children) == 1 and isinstance(child.children[0], Image):
                img = child.children[0]
                result.append(
                    {
                        "type": "image",
                        "img_path": img.src,
                        # NOTE(review): the image title is reused for both
                        # caption and footnote — confirm this is intentional.
                        "img_caption": [img.title if hasattr(img, "title") else ""],
                        "img_footnote": [img.title if hasattr(img, "title") else ""],
                        "page_idx": None,
                    },
                )
            else:
                content = get_inline_md(child.children) if hasattr(child, "children") and child.children else ""
                # Split the rendered text so embedded ![](url) images become
                # their own segments.
                split_content_list = split_image_url_func(content)
                for split_content in split_content_list:
                    if split_content.startswith("![") and split_content.endswith(")"):
                        # Image segment: strip the "![](" prefix and ")" suffix.
                        temp = {
                            "type": "image",
                            "img_path": split_content[4:-1],
                            "img_caption": [""],
                            "img_footnote": [""],
                            "page_idx": None,
                        }

                    elif is_markdown_table(split_content):
                        # Table segment.
                        temp = {
                            "type": "table",
                            "img_path": None,
                            "table_caption": [""],
                            "table_footnote": [""],
                            "table_body": f"{split_content}\n",
                            "page_idx": None,
                        }

                    elif is_markdown_equal(split_content):
                        # Stand-alone display equation (not inline math).
                        temp = {
                            "type": "equation",
                            "text": f"{split_content}\n",
                            "text_format": "latex",
                            "page_idx": None,
                        }

                    else:
                        # Everything else is treated as ordinary text.
                        temp = {
                            "text": f"{split_content}\n",
                            "type": "text",
                            "page_idx": None,
                        }

                    result.append(temp)
        # Code blocks (indented or fenced) -> code items.
        elif isinstance(child, (BlockCode, CodeFence)):
            if isinstance(child, BlockCode):
                # Indented code block: wrap in ``` fences manually.
                code_content = child.children[0].content if hasattr(child, "children") and child.children else ""
                language = ""
                code_content = f"```\n{code_content}\n```"
            else:
                # Fenced block: rebuild the original fence and language tag.
                code_content = child.children[0].content if hasattr(child, "children") and child.children else ""
                language = getattr(child, "language", "")

                fence_char = getattr(child, "fence_char", "`")  # ` or ~
                fence_length = getattr(child, "fence_length", 3)  # usually 3
                fence = fence_char * fence_length

                code_content = f"{fence}{language}\n{code_content}\n{fence}"

            result.append({"content": code_content, "type": "code", "language": language})

        # Lists -> one text item, one "- " line per list item.
        elif isinstance(child, List):
            items = []
            if hasattr(child, "children") and child.children:
                for item in child.children:
                    if isinstance(item, ListItem):
                        item_content = get_inline_md(item.children) if hasattr(item, "children") and item.children else ""
                        items.append(item_content)

            temp_str = ""
            for item in items:
                temp_str += "- " + item + "\n"
            temp = {
                "text": temp_str,
                "type": "text",
                "page_idx": None,
            }
            result.append(temp)

        # Block quotes -> plain text items.
        elif isinstance(child, Quote):
            content = get_inline_md(child.children) if hasattr(child, "children") and child.children else ""
            temp = {
                "text": str(content)+"\n",
                "type": "text",
                "page_idx": None,
            }
            result.append(temp)

        # Tables -> pipe-table Markdown rebuilt into "table_body".
        elif isinstance(child, Table):
            table_content = ""
            if hasattr(child, "children") and child.children:
                # Header row plus a "---" separator line.
                if hasattr(child, "header") and child.header:
                    header_row = child.header
                    if hasattr(header_row, "children") and header_row.children:
                        table_content += "| " + " | ".join(get_inline_md(cell.children) if hasattr(cell, "children") else "" for cell in header_row.children if isinstance(cell, TableCell)) + " |\n"
                        table_content += "| " + " | ".join(["---"] * len(header_row.children)) + " |\n"

                # NOTE(review): skips child.children[0], presumably because it
                # duplicates the header — confirm against mistletoe's Table layout.
                for row in child.children[1:] if hasattr(child, "children") else []:
                    if isinstance(row, TableRow) and hasattr(row, "children") and row.children:
                        table_content += "| " + " | ".join(get_inline_md(cell.children) if hasattr(cell, "children") else "" for cell in row.children if isinstance(cell, TableCell)) + " |\n"
            result.append(
                {
                    "type": "table",
                    "img_path": None,
                    "table_caption": [""],
                    "table_footnote": [""],
                    "table_body": table_content.strip(),
                    "page_idx": None,
                }
            )
        # Horizontal rules -> "---" text items.
        # NOTE(review): uses a "content" key instead of "text", unlike every
        # other text item — downstream consumers may never see these.
        elif isinstance(child, ThematicBreak):
            result.append(
                {
                    "content": "---\n",
                    "type": "text",
                    "page_idx": None,
                }
            )
        # Paragraph-level $$...$$ display math.
        elif is_math_block(child):
            content = child.children[0].content.strip() if hasattr(child, "children") and child.children and len(child.children) > 0 else ""
            result.append(
                {
                    "type": "equation",
                    "text": content,
                    "text_format": "latex",
                    "page_idx": None,
                }
            )

    return result
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def get_inline_md(tokens):
    """Render a list of mistletoe span tokens back into inline Markdown text.

    Handles RawText, Strong (**), Emphasis (*), Link, Image, and inline
    ``$...$`` math; any other token with children is rendered recursively.
    Returns "" for an empty or falsy token list.
    """
    if not tokens:
        return ""

    md = ""
    for token in tokens:
        if isinstance(token, RawText):
            md += token.content if hasattr(token, "content") else ""
        elif isinstance(token, Strong):
            md += f"**{get_inline_md(token.children) if hasattr(token, 'children') else ''}**"
        elif isinstance(token, Emphasis):
            md += f"*{get_inline_md(token.children) if hasattr(token, 'children') else ''}*"
        elif isinstance(token, Link):
            md += f"[{get_inline_md(token.children) if hasattr(token, 'children') else ''}]({token.target if hasattr(token, 'target') else ''})"
        elif isinstance(token, Image):
            # NOTE(review): the published source was garbled here
            # (`md += f" else ''})"`); reconstructed as standard image
            # syntax — alt text from the children, URL from token.src.
            # Confirm against the original intent.
            md += f"![{get_inline_md(token.children) if hasattr(token, 'children') else ''}]({token.src if hasattr(token, 'src') else ''})"
        elif is_math_inline(token):
            # Unreachable in practice: RawText is matched by the first branch
            # before is_math_inline (which also requires RawText) is tested.
            content = token.children[0].content if hasattr(token, "children") and token.children and len(token.children) > 0 else ""
            md += f"${content}$"
        elif hasattr(token, "children"):
            md += get_inline_md(token.children) if token.children else ""
    return md
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def is_math_block(token):
    """Return True when *token* is a paragraph wrapping exactly one
    RawText run shaped like display math (``$$...$$``)."""
    if not isinstance(token, Paragraph):
        return False
    children = getattr(token, "children", None)
    if not children or len(children) != 1:
        return False
    only = children[0]
    text = only.content.strip() if hasattr(only, "content") else ""
    return isinstance(only, RawText) and text.startswith("$$") and text.endswith("$$")
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def is_math_inline(token):
    """Return True when *token* is a RawText run shaped like inline math
    (``$...$``)."""
    if not isinstance(token, RawText):
        return False
    text = token.content.strip() if hasattr(token, "content") else ""
    # len > 1 rejects a lone "$", which both starts and ends with "$".
    return text.startswith("$") and text.endswith("$") and len(text) > 1
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
if __name__ == "__main__":
    # Smoke test: parse a locally extracted Markdown document and dump the
    # resulting JSON list to stdout.
    # NOTE(review): the path below is a developer-local fixture and will not
    # exist on other machines.
    print("main function invoke")
    with open("./test/c8d4614affc19ba92d7ba0671fd709803d0488a0c5a68bc237783a8af39fe32e/1c7fbb26-1012-4b03-894c-69ab2257985c_1743677710.4311144.md", "r") as f:
        md_content = f.read()

    json_list = md2json_list_func(md_content)
    print(json.dumps(json_list, indent=4, ensure_ascii=False))
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
# Module-level configuration for the chunking pipeline.

# Placeholder tags that temporarily stand in for non-text elements while text
# is being merged and split; the real content is restored afterwards by
# restore_placeholders().
element_placeholder = {"equation": "[EQUATION]", "image": "[IMAGE]", "table": "[TABLE]"}
# Marker inserted where an equation was merged into the preceding text block
# (a "pseudo equation"); later removed by replace_PSEUDO_EQUATION_FLAG().
PSEUDO_EQUATION_FLAG = "[PSEUDO_EQUATION]"

# Placeholder-map entries have the shape (placeholder, element_type, content).
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# ---- 核心功能函数 ----
|
|
17
|
+
def count_words(text, placeholder_size_map=None):
    """Approximate a Word-style word count for mixed Chinese/English text.

    Chinese characters (including CJK punctuation) count one each, runs of
    ASCII letters/digits count as one word, and isolated special symbols
    count one each. When *placeholder_size_map* is given, every
    ``[TABLE]_n`` / ``[IMAGE]_n`` / ``[EQUATION]_n`` placeholder found in
    *text* contributes the word count of the content it stands for, minus a
    rough one-word allowance for the placeholder token itself.
    """
    # TODO: a tokenizer-based count would be more accurate.
    cjk = len(re.findall(r"[\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]", text))
    latin_words = len(re.findall(r"\b[a-zA-Z0-9]+\b", text))
    lone_symbols = len(re.findall(r"(?<!\S)[^\w\u4e00-\u9fa5](?!\S)", text))
    total = cjk + latin_words + lone_symbols

    if placeholder_size_map:
        # Placeholders look like [TYPE]_index, e.g. [TABLE]_0, [IMAGE]_1.
        for tag in re.findall(r"\[(?:TABLE|IMAGE|EQUATION)\]_\d+", text):
            if tag in placeholder_size_map:
                # Add the real content's size, then subtract roughly one word
                # for the placeholder text already counted above. This is a
                # conservative (slightly oversized) estimate.
                total += placeholder_size_map[tag] - 1

    return total
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def split_text_by_words(text, max_words, soft_words, placeholder_size_map=None):
    """Split *text* into word-limited chunks, preserving original newlines.

    Sentences are accumulated until either the hard limit (*max_words*,
    flushed before overflow) or the soft limit (*soft_words*, flushed after
    reaching it) triggers a new chunk.
    """
    # Split at natural sentence boundaries; newlines also act as separators
    # so consecutive tables/images can be pulled apart.
    pieces = [p for p in re.split(r"([。!?;\.\?!]|\n)", text) if p]

    chunks = []
    buffer = []
    running = 0

    for idx in range(0, len(pieces), 2):
        # Re-attach the delimiter (if any) to the sentence it terminates.
        sentence = pieces[idx]
        if idx + 1 < len(pieces):
            sentence += pieces[idx + 1]
        size = count_words(sentence, placeholder_size_map)

        # Hard limit: flush before this sentence would overflow max_words.
        if running + size > max_words and buffer:
            chunks.append("".join(buffer))
            buffer = []
            running = 0

        buffer.append(sentence)
        running += size

        # Soft limit: flush once the buffer is comfortably large.
        if running >= soft_words:
            chunks.append("".join(buffer))
            buffer = []
            running = 0

    if buffer:
        chunks.append("".join(buffer))

    # Collapse triple newlines left over at flushed boundaries.
    return [chunk.replace("\n\n\n", "\n") for chunk in chunks]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def find_balanced_split(text):
    """Return an index near the midpoint of *text* that lands just after a
    sentence/line boundary, falling back to the exact midpoint."""
    mid = len(text) // 2
    boundaries = ("\n", "。", ";", "!", "?", ";")

    # Prefer a boundary at or after the midpoint (look ahead up to 300 chars).
    for pos in range(mid, min(mid + 300, len(text))):
        if text[pos] in boundaries:
            return pos + 1  # keep the delimiter in the left half

    # Otherwise scan backwards, up to 300 characters before the midpoint.
    for pos in range(mid, max(mid - 300, 0), -1):
        if text[pos] in boundaries:
            return pos + 1

    return mid
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---- 文档结构处理 ----
|
|
103
|
+
def process_sections(data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT, placeholder_size_map):
    """Group items under their level-1 section titles and flush each section
    into word-limited chunks via flush_section()."""
    output = []
    title = None
    body_parts = []

    # A lone item is demoted so it is treated as plain body text.
    if len(data) == 1:
        data[0]["text_level"] = 0

    for entry in data:
        if not is_section_title(entry):
            body_parts.append(entry["text"])
            continue
        # A new title starts a section: flush whatever belonged to the old one.
        if title is not None:
            flush_section(title, body_parts, output, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT, placeholder_size_map)
        title = entry["text"]
        body_parts = []

    # Flush the trailing (or only) section.
    flush_section(title, body_parts, output, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT, placeholder_size_map)
    return output
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def flush_section(title, parts, output, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT, placeholder_size_map):
    """Append one section's content to *output* as one or more chunks.

    - fits in MAX_CHUNK_WORDS: a single chunk;
    - up to HARD_LIMIT: split once near the middle at a sentence boundary;
    - larger: sentence-level splitting via split_text_by_words().

    ``title`` may be None (content before the first heading); the original
    code raised TypeError there by concatenating None with a string.
    """
    # Guard against a missing title (body text preceding any heading).
    full_text = (title or "") + "\n".join(parts)
    if not full_text.strip():
        return

    word_count = count_words(full_text, placeholder_size_map)

    # NOTE(review): full_text already begins with the title, and build_chunk
    # prepends it again, duplicating the heading in the output — preserved
    # as-is; confirm whether the duplication is intentional.
    if word_count <= MAX_CHUNK_WORDS:
        # Small enough for a single chunk.
        output.append(build_chunk(title, full_text))
    elif MAX_CHUNK_WORDS < word_count <= HARD_LIMIT:
        # Slightly too big: split once near the middle at a sentence boundary.
        split_pos = find_balanced_split(full_text)
        output.append(build_chunk(title, full_text[:split_pos]))
        output.append(build_chunk(None, full_text[split_pos:]))
    else:
        # Far too big: sentence-level splitting with soft/hard word limits.
        chunks = split_text_by_words(full_text, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, placeholder_size_map)
        for i, chunk in enumerate(chunks):
            output.append(build_chunk(title if i == 0 else None, chunk))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def build_chunk(title, text):
    """Wrap *text* (optionally prefixed by *title*) into a text-chunk dict."""
    body = text.strip()
    if not title:
        return {"type": "text", "text": body}
    return {"type": "text", "text": f"{title}\n{body}"}
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def is_section_title(item):
    """Return True for a level-1 heading item (type "text", text_level 1)."""
    if item.get("type") != "text":
        return False
    return item.get("text_level") == 1
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# 生成唯一占位符并记录映射
|
|
156
|
+
def generate_placeholder(element_type, content, placeholder_map, placeholder_size_map):
    """Create a unique placeholder tag for *content* and record it.

    Returns (placeholder, placeholder_map, placeholder_size_map); both maps
    are mutated in place and returned for caller convenience.
    """
    # e.g. "[TABLE]_0", "[IMAGE]_1" — the index is the running map length.
    tag = element_placeholder[element_type] + f"_{len(placeholder_map)}"
    placeholder_map.append((tag, element_type, content))
    # Remember how many words the real content holds so size checks can
    # account for it while only the placeholder is present in the text.
    placeholder_size_map[tag] = count_words(content)
    return tag, placeholder_map, placeholder_size_map
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# 根据占位符还原文本内容
|
|
166
|
+
def restore_placeholders(text, placeholder_map):
    """Substitute every recorded placeholder in *text* with its original content."""
    for tag, _element_type, content in placeholder_map:
        # The membership test skips a scan-and-copy for absent tags.
        if tag in text:
            text = text.replace(tag, content)
    return text
|
|
172
|
+
def replace_PSEUDO_EQUATION_FLAG(text, replace_text="\n"):
    """Swap every pseudo-equation marker in *text* for *replace_text*."""
    cleaned = text.replace(PSEUDO_EQUATION_FLAG, replace_text)
    return cleaned
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def merge_element(prev, current, placeholder_map, placeholder_size_map):
    """Merge a non-text element (image / table / equation) into the text stream.

    - image: rendered as Markdown ``![](path) caption`` and merged into the
      preceding text block.
    - table: caption placed above the table body, merged into the preceding
      text block.
    - equation: merged into the preceding text block and flagged as a
      "pseudo equation" so pre_handle_func() may also merge the following
      text block into it.

    The element content is replaced by a placeholder (see
    generate_placeholder) and restored later by restore_placeholders().

    Returns (prev, current, placeholder_map, placeholder_size_map);
    ``current`` is None when it was absorbed into ``prev``.
    """
    if current["type"] in ["equation", "image", "table"]:
        if current["type"] == "image":
            # Image: take img_path and img_caption (list or string).
            img_path = current.get("img_path", "[Image path missing]")
            img_cap = current.get("img_caption")
            if isinstance(img_cap, list):
                img_caption_text = " ".join(img_cap) if img_cap else ""
            else:
                img_caption_text = img_cap if img_cap else ""
            # Markdown image followed by its caption.
            # NOTE(review): the published source was garbled here and left
            # img_path unused (`f" {img_caption_text}"`); reconstructed as
            # "![](path) caption" — confirm against the original intent.
            placeholder_content = f"![]({img_path}) {img_caption_text}"
        elif current["type"] == "table":
            # Table: caption (list or string, may be absent) above the body.
            table_caption = current.get("table_caption")
            if isinstance(table_caption, list):
                table_caption_text = " ".join(table_caption) if table_caption else ""
            else:
                table_caption_text = table_caption if table_caption else ""
            # Body from table_body, falling back to the text field.
            table_body = current.get("table_body", current.get("text", "[Table content missing]"))
            placeholder_content = f"{table_caption_text}\n{table_body}"
        else:  # equation
            placeholder_content = current.get("text", "[Equation]")

        # Swap the real content for a placeholder and record the mapping.
        placeholder, placeholder_map, placeholder_size_map = generate_placeholder(current["type"], placeholder_content, placeholder_map, placeholder_size_map)

        # Merge rules: equations also merge with the following text (flagged);
        # images and tables merge only with the preceding text.
        if current["type"] == "equation":
            if prev and prev["type"] == "text":
                prev["text"] += PSEUDO_EQUATION_FLAG + placeholder
                prev["is_pseudo_equation"] = True  # mark for downstream merge
                return prev, None, placeholder_map, placeholder_size_map
        else:
            if prev and prev["type"] == "text":
                prev["text"] += "\n" + placeholder + "\n"
                return prev, None, placeholder_map, placeholder_size_map

    return prev, current, placeholder_map, placeholder_size_map
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def pre_handle_func(data):
    """Normalise the raw element list before section processing.

    - Inserts a leading empty text item when the document starts with a
      non-text element, so there is always an "上文" to merge into.
    - Guarantees every item has "text" and "text_level" keys.
    - Drops empty text items while keeping image/equation/table items and
      level-1 titles.
    - Merges consecutive level-1 titles, folds non-text elements into the
      surrounding text via merge_element(), and joins pseudo-equation text
      with its following paragraph when the combined size stays reasonable.

    Returns (processed_data, placeholder_map, placeholder_size_map).
    """
    # Ensure there is a text predecessor for the very first element.
    if data and data[0]["type"] != "text":
        data.insert(0, {"type": "text", "text": "", "text_level": 1})

    # Ensure "text" and "text_level" exist everywhere to avoid KeyError.
    for item in data:
        if "text" not in item:
            item["text"] = ""
        if "text_level" not in item:
            item["text_level"] = 0

    # Keep non-empty text, all non-text elements, and all level-1 titles.
    filtered_data = [
        item for item in data
        if item.get("text") != "" or item["type"] in ["image", "equation", "table"] or item.get("text_level") == 1
    ]

    processed_data = []
    previous_item = None
    placeholder_map = []
    placeholder_size_map = {}  # placeholder -> word count of its real content

    # Main pass: merge titles, fold elements, join pseudo-equation text.
    for idx, item in enumerate(filtered_data):
        if previous_item:
            # Merge consecutive level-1 titles. BUGFIX: the original tested
            # `filtered_data[idx-1] == 1`, comparing a dict to 1 (always
            # False), so adjacent titles were never merged; compare the
            # previous item's text_level instead.
            if previous_item["type"] == "text" and item["type"] == "text":
                if previous_item.get("text_level") == 1 and item.get("text_level") == 1:
                    previous_item["text"] += "-" + item["text"]
                    continue

            # Fold images/tables into the previous text; equations merge with
            # the previous text and are flagged for a forward merge too.
            previous_item, item, placeholder_map, placeholder_size_map = merge_element(previous_item, item, placeholder_map, placeholder_size_map)
            if item is None:
                continue

            # Forward merge for pseudo equations.
            if previous_item.get("is_pseudo_equation", False):
                # A title terminates the pseudo-equation run.
                if item.get("text_level") == 1:
                    processed_data.append(previous_item)
                    previous_item = item
                    continue

                # Placeholder-aware sizes, so a large table/image hidden
                # behind a placeholder cannot create an oversized merge.
                prev_len = count_words(previous_item["text"], placeholder_size_map)
                curr_len = count_words(item["text"], placeholder_size_map)

                if prev_len + curr_len > 1200:
                    processed_data.append(previous_item)
                    previous_item = item
                    continue

                # Merge the pseudo-equation text with the following paragraph.
                previous_item["text"] += item["text"]
                previous_item["is_pseudo_equation"] = False
                continue

            processed_data.append(previous_item)

        previous_item = item

    if previous_item:
        processed_data.append(previous_item)
    return processed_data, placeholder_map, placeholder_size_map
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def markdown_json_list2chunk_list(data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
    """End-to-end pipeline: raw element list -> final chunk list.

    Pre-processes the elements (placeholder substitution and merging),
    splits them into word-limited sections, restores placeholders, and
    drops empty text chunks.
    """
    prepared, placeholder_map, placeholder_size_map = pre_handle_func(data=data)

    sectioned = process_sections(prepared, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT, placeholder_size_map)

    # Restore placeholders and pseudo-equation markers in every text chunk.
    for chunk in sectioned:
        if chunk["type"] == "text":
            chunk["text"] = restore_placeholders(chunk["text"], placeholder_map)
            chunk["text"] = replace_PSEUDO_EQUATION_FLAG(chunk["text"], "\n")

    # Post-filter: discard text chunks left empty after restoration.
    return [
        chunk for chunk in sectioned
        if not (chunk.get("type") == "text" and not chunk.get("text", "").strip())
    ]
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
if __name__ == "__main__":
    # Standalone run: read an extracted element list from temp.json, chunk
    # it, and write the result. The knobs below are word counts as measured
    # by count_words().
    ## config:
    MAX_CHUNK_WORDS = 1000  # preferred maximum words per chunk
    SOFT_CHUNK_WORDS = 400  # flush threshold during sentence-level splitting
    HARD_LIMIT = 1400  # above MAX but <= this: split once near the middle
    ##
    with open("temp.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    final_handle_data = markdown_json_list2chunk_list(data, MAX_CHUNK_WORDS=MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS=SOFT_CHUNK_WORDS, HARD_LIMIT=HARD_LIMIT)
    # Save the updated JSON file.
    with open("output4-yyb-en2.json", "w", encoding="utf-8") as file:
        json.dump(final_handle_data, file, ensure_ascii=False, indent=4)
|