omni-split 0.0.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of omni-split might be problematic.
- omni_split/__init__.py +16 -0
- omni_split/base/__init__.py +0 -0
- omni_split/base/chonkie_base.py +139 -0
- omni_split/base/chonkie_tokenizer.py +285 -0
- omni_split/base/chonkie_types.py +519 -0
- omni_split/base/md2json_list.py +303 -0
- omni_split/base/md_json_list2chunk.py +310 -0
- omni_split/base/native_text_split_utils4content2.py +306 -0
- omni_split/main.py +73 -0
- omni_split/model/text_chunker_tokenizer/qwen_tokenizer.json +303282 -0
- omni_split/omni_split.py +93 -0
- omni_split/sub_chunker/__init__.py +0 -0
- omni_split/sub_chunker/document_split.py +32 -0
- omni_split/sub_chunker/markdown_split.py +47 -0
- omni_split/sub_chunker/text_split.py +343 -0
- omni_split/test.py +80 -0
- omni_split/utils/__init__.py +0 -0
- omni_split/utils/base_utils.py +181 -0
- omni_split/utils/download_test_doc.py +61 -0
- omni_split-0.0.1rc0.dist-info/METADATA +139 -0
- omni_split-0.0.1rc0.dist-info/RECORD +24 -0
- omni_split-0.0.1rc0.dist-info/WHEEL +5 -0
- omni_split-0.0.1rc0.dist-info/licenses/LICENSE +21 -0
- omni_split-0.0.1rc0.dist-info/top_level.txt +1 -0
omni_split/base/native_text_split_utils4content2.py
ADDED
@@ -0,0 +1,306 @@
import json
import re

# Set the text length limits.

# Define the pseudo-equation flag.


# Define the placeholder prefixes.
element_placeholder = {"equation": "[EQUATION]", "image": "[IMAGE]", "table": "[TABLE]"}
PSEUDO_EQUATION_FLAG = "[PSEUDO_EQUATION]"

# Holds the placeholder mappings, format: (placeholder, element_type, content).


# ---- Core functions ----
def count_words(text):
    """A more accurate word count, close to Word's counting rules."""
    # Count Chinese characters (including CJK punctuation).
    chinese_count = len(re.findall(r"[\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]", text))
    # Count English words (alphanumeric runs).
    english_words = re.findall(r"\b[a-zA-Z0-9]+\b", text)
    # Count special symbols (standalone characters that are neither CJK nor alphanumeric).
    special_chars = re.findall(r"(?<!\S)[^\w\u4e00-\u9fa5](?!\S)", text)

    return chinese_count + len(english_words) + len(special_chars)
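
# Hand-traced sanity check (illustrative, assumed input): each CJK character
# counts as one word and each alphanumeric run as one word, so
#   count_words("你好 world, 123!")  ->  2 + 2 = 4
# The comma and "!" sit next to non-space characters, so the standalone-symbol
# pattern does not count them.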
def split_text_by_words(text, max_words, soft_words):
    """Smart chunking algorithm."""
    chunks = []
    current_count = 0
    buffer = []

    # Split on natural breakpoints.
    sentences = re.split(r"([。!?;\.\?!;\n])", text)
    sentences = [s for s in sentences if s.strip()]

    for i in range(0, len(sentences), 2):
        sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "")
        sentence_word_count = count_words(sentence)

        # Hard split condition: flush before the buffer would exceed max_words.
        if current_count + sentence_word_count > max_words:
            if buffer:
                chunks.append("".join(buffer))
                buffer = []
                current_count = 0

        buffer.append(sentence)
        current_count += sentence_word_count

        # Soft split condition: flush once the buffer reaches soft_words.
        if current_count >= soft_words:
            chunks.append("".join(buffer))
            buffer = []
            current_count = 0

    if buffer:
        chunks.append("".join(buffer))
    return chunks
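
# Hand-traced sketch (illustrative, assumed inputs): with max_words=6 and
# soft_words=4, every four-word sentence below reaches the soft limit on its
# own and is flushed immediately:
#   split_text_by_words("一二三。四五六。七八九。", 6, 4)
#   ->  ["一二三。", "四五六。", "七八九。"]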
def find_balanced_split(text):
    """Find the best split point near the middle of the text."""
    mid = len(text) // 2
    split_chars = ["\n", "。", ";", "!", "?", ";"]

    # Scan forward from the midpoint.
    for i in range(mid, min(mid + 300, len(text))):
        if text[i] in split_chars:
            return i + 1  # keep the delimiter with the first half

    # Scan backward from the midpoint.
    for i in range(mid, max(mid - 300, 0), -1):
        if text[i] in split_chars:
            return i + 1

    return mid
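
# Hand-traced sketch (illustrative, assumed input): the midpoint of
# "abc。defgh" is index 4 and no break character lies ahead of it, so the
# backward scan finds "。" at index 3 and returns 4:
#   find_balanced_split("abc。defgh")  ->  4   # "abc。" + "defgh"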
# ---- Document structure handling ----
def process_sections(data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
    """Process the section structure."""
    processed = []
    current_title = None
    accumulated = []

    for item in data:
        if is_section_title(item):
            if current_title is not None:
                flush_section(current_title, accumulated, processed, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT)
            current_title = item["text"]
            accumulated = []
        else:
            accumulated.append(item["text"])

    flush_section(current_title, accumulated, processed, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT)
    return processed


def flush_section(title, parts, output, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
    """Emit the chunks for a single section."""
    full_text = "\n".join(parts)
    if not full_text.strip():
        return

    word_count = count_words(full_text)

    if word_count <= MAX_CHUNK_WORDS:
        output.append(build_chunk(title, full_text))
    elif MAX_CHUNK_WORDS < word_count <= HARD_LIMIT:
        split_pos = find_balanced_split(full_text)
        output.append(build_chunk(title, full_text[:split_pos]))
        output.append(build_chunk(None, full_text[split_pos:]))
    else:
        chunks = split_text_by_words(full_text, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS)
        for i, chunk in enumerate(chunks):
            output.append(build_chunk(title if i == 0 else None, chunk))


def build_chunk(title, text):
    """Build a chunk record."""
    if title:
        return {"type": "text", "text": f"{title}\n{text.strip()}"}
    return {"type": "text", "text": text.strip()}


def is_section_title(item):
    """Return True if the item is a section title."""
    return item.get("type") == "text" and item.get("text_level") == 1
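
# Hand-traced sketch of the three size regimes in flush_section (thresholds
# taken from the __main__ block below): with MAX_CHUNK_WORDS=1000,
# SOFT_CHUNK_WORDS=400 and HARD_LIMIT=1400,
#   - a 900-word section is emitted as a single chunk,
#   - a 1200-word section is split once at find_balanced_split(),
#   - a 3000-word section falls through to split_text_by_words().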
# Generate a unique placeholder and record the mapping.
def generate_placeholder(element_type, content, placeholder_map):
    placeholder = element_placeholder[element_type] + f"_{len(placeholder_map)}"
    placeholder_map.append((placeholder, element_type, content))
    print(f"Generated placeholder: {placeholder} for {element_type} with content:\n{content}\n")
    return placeholder, placeholder_map


# Restore the original content for every placeholder found in the text.
def restore_placeholders(text, placeholder_map):
    # Replace longer placeholders first so that "[IMAGE]_1" can never clobber
    # the prefix of "[IMAGE]_10".
    for placeholder, element_type, content in sorted(placeholder_map, key=lambda m: len(m[0]), reverse=True):
        if placeholder in text:
            # Substitute the previously generated content.
            text = text.replace(placeholder, content)
    return text
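
# Hand-traced placeholder round trip (illustrative, assumed inputs):
#   placeholder, m = generate_placeholder("image", "![](a.png) caption", [])
#   placeholder                                    ->  "[IMAGE]_0"
#   restore_placeholders(f"see {placeholder}", m)  ->  "see ![](a.png) caption"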
def merge_element(prev, current, placeholder_map):
    """
    * @description:
    # Merge elements:
    # - Images: build a Markdown "![](img_path) caption" reference and merge it into the preceding text.
    # - Tables: place the table caption (e.g. table_caption) above the table body and merge into the preceding text.
    # - Equations: real equations merge with the surrounding text (original logic unchanged); pseudo equations merge with the following text.
    * @param prev :
    * @param current :
    * @return
    """
    if current["type"] in ["equation", "image", "table"]:
        if current["type"] == "image":
            # For images, take img_path and img_caption.
            img_path = current.get("img_path", "[Image path missing]")
            img_cap = current.get("img_caption")
            if isinstance(img_cap, list):
                img_caption_text = " ".join(img_cap) if img_cap else ""
            else:
                img_caption_text = img_cap if img_cap else ""
            # Markdown format: image reference followed by its caption.
            placeholder_content = f"![]({img_path}) {img_caption_text}"
        elif current["type"] == "table":
            # For tables, fetch the caption (table_caption) first; empty if absent.
            table_caption = current.get("table_caption")
            if isinstance(table_caption, list):
                table_caption_text = " ".join(table_caption) if table_caption else ""
            else:
                table_caption_text = table_caption if table_caption else ""
            # The table body comes from table_body, falling back to the text field.
            table_body = current.get("table_body", current.get("text", "[Table content missing]"))
            # Build the content: caption above, then the table body on a new line.
            placeholder_content = f"{table_caption_text}\n{table_body}"
        else:  # equation
            placeholder_content = current.get("text", "[Equation]")

        # Generate the placeholder and record the mapping.
        placeholder, placeholder_map = generate_placeholder(current["type"], placeholder_content, placeholder_map)

        # Merge rules:
        # equations merge with the surrounding text; images and tables merge only with the preceding text.
        if current["type"] == "equation":
            if prev and prev["type"] == "text":
                # Append the equation to the previous text segment and flag it as a pseudo equation.
                prev["text"] += PSEUDO_EQUATION_FLAG + placeholder
                prev["is_pseudo_equation"] = True  # mark as pseudo equation
                return prev, None, placeholder_map
        else:
            if prev and prev["type"] == "text":
                prev["text"] += "\n" + placeholder + "\n"
                return prev, None, placeholder_map

    return prev, current, placeholder_map
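
# Hand-traced sketch (illustrative, assumed inputs): an equation is folded
# into the preceding text item via a placeholder plus the pseudo-equation flag:
#   prev = {"type": "text", "text": "As shown in "}
#   cur  = {"type": "equation", "text": "$E=mc^2$"}
#   merge_element(prev, cur, [])
#   ->  prev["text"] == "As shown in [PSEUDO_EQUATION][EQUATION]_0",
#       prev["is_pseudo_equation"] is True, and cur is consumed (None returned).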
def pre_handle_func(data):
    """
    * @description: pre-process the data
    * @param data :
    * @return
    """
    # If the first element is not text, insert an empty text item at the front
    # so there is always preceding text to merge into.
    if data and data[0]["type"] != "text":
        data.insert(0, {"type": "text", "text": "", "text_level": 1})

    # Make sure every element has "text" and "text_level" keys, avoiding KeyError.
    for item in data:
        if "text" not in item:
            item["text"] = ""
        if "text_level" not in item:
            item["text_level"] = 0

    # Drop items whose "text" is empty, but keep image/equation/table items
    # and items with text_level 1.
    filtered_data = [
        item for item in data
        if item.get("text") != "" or item["type"] in ["image", "equation", "table"] or item.get("text_level") == 1
    ]
    processed_data = []
    previous_item = None
    placeholder_map = []
    # Main pass: merge text, split long text, and so on.
    for item in filtered_data:
        if previous_item:
            # Merge adjacent "text_level": 1 items (consecutive titles), but do
            # not merge when the text contains an abstract or keywords.
            if previous_item["type"] == "text" and item["type"] == "text":
                if previous_item.get("text_level") == 1 and item.get("text_level") == 1:
                    previous_item["text"] += "-" + item["text"]
                    continue

            # Merge elements (images and tables with the preceding text,
            # equations with the surrounding text).
            previous_item, item, placeholder_map = merge_element(previous_item, item, placeholder_map)
            if item is None:
                continue

            # Handle merging of pseudo equations.
            if previous_item.get("is_pseudo_equation", False):
                # Do not merge if the next text segment is a title.
                if item.get("text_level") == 1:
                    processed_data.append(previous_item)
                    previous_item = item
                    continue

                # Do not merge if the current segment plus the next segment
                # would exceed 1200 characters.
                if len(previous_item["text"]) + len(item["text"]) > 1200:
                    processed_data.append(previous_item)
                    previous_item = item
                    continue

                # Otherwise merge the pseudo equation with the next text segment.
                previous_item["text"] += item["text"]
                previous_item["is_pseudo_equation"] = False  # clear the pseudo-equation flag
                continue

            processed_data.append(previous_item)

        previous_item = item

    if previous_item:
        processed_data.append(previous_item)
    return processed_data, placeholder_map
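
# Hand-traced sketch (illustrative, assumed input): a content list that starts
# with an image gets an empty title item inserted in front, so the image is
# merged into it as a placeholder:
#   processed, mapping = pre_handle_func(
#       [{"type": "image", "img_path": "a.png", "img_caption": []}]
#   )
#   ->  processed[0]["text"] contains "[IMAGE]_0", and mapping records the
#       image's Markdown content for later restoration.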
def split_md_func(data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
    """
    * @description: main flow; organise and assemble the chunks
    * @param data :
    * @param MAX_CHUNK_WORDS :
    * @param SOFT_CHUNK_WORDS :
    * @param HARD_LIMIT :
    * @return
    """
    pre_handle_data, placeholder_map = pre_handle_func(data=data)

    middle_handle_data = process_sections(pre_handle_data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT)
    # Restore the placeholders in one pass, making sure every placeholder in
    # the text is replaced correctly.
    for item in middle_handle_data:
        if item["type"] == "text":
            item["text"] = restore_placeholders(item["text"], placeholder_map)
    final_handle_data = middle_handle_data
    return final_handle_data
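
# End-to-end sketch (illustrative, assumed input; thresholds mirror the
# __main__ block below):
#   data = [
#       {"type": "text", "text": "Intro", "text_level": 1},
#       {"type": "text", "text": "Body paragraph."},
#   ]
#   split_md_func(data, MAX_CHUNK_WORDS=1000, SOFT_CHUNK_WORDS=400, HARD_LIMIT=1400)
#   ->  [{"type": "text", "text": "Intro\nBody paragraph."}]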
if __name__ == "__main__":
    ## config:
    MAX_CHUNK_WORDS = 1000
    SOFT_CHUNK_WORDS = 400
    HARD_LIMIT = 1400
    ##
    # with open("/media/disk0/xzzn_data_all/yinyabo/omni_split/test/c8d4614affc19ba92d7ba0671fd709803d0488a0c5a68bc237783a8af39fe32e/1c7fbb26-1012-4b03-894c-69ab2257985c_1743677710.4311144_content_list.json", "r", encoding="utf-8") as file:
    with open("1c7fbb26-1012-4b03-894c-69ab2257985c_1743677710.4311144_content_list.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    final_handle_data = split_md_func(data, MAX_CHUNK_WORDS=MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS=SOFT_CHUNK_WORDS, HARD_LIMIT=HARD_LIMIT)
    # Save the updated JSON file.
    with open("output4-yyb-en4.json", "w", encoding="utf-8") as file:
        json.dump(final_handle_data, file, ensure_ascii=False, indent=4)
omni_split/main.py
ADDED
@@ -0,0 +1,73 @@
import json
from omni_split import OmniSplit
from omni_split import word_preprocessing_and_return_bytesIO

omni_spliter = OmniSplit()
## note: test text split
test_text = True
if test_text:
    with open("omni_split/test/text_test.txt", "r") as f:
        text_content = "".join(f.readlines())
    res = omni_spliter.text_chunk_func(text_content)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

## note: test markdown json split
test_markdown = True
if test_markdown:
    with open("omni_split/test/json_list_test.json", "r") as f:
        md_content_json = json.load(f)
    res = omni_spliter.markdown_json_chunk_func(md_content_json)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.markdown_json_chunk_func(md_content_json, clear_model=True)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

## note: test markdown split
test_markdown = True
if test_markdown:
    with open("omni_split/test/markdown_test.md", "r") as f:
        md_content = f.read()
    res = omni_spliter.markdown_chunk_func(md_content)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.markdown_chunk_func(md_content, clear_model=True)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)


## note: test word split
test_document = True
if test_document:
    new_doc_io = word_preprocessing_and_return_bytesIO("omni_split/test/docx_test.docx")
    res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False, save_local_images_dir="./images")
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)

    res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=True)
    for item in res:
        print(item)
        print("------------")
    print("=" * 10)