omni-split 0.0.1rc0__py3-none-any.whl


@@ -0,0 +1,306 @@
+ import json
+ import re
+
+ # Text length limits (MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT) are supplied by the caller
+
+ # Placeholder prefixes for non-text elements
+ element_placeholder = {"equation": "[EQUATION]", "image": "[IMAGE]", "table": "[TABLE]"}
+ # Marker appended to text that has absorbed a pseudo-equation
+ PSEUDO_EQUATION_FLAG = "[PSEUDO_EQUATION]"
+
+ # Placeholder mappings are stored as tuples: (placeholder, element_type, content)
+
+
+ # ---- Core utility functions ----
+ def count_words(text):
+     """More precise word counting (approximates Word's counting rules)."""
+     # Count CJK characters (including CJK and full-width punctuation)
+     chinese_count = len(re.findall(r"[\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]", text))
+     # Count English words (alphanumeric runs)
+     english_words = re.findall(r"\b[a-zA-Z0-9]+\b", text)
+     # Count free-standing special symbols (neither word characters nor CJK)
+     special_chars = re.findall(r"(?<!\S)[^\w\u4e00-\u9fa5](?!\S)", text)
+
+     return chinese_count + len(english_words) + len(special_chars)
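A quick sanity check of the counting rules above (illustrative only; the inputs are made up and this snippet is not part of the package):

assert count_words("你好 world 123") == 4   # 2 CJK chars + "world" + "123"
assert count_words("分块 test - ok") == 5   # 2 CJK chars + "test" + "ok" + a free-standing "-"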
+
+
+ def split_text_by_words(text, max_words, soft_words):
+     """Chunk text at natural sentence boundaries."""
+     chunks = []
+     current_count = 0
+     buffer = []
+
+     # Split at natural break points, keeping each delimiter attached to its
+     # sentence; pairing before filtering avoids misaligning the text/delimiter
+     # pairs when a segment is empty or whitespace-only
+     parts = re.split(r"([。!?;\.\?!;\n])", text)
+     sentences = []
+     for i in range(0, len(parts), 2):
+         sentence = parts[i] + (parts[i + 1] if i + 1 < len(parts) else "")
+         if sentence.strip():
+             sentences.append(sentence)
+
+     for sentence in sentences:
+         sentence_word_count = count_words(sentence)
+
+         # Hard split: adding this sentence would exceed max_words
+         if current_count + sentence_word_count > max_words:
+             if buffer:
+                 chunks.append("".join(buffer))
+                 buffer = []
+                 current_count = 0
+
+         buffer.append(sentence)
+         current_count += sentence_word_count
+
+         # Soft split: the buffer has reached soft_words
+         if current_count >= soft_words:
+             chunks.append("".join(buffer))
+             buffer = []
+             current_count = 0
+
+     if buffer:
+         chunks.append("".join(buffer))
+     return chunks
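A small illustrative call (made-up input): with soft_words=4, every sentence here fills a chunk on its own, because the full-width "。" itself counts as one unit under count_words:

chunks = split_text_by_words("一二三。四五六。七八九。", max_words=10, soft_words=4)
assert chunks == ["一二三。", "四五六。", "七八九。"]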
+
+
+ def find_balanced_split(text):
+     """Find the best near-middle split point."""
+     mid = len(text) // 2
+     split_chars = ["\n", "。", ";", "!", "?", ";"]
+
+     # Search forward from the midpoint
+     for i in range(mid, min(mid + 300, len(text))):
+         if text[i] in split_chars:
+             return i + 1  # include the delimiter
+
+     # Search backward from the midpoint
+     for i in range(mid, max(mid - 300, 0), -1):
+         if text[i] in split_chars:
+             return i + 1
+
+     return mid
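A worked example (made-up input): len("aaaa。bbbb") is 9, so mid is 4; text[4] is "。", a split character, so the function returns 5 and the delimiter stays with the first half:

pos = find_balanced_split("aaaa。bbbb")
assert pos == 5 and "aaaa。bbbb"[:pos] == "aaaa。"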
+
+
+ # ---- Document structure handling ----
+ def process_sections(data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
+     """Group content under section titles and emit chunks."""
+     processed = []
+     current_title = None
+     accumulated = []
+
+     for item in data:
+         if is_section_title(item):
+             # Flush the previous section (including untitled leading content)
+             if current_title is not None or accumulated:
+                 flush_section(current_title, accumulated, processed, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT)
+             current_title = item["text"]
+             accumulated = []
+         else:
+             accumulated.append(item["text"])
+
+     flush_section(current_title, accumulated, processed, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT)
+     return processed
+
+
+ def flush_section(title, parts, output, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
+     """Emit one section as one or more chunks, depending on its word count."""
+     full_text = "\n".join(parts)
+     if not full_text.strip():
+         return
+
+     word_count = count_words(full_text)
+
+     if word_count <= MAX_CHUNK_WORDS:
+         output.append(build_chunk(title, full_text))
+     elif MAX_CHUNK_WORDS < word_count <= HARD_LIMIT:
+         # Moderately long: split once near the middle
+         split_pos = find_balanced_split(full_text)
+         output.append(build_chunk(title, full_text[:split_pos]))
+         output.append(build_chunk(None, full_text[split_pos:]))
+     else:
+         # Very long: fall back to sentence-level chunking
+         chunks = split_text_by_words(full_text, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS)
+         for i, chunk in enumerate(chunks):
+             output.append(build_chunk(title if i == 0 else None, chunk))
+
+
+ def build_chunk(title, text):
+     """Build one chunk record."""
+     if title:
+         return {"type": "text", "text": f"{title}\n{text.strip()}"}
+     return {"type": "text", "text": text.strip()}
+
+
+ def is_section_title(item):
+     """Return True if the item is a level-1 section title."""
+     return item.get("type") == "text" and item.get("text_level") == 1
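A minimal run of the short-section branch (hypothetical input, not from the package's tests): the section fits within MAX_CHUNK_WORDS, so it is emitted whole with its title prepended by build_chunk:

sample = [
    {"type": "text", "text": "Introduction", "text_level": 1},
    {"type": "text", "text": "First paragraph.", "text_level": 0},
    {"type": "text", "text": "Second paragraph.", "text_level": 0},
]
out = process_sections(sample, MAX_CHUNK_WORDS=1000, SOFT_CHUNK_WORDS=400, HARD_LIMIT=1400)
assert out == [{"type": "text", "text": "Introduction\nFirst paragraph.\nSecond paragraph."}]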
+
+
+ # Generate a unique placeholder and record its mapping
+ def generate_placeholder(element_type, content, placeholder_map):
+     placeholder = element_placeholder[element_type] + f"_{len(placeholder_map)}"
+     placeholder_map.append((placeholder, element_type, content))
+     print(f"Generated placeholder: {placeholder} for {element_type} with content:\n{content}\n")
+     return placeholder, placeholder_map
+
+
+ # Replace placeholders in the text with their recorded content
+ def restore_placeholders(text, placeholder_map):
+     # Iterate in reverse so "[IMAGE]_1" cannot clobber the prefix of "[IMAGE]_10"
+     for placeholder, element_type, content in reversed(placeholder_map):
+         if placeholder in text:
+             # Substitute the content recorded when the placeholder was generated
+             text = text.replace(placeholder, content)
+     return text
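An illustrative round trip (made-up content): generate a placeholder, embed it in text, then restore it:

pm = []
ph, pm = generate_placeholder("image", "![](fig1.png) Figure 1", pm)  # ph == "[IMAGE]_0"
text = f"Intro\n{ph}\nMore text"
assert restore_placeholders(text, pm) == "Intro\n![](fig1.png) Figure 1\nMore text"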
+
+
+ def merge_element(prev, current, placeholder_map):
+     """
+     * @description:
+     # Merge non-text elements into neighbouring text:
+     # - Images: render as Markdown "![](img_path) caption" and merge into the preceding text.
+     # - Tables: place the caption (table_caption) above the table body and merge into the preceding text.
+     # - Equations: real equations merge with the surrounding text (original logic unchanged);
+     #   pseudo-equations merge with the following text.
+     * @param prev :
+     * @param current :
+     * @return
+     """
+     if current["type"] in ["equation", "image", "table"]:
+         if current["type"] == "image":
+             # For images, use img_path and img_caption
+             img_path = current.get("img_path", "[Image path missing]")
+             img_cap = current.get("img_caption")
+             if isinstance(img_cap, list):
+                 img_caption_text = " ".join(img_cap) if img_cap else ""
+             else:
+                 img_caption_text = img_cap if img_cap else ""
+             # Markdown form: ![](img_path) followed by the caption
+             placeholder_content = f"![]({img_path}) {img_caption_text}"
+         elif current["type"] == "table":
+             # For tables, take the caption (table_caption); empty if absent
+             table_caption = current.get("table_caption")
+             if isinstance(table_caption, list):
+                 table_caption_text = " ".join(table_caption) if table_caption else ""
+             else:
+                 table_caption_text = table_caption if table_caption else ""
+             # Table body comes from table_body, falling back to the text field
+             table_body = current.get("table_body", current.get("text", "[Table content missing]"))
+             # Put the caption above the table body
+             placeholder_content = f"{table_caption_text}\n{table_body}"
+         else:  # equation
+             placeholder_content = current.get("text", "[Equation]")
+
+         # Generate a placeholder and record the mapping
+         placeholder, placeholder_map = generate_placeholder(current["type"], placeholder_content, placeholder_map)
+
+         # Merge rules:
+         # equations merge with the surrounding text; images and tables merge with the preceding text only
+         if current["type"] == "equation":
+             if prev and prev["type"] == "text":
+                 # Append the equation to the previous text block and mark it as a pseudo-equation
+                 prev["text"] += PSEUDO_EQUATION_FLAG + placeholder
+                 prev["is_pseudo_equation"] = True  # mark as pseudo-equation
+                 return prev, None, placeholder_map
+         else:
+             if prev and prev["type"] == "text":
+                 prev["text"] += "\n" + placeholder + "\n"
+                 return prev, None, placeholder_map
+
+     return prev, current, placeholder_map
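An illustrative merge (made-up dicts): an image element is folded into the preceding text block as a placeholder and consumed, so the caller receives None for it:

pm = []
prev = {"type": "text", "text": "Some paragraph."}
cur = {"type": "image", "img_path": "fig1.png", "img_caption": ["Figure 1"]}
prev, cur, pm = merge_element(prev, cur, pm)
assert cur is None
assert prev["text"] == "Some paragraph.\n[IMAGE]_0\n"
assert pm == [("[IMAGE]_0", "image", "![](fig1.png) Figure 1")]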
+
+
+ def pre_handle_func(data):
+     """
+     * @description: preprocess the raw element list
+     * @param data :
+     * @return
+     """
+     # If the first element is not text, insert an empty text item at the front
+     # so there is always a preceding block to merge into
+     if data and data[0]["type"] != "text":
+         data.insert(0, {"type": "text", "text": "", "text_level": 1})
+
+     # Make sure every element has "text" and "text_level" keys to avoid KeyError
+     for item in data:
+         if "text" not in item:
+             item["text"] = ""
+         if "text_level" not in item:
+             item["text_level"] = 0
+
+     # Drop items with empty "text", but keep image/equation/table items and level-1 titles
+     filtered_data = [
+         item for item in data
+         if item.get("text") != "" or item["type"] in ["image", "equation", "table"] or item.get("text_level") == 1
+     ]
+     processed_data = []
+     previous_item = None
+     placeholder_map = []
+     # Main pass: merge adjacent text, fold non-text elements into text
+     for item in filtered_data:
+         if previous_item:
+             # Merge adjacent "text_level": 1 items (consecutive titles); do not
+             # merge if the text contains an abstract or keywords
+             if previous_item["type"] == "text" and item["type"] == "text":
+                 if previous_item.get("text_level") == 1 and item.get("text_level") == 1:
+                     previous_item["text"] += "-" + item["text"]
+                     continue
+
+             # Merge elements (images/tables with the preceding text, equations with surrounding text)
+             previous_item, item, placeholder_map = merge_element(previous_item, item, placeholder_map)
+             if item is None:
+                 continue
+
+             # Handle pseudo-equation merging with the following text
+             if previous_item.get("is_pseudo_equation", False):
+                 # Do not merge if the next text block is a title
+                 if item.get("text_level") == 1:
+                     processed_data.append(previous_item)
+                     previous_item = item
+                     continue
+
+                 # Do not merge if the combined length would exceed 1200 characters
+                 if len(previous_item["text"]) + len(item["text"]) > 1200:
+                     processed_data.append(previous_item)
+                     previous_item = item
+                     continue
+
+                 # Otherwise merge the pseudo-equation block with the following text
+                 previous_item["text"] += item["text"]
+                 previous_item["is_pseudo_equation"] = False  # clear the pseudo-equation flag
+                 continue
+
+             processed_data.append(previous_item)
+
+         previous_item = item
+
+     if previous_item:
+         processed_data.append(previous_item)
+     return processed_data, placeholder_map
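A small preprocessing run (hypothetical input in the content-list shape this module expects): the image is folded into the preceding paragraph, and the mapping is returned alongside the items:

sample = [
    {"type": "text", "text": "Overview", "text_level": 1},
    {"type": "text", "text": "Body paragraph."},
    {"type": "image", "img_path": "fig1.png", "img_caption": ["Fig 1"]},
]
items, pm = pre_handle_func(sample)
assert [it["text"] for it in items] == ["Overview", "Body paragraph.\n[IMAGE]_0\n"]
assert pm == [("[IMAGE]_0", "image", "![](fig1.png) Fig 1")]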
+
+
+ def split_md_func(data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT):
+     """
+     * @description: main pipeline: preprocess, chunk by section, restore placeholders
+     * @param data :
+     * @param MAX_CHUNK_WORDS :
+     * @param SOFT_CHUNK_WORDS :
+     * @param HARD_LIMIT :
+     * @return
+     """
+     pre_handle_data, placeholder_map = pre_handle_func(data=data)
+
+     middle_handle_data = process_sections(pre_handle_data, MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS, HARD_LIMIT)
+     # Restore placeholders in one pass so every placeholder in the text is replaced
+     for item in middle_handle_data:
+         if item["type"] == "text":
+             item["text"] = restore_placeholders(item["text"], placeholder_map)
+     final_handle_data = middle_handle_data
+     return final_handle_data
+
+
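An end-to-end sketch of the pipeline on a tiny made-up content list: the image placeholder is created during preprocessing and restored into the final chunk text:

doc = [
    {"type": "text", "text": "Results", "text_level": 1},
    {"type": "text", "text": "We observe X."},
    {"type": "image", "img_path": "fig2.png", "img_caption": ["Trend"]},
]
out = split_md_func(doc, MAX_CHUNK_WORDS=1000, SOFT_CHUNK_WORDS=400, HARD_LIMIT=1400)
assert out == [{"type": "text", "text": "Results\nWe observe X.\n![](fig2.png) Trend"}]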
+ if __name__ == "__main__":
+     ## config:
+     MAX_CHUNK_WORDS = 1000
+     SOFT_CHUNK_WORDS = 400
+     HARD_LIMIT = 1400
+     ##
+     # with open("/media/disk0/xzzn_data_all/yinyabo/omni_split/test/c8d4614affc19ba92d7ba0671fd709803d0488a0c5a68bc237783a8af39fe32e/1c7fbb26-1012-4b03-894c-69ab2257985c_1743677710.4311144_content_list.json", "r", encoding="utf-8") as file:
+     with open("1c7fbb26-1012-4b03-894c-69ab2257985c_1743677710.4311144_content_list.json", "r", encoding="utf-8") as file:
+         data = json.load(file)
+     final_handle_data = split_md_func(data, MAX_CHUNK_WORDS=MAX_CHUNK_WORDS, SOFT_CHUNK_WORDS=SOFT_CHUNK_WORDS, HARD_LIMIT=HARD_LIMIT)
+     # Save the updated JSON file
+     with open("output4-yyb-en4.json", "w", encoding="utf-8") as file:
+         json.dump(final_handle_data, file, ensure_ascii=False, indent=4)
omni_split/main.py ADDED
@@ -0,0 +1,73 @@
+ import json
+ from omni_split import OmniSplit
+ from omni_split import word_preprocessing_and_return_bytesIO
+
+ omni_spliter = OmniSplit()
+ ## note: test text split
+ test_text = True
+ if test_text:
+     with open("omni_split/test/text_test.txt", "r") as f:
+         text_content = "".join(f.readlines())
+     res = omni_spliter.text_chunk_func(text_content)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+ ## note: test markdown json split
+ test_markdown = True
+ if test_markdown:
+     with open("omni_split/test/json_list_test.json", "r") as f:
+         md_content_json = json.load(f)
+     res = omni_spliter.markdown_json_chunk_func(md_content_json)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.markdown_json_chunk_func(md_content_json, clear_model=True)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+ ## note: test markdown split
+ test_markdown = True
+ if test_markdown:
+     with open("omni_split/test/markdown_test.md", "r") as f:
+         md_content = f.read()
+     res = omni_spliter.markdown_chunk_func(md_content)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.markdown_chunk_func(md_content, clear_model=True)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+
+ ## note: test word split
+ test_document = True
+ if test_document:
+     new_doc_io = word_preprocessing_and_return_bytesIO("omni_split/test/docx_test.docx")
+     res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False, save_local_images_dir="./images")
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)
+
+     res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=True)
+     for item in res:
+         print(item)
+         print("------------")
+     print("=" * 10)