omni-split 0.0.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of omni-split might be problematic. Click here for more details.

@@ -0,0 +1,181 @@
1
+ from io import BytesIO
2
+ from docx import Document
3
+ import os
4
+ from docx.opc.constants import RELATIONSHIP_TYPE as RT
5
+ from loguru import logger
6
+ import re
7
+ import uuid
8
+ import base64
9
+
10
+ from wand.image import Image
11
+ from pathlib import Path
12
+ import warnings
13
+
14
def add_fix_before_extension(file_path):
    """Insert a ``_fix`` marker between a file's stem and its extension.

    Args:
        file_path: Original file path, e.g. ``"dir/report.docx"``.

    Returns:
        The rebuilt path, e.g. ``"dir/report_fix.docx"``.
    """
    directory, filename = os.path.split(file_path)
    stem, extension = os.path.splitext(filename)
    # Reassemble: <dir>/<stem>_fix<ext>
    return os.path.join(directory, f"{stem}_fix{extension}")
25
+
26
+
27
def delete_file(file_path):
    """Best-effort deletion of the file at ``file_path``.

    Failures are reported via ``print`` but never raised, matching the
    original best-effort cleanup contract.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        # EAFP: attempt the removal directly instead of exists()+remove(),
        # which avoids the check-then-remove race condition.
        os.remove(file_path)
    except FileNotFoundError:
        # A missing file is not an error for best-effort cleanup.
        pass
    except OSError as e:
        # Report (don't raise) real filesystem failures, e.g. permissions.
        print(f"删除文件 {file_path} 时出错: {e}")
38
+
39
+
40
def word_preprocessing_and_return_bytesIO(input_file):
    """Preprocess a Word document and return its content as a BytesIO stream.

    Escapes ``#`` characters in paragraph text (so downstream Markdown
    conversion does not treat them as headings) and blanks out hyperlink
    relationship targets.

    Args:
        input_file: Path to the source ``.docx`` file.

    Returns:
        io.BytesIO: The modified document serialized in memory, positioned
        at offset 0.
    """
    doc = Document(input_file)
    # Escape '#' so it survives Markdown conversion as literal text.
    for paragraph in doc.paragraphs:
        if "#" in paragraph.text:
            paragraph.text = paragraph.text.replace("#", r"\#")
    # Blank out hyperlink targets so external URLs are dropped.
    rels = doc.part.rels
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            rels[rel]._target = ""
    # Serialize straight into memory: python-docx's save() accepts a
    # file-like object, so the previous write-to-"_fix"-file / re-read /
    # delete round-trip (race + leftover-file risk) is unnecessary.
    doc_content_io = BytesIO()
    doc.save(doc_content_io)
    doc_content_io.seek(0)
    return doc_content_io
63
+
64
+
65
def save_local_images_func(ret_data, image_save_path):
    """Extract Base64-embedded images from ``ret_data``, save them locally,
    and rewrite the text so it references the saved files instead.

    Args:
        ret_data: List of dicts; entries with a "text" key may contain
            Markdown images of the form ``![...](data:<mime>;base64,<data>)``.
        image_save_path: Directory in which extracted images are stored
            (created if missing).

    Returns:
        The same ``ret_data`` list, with each Base64 payload replaced by a
        local image path.
    """
    # Create the target directory up front; exist_ok avoids the
    # check-then-create race of the exists()+makedirs pattern.
    if image_save_path:
        os.makedirs(image_save_path, exist_ok=True)

    # Markdown image with an inline data URI: captures (mime type, payload).
    base64_pattern = re.compile(r"!\[.*?\]\(data:(.*?);base64,(.*?)\)")

    for item in ret_data:
        if "text" not in item:
            continue

        text = item["text"]
        matches = base64_pattern.findall(text)
        if not matches:
            continue

        # Deduplicate so each identical payload is decoded and saved once.
        for native_format, img_data in set(matches):
            try:
                # Some producers emit the literal string "None" as the mime type.
                img_format = "image/png" if native_format == "None" else native_format
                # Derive a file extension from the mime type; fall back to
                # PNG on malformed values. (Was a bare except: — narrowed so
                # KeyboardInterrupt/SystemExit are no longer swallowed.)
                try:
                    extension = img_format.split("/")[1].split(";")[0]
                    filename = f"{uuid.uuid4()}.{extension}"
                except Exception:
                    logger.info("img_format error, try to forcefully convert it to PNG.")
                    filename = f"{uuid.uuid4()}.png"
                filepath = os.path.join(image_save_path, filename)
                # Final local image path (always .png) for the converted copy.
                need_saved_filename = f"{uuid.uuid4()}.png"
                need_saved_filepath = os.path.join(image_save_path, need_saved_filename)

                # Decode the Base64 payload and persist the original image.
                with open(filepath, "wb") as f:
                    f.write(base64.b64decode(img_data))

                convert_is_ok = convert_to_png(filepath, need_saved_filepath)

                # Point the text at the converted PNG when possible,
                # otherwise at the originally decoded file.
                if convert_is_ok:
                    text = text.replace(f"data:{native_format};base64,{img_data}", need_saved_filepath)
                else:
                    logger.info("convert_to_png error. save orginal file path.")
                    text = text.replace(f"data:{native_format};base64,{img_data}", filepath)
            except Exception as e:
                print(f"Failed to process image: {e}")
                continue

        item["text"] = text

    return ret_data
130
+
131
+
132
+
133
+
134
+
135
def convert_to_png(input_path: str, output_path: str) -> bool:
    """Convert an image file (e.g. WMF/PNG/JPG) to PNG format.

    Args:
        input_path: Source image path (e.g. "input.wmf" or "input.jpg").
        output_path: Destination PNG path; if it is a directory, the output
            filename is derived from the input file's stem.

    Returns:
        True on success; False if ImageMagick is unavailable, the input is
        missing, or the conversion fails (the original file is left as-is).
    """
    # Probe whether wand is backed by a working ImageMagick installation.
    try:
        from wand.version import MAGICK_VERSION_INFO  # noqa: F401
    except (ImportError, ModuleNotFoundError) as e:
        warnings.warn(f"ImageMagick is not properly installed. Skipping conversion: {e}")
        return False
    except Exception as e:
        warnings.warn(f"Error checking ImageMagick installation: {e}")
        return False

    try:
        src = Path(input_path)
        if not src.is_file():
            print(f"Error: Input file '{input_path}' does not exist.")
            return False

        # When the destination is a directory, build the filename from the
        # input stem.
        dst = Path(output_path)
        output_path = str(dst)
        if dst.is_dir():
            output_path = str(dst / f"{src.stem}.png")

        # Perform the conversion through wand / ImageMagick.
        with Image(filename=input_path) as img:
            img.format = "png"
            img.save(filename=output_path)

        print(f"Success: Converted '{input_path}' to '{output_path}'")
        return True

    except Exception as e:
        print(f"Error converting '{input_path}': {e}")
        return False
178
+
179
+
180
def download_tokenizer_from_network(ms=True):
    # Stub: intended to download a tokenizer over the network (ms=True
    # presumably selects ModelScope as the source — TODO confirm).
    # Currently a no-op that returns None.
    pass
@@ -0,0 +1,61 @@
1
+ import os
2
+ import requests
3
+ from typing import Dict
4
+
5
def download_files_to_test_doc() -> Dict[str, str]:
    """Download the test-document fixtures into the ``test_doc`` folder.

    The file list is fixed (hosted on ModelScope); the folder is created
    if it does not exist.

    Returns:
        Dict mapping each filename to the absolute path it was saved at.
        Files that failed to download are omitted.
    """
    file_list = {
        "docx_test.docx": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/docx_test.docx",
        "json_list_test.json": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/json_list_test.json",
        "markdown_test.md": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/markdown_test.md",
        "text_test.txt": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/text_test.txt"
    }
    os.makedirs("test_doc", exist_ok=True)

    result = {}

    def download_file(url: str, filename: str) -> str:
        """Download one file; return its absolute path, or None on failure."""
        try:
            # Stream the body and close the connection deterministically
            # (the response was previously never closed).
            with requests.get(url, allow_redirects=True, stream=True) as response:
                response.raise_for_status()
                filepath = os.path.abspath(os.path.join("test_doc", filename))
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            # BUG FIX: these messages previously printed a literal
            # placeholder instead of interpolating the filename.
            print(f"成功下载: {filename}")
            return filepath
        except Exception as e:
            print(f"下载 {filename} 失败: {str(e)}")
            return None

    # Download everything, keeping only the successes.
    for filename, url in file_list.items():
        absolute_path = download_file(url, filename)
        if absolute_path:
            result[filename] = absolute_path

    return result
49
+
50
+ # 使用示例
51
# Usage example: fetch the fixture documents and report where each landed.
if __name__ == "__main__":
    fetched = download_files_to_test_doc()

    # Print the download results.
    print("\n下载结果:")
    for doc_name, doc_path in fetched.items():
        print(f"{doc_name}: {doc_path}")
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: omni_split
3
+ Version: 0.0.1rc0
4
+ Summary: A comprehensive document splitting toolkit
5
+ Home-page: https://github.com/dinobot22/omni_split
6
+ Author: dinobot22
7
+ Author-email: 2802701695yyb@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: mistletoe
15
+ Requires-Dist: transformers
16
+ Requires-Dist: markitdown[docx,pptx,xls,xlsx]
17
+ Requires-Dist: python-docx
18
+ Requires-Dist: loguru
19
+ Requires-Dist: wand
20
+ Dynamic: author
21
+ Dynamic: author-email
22
+ Dynamic: classifier
23
+ Dynamic: description
24
+ Dynamic: description-content-type
25
+ Dynamic: home-page
26
+ Dynamic: license-file
27
+ Dynamic: requires-dist
28
+ Dynamic: requires-python
29
+ Dynamic: summary
30
+
31
+
32
+
33
+
34
+ # omni_split: Split commonly used document formats (md, doc, etc.) for RAG pipelines that feed LLMs.
35
+ ---
36
+ <img src="./docs/icon.png" alt="omni_split" >
37
+
38
+ ---
39
+ ### note: All other text formats are highly recommended to be converted to Markdown, and we focus on optimizing documents for Markdown.
40
+ ---
41
+ # usage
42
+ ### install
43
+ ```bash
44
+ pip install omni_split
45
+ ```
46
+ ### use case
47
+ ```python
48
+ import json
49
+ from omni_split import OmniSplit
50
+ from omni_split import word_preprocessing_and_return_bytesIO
51
+ from omni_split import download_files_to_test_doc
52
+
53
+ ### == step 2: download test_doc file ==
54
+
55
+ doc_dict = download_files_to_test_doc()
56
+ text_doc_file_path = doc_dict["text_test.txt"]
57
+ json_list_doc_file_path = doc_dict["json_list_test.json"]
58
+ markdown_doc_file_path = doc_dict["markdown_test.md"]
59
+ word_doc_file_path = doc_dict["docx_test.docx"]
60
+
61
+
62
+ ### == step 3: split to chunk ==
63
+
64
+ omni_spliter = OmniSplit()
65
+
66
+ ## note: test text split
67
+ test_text = True
68
+ if test_text:
69
+ with open(text_doc_file_path, "r") as f:
70
+ text_content = "".join(f.readlines())
71
+ res = omni_spliter.text_chunk_func(text_content,txt_chunk_size=1000)
72
+ for item in res:
73
+ print(item)
74
+ print("------------")
75
+ print("=" * 10)
76
+
77
+ ## note: test markdown json split
78
+ test_markdown = True
79
+ if test_markdown:
80
+ with open(json_list_doc_file_path, "r") as f:
81
+ md_content_json = json.load(f)
82
+ res = omni_spliter.markdown_json_chunk_func(md_content_json)
83
+ for item in res:
84
+ print(item)
85
+ print("------------")
86
+ print("=" * 10)
87
+
88
+ res = omni_spliter.markdown_json_chunk_func(md_content_json, clear_model=True)
89
+ for item in res:
90
+ print(item)
91
+ print("------------")
92
+ print("=" * 10)
93
+
94
+ ## note: test markdown split
95
+ test_markdown = True
96
+ if test_markdown:
97
+ with open(markdown_doc_file_path, "r") as f:
98
+ md_content = f.read()
99
+ res = omni_spliter.markdown_chunk_func(md_content)
100
+ for item in res:
101
+ print(item)
102
+ print("------------")
103
+ print("=" * 10)
104
+
105
+ res = omni_spliter.markdown_chunk_func(md_content, clear_model=True)
106
+ for item in res:
107
+ print(item)
108
+ print("------------")
109
+ print("=" * 10)
110
+
111
+
112
+ ## note: test word split
113
+ test_document = True
114
+ if test_document:
115
+
116
+ new_doc_io = word_preprocessing_and_return_bytesIO(word_doc_file_path)
117
+ res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False)
118
+ for item in res:
119
+ print(item)
120
+ print("------------")
121
+ print("=" * 10)
122
+
123
+ res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False, save_local_images_dir="./images")
124
+ for item in res:
125
+ print(item)
126
+ print("------------")
127
+ print("=" * 10)
128
+
129
+ res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=True)
130
+ for item in res:
131
+ print(item)
132
+ print("------------")
133
+ print("=" * 10)
134
+
135
+ ```
136
+ # Reminder of dependency:
137
+ To automatically convert binary metafiles (e.g. x-wmf) in Word documents to PNG, you need to install ImageMagick on Linux.
138
+ Try to install:
139
+ https://docs.wand-py.org/en/latest/guide/install.html
@@ -0,0 +1,24 @@
1
+ omni_split/__init__.py,sha256=FjV45dqfcsb95aLuwCMCBssNwQ0gQ-DVOWLpFO8_A3A,401
2
+ omni_split/main.py,sha256=T_yNeKmyqqTVCx93plow3kixUVMI23sxeyAJDFVYtaM,2166
3
+ omni_split/omni_split.py,sha256=TthOj6j4Lj11eH1giJcccegYkRsbBO9TppsolXLqK6c,3898
4
+ omni_split/test.py,sha256=zYQhqmLUF6gfubsUhGn3RMy3M-13soOMgGdQajlTsZQ,2268
5
+ omni_split/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ omni_split/base/chonkie_base.py,sha256=TP1QTGI4Dg6qJVVk0G1bTZWVMJqEQAq2DeVsBlwppEQ,4900
7
+ omni_split/base/chonkie_tokenizer.py,sha256=_oIbCT9Hd8dPXBHaa5SS4Hcu1Maj1eYdqZv6EGqivys,11891
8
+ omni_split/base/chonkie_types.py,sha256=bI0KDNDbcqeuUnYVK9t_qcqEEjW8QuUvZEZIYyG9QPo,19508
9
+ omni_split/base/md2json_list.py,sha256=7bWukl9e4k0ogvMkFUygyxk2YbJ1xNUozjgSBEMbm68,13126
10
+ omni_split/base/md_json_list2chunk.py,sha256=HJZ5ULNx-Hr508tpeHkpLnjWqqKwZe_f5FpcqTvVBhY,12012
11
+ omni_split/base/native_text_split_utils4content2.py,sha256=mychdMgxDAIxJki_dbrETFhY4r4r5EQOv6D0u_5vGhA,11883
12
+ omni_split/model/text_chunker_tokenizer/qwen_tokenizer.json,sha256=wDghF-oynN8JcEETL21zWSS2l5JNb2_DlFcT6WzodTk,7031645
13
+ omni_split/sub_chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ omni_split/sub_chunker/document_split.py,sha256=aPB0AZBESDp9UEnhnOhWQim_hPWwrwTNA0eUbC4nwbA,1287
15
+ omni_split/sub_chunker/markdown_split.py,sha256=JV1qatm5StIxydnZYofhrulveAtkCkhijZFrup5tNhI,2104
16
+ omni_split/sub_chunker/text_split.py,sha256=QahGJRK9K9F4MB-KZzZIW7r-tnvaYYnw7qPo0AeQNLM,14317
17
+ omni_split/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ omni_split/utils/base_utils.py,sha256=DYY_rJb2g8clTze1GwG4_b3ao0vNA6tUYI0gLKflnVI,6089
19
+ omni_split/utils/download_test_doc.py,sha256=ACQZczmRjU47gXNPB_tXnZ2z4k4OmCnjFI0jYbSbRNg,2229
20
+ omni_split-0.0.1rc0.dist-info/licenses/LICENSE,sha256=6QyD4IqK00qF51METr9C3I9IoAvkjb8OgKBiudMIEZg,1081
21
+ omni_split-0.0.1rc0.dist-info/METADATA,sha256=ZrgaC0Hh0bnNdbwl8boCtyoYa0lu5moItmmcB7UICsM,3932
22
+ omni_split-0.0.1rc0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
23
+ omni_split-0.0.1rc0.dist-info/top_level.txt,sha256=pBBPFY-j8ZzTLrwn21qs3kdg8DtquEun20OYWHmdR1M,11
24
+ omni_split-0.0.1rc0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (78.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 M5Stack Technology CO LTD
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ omni_split