omni-split 0.0.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of omni-split might be problematic. Click here for more details.

@@ -0,0 +1,181 @@
1
+ from io import BytesIO
2
+ from docx import Document
3
+ import os
4
+ from docx.opc.constants import RELATIONSHIP_TYPE as RT
5
+ from loguru import logger
6
+ import re
7
+ import uuid
8
+ import base64
9
+
10
+ from wand.image import Image
11
+ from pathlib import Path
12
+ import warnings
13
+
14
def add_fix_before_extension(file_path):
    """Insert a ``_fix`` marker between a file's stem and its extension.

    Args:
        file_path: Original file path, e.g. ``"dir/report.docx"``.

    Returns:
        The rebuilt path, e.g. ``"dir/report_fix.docx"``.
    """
    directory, filename = os.path.split(file_path)
    stem, extension = os.path.splitext(filename)
    # Reassemble: <dir>/<stem>_fix<ext>
    return os.path.join(directory, f"{stem}_fix{extension}")
25
+
26
+
27
def delete_file(file_path):
    """Best-effort deletion of the file at ``file_path``.

    Failures are reported via ``print`` but never raised, matching the
    original best-effort cleanup contract.

    Args:
        file_path: Path of the file to remove.
    """
    try:
        # EAFP: attempt the removal directly instead of exists()+remove(),
        # which avoids the check-then-remove race condition.
        os.remove(file_path)
    except FileNotFoundError:
        # A missing file is not an error for best-effort cleanup.
        pass
    except OSError as e:
        # Report (don't raise) real filesystem failures, e.g. permissions.
        print(f"删除文件 {file_path} 时出错: {e}")
38
+
39
+
40
def word_preprocessing_and_return_bytesIO(input_file):
    """Preprocess a Word document and return its content as a BytesIO stream.

    Escapes ``#`` characters in paragraph text (so downstream Markdown
    conversion does not treat them as headings) and blanks out hyperlink
    relationship targets.

    Args:
        input_file: Path to the source ``.docx`` file.

    Returns:
        io.BytesIO: The modified document serialized in memory, positioned
        at offset 0.
    """
    doc = Document(input_file)
    # Escape '#' so it survives Markdown conversion as literal text.
    for paragraph in doc.paragraphs:
        if "#" in paragraph.text:
            paragraph.text = paragraph.text.replace("#", r"\#")
    # Blank out hyperlink targets so external URLs are dropped.
    rels = doc.part.rels
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            rels[rel]._target = ""
    # Serialize straight into memory: python-docx's save() accepts a
    # file-like object, so the previous write-to-"_fix"-file / re-read /
    # delete round-trip (race + leftover-file risk) is unnecessary.
    doc_content_io = BytesIO()
    doc.save(doc_content_io)
    doc_content_io.seek(0)
    return doc_content_io
63
+
64
+
65
def save_local_images_func(ret_data, image_save_path):
    """Extract Base64-embedded images from ``ret_data``, save them locally,
    and rewrite the text so it references the saved files instead.

    Args:
        ret_data: List of dicts; entries with a "text" key may contain
            Markdown images of the form ``![...](data:<mime>;base64,<data>)``.
        image_save_path: Directory in which extracted images are stored
            (created if missing).

    Returns:
        The same ``ret_data`` list, with each Base64 payload replaced by a
        local image path.
    """
    # Create the target directory up front; exist_ok avoids the
    # check-then-create race of the exists()+makedirs pattern.
    if image_save_path:
        os.makedirs(image_save_path, exist_ok=True)

    # Markdown image with an inline data URI: captures (mime type, payload).
    base64_pattern = re.compile(r"!\[.*?\]\(data:(.*?);base64,(.*?)\)")

    for item in ret_data:
        if "text" not in item:
            continue

        text = item["text"]
        matches = base64_pattern.findall(text)
        if not matches:
            continue

        # Deduplicate so each identical payload is decoded and saved once.
        for native_format, img_data in set(matches):
            try:
                # Some producers emit the literal string "None" as the mime type.
                img_format = "image/png" if native_format == "None" else native_format
                # Derive a file extension from the mime type; fall back to
                # PNG on malformed values. (Was a bare except: — narrowed so
                # KeyboardInterrupt/SystemExit are no longer swallowed.)
                try:
                    extension = img_format.split("/")[1].split(";")[0]
                    filename = f"{uuid.uuid4()}.{extension}"
                except Exception:
                    logger.info("img_format error, try to forcefully convert it to PNG.")
                    filename = f"{uuid.uuid4()}.png"
                filepath = os.path.join(image_save_path, filename)
                # Final local image path (always .png) for the converted copy.
                need_saved_filename = f"{uuid.uuid4()}.png"
                need_saved_filepath = os.path.join(image_save_path, need_saved_filename)

                # Decode the Base64 payload and persist the original image.
                with open(filepath, "wb") as f:
                    f.write(base64.b64decode(img_data))

                convert_is_ok = convert_to_png(filepath, need_saved_filepath)

                # Point the text at the converted PNG when possible,
                # otherwise at the originally decoded file.
                if convert_is_ok:
                    text = text.replace(f"data:{native_format};base64,{img_data}", need_saved_filepath)
                else:
                    logger.info("convert_to_png error. save orginal file path.")
                    text = text.replace(f"data:{native_format};base64,{img_data}", filepath)
            except Exception as e:
                print(f"Failed to process image: {e}")
                continue

        item["text"] = text

    return ret_data
130
+
131
+
132
+
133
+
134
+
135
def convert_to_png(input_path: str, output_path: str) -> bool:
    """Convert an image file (e.g. WMF/PNG/JPG) to PNG format.

    Args:
        input_path: Source image path (e.g. "input.wmf" or "input.jpg").
        output_path: Destination PNG path; if it is a directory, the output
            filename is derived from the input file's stem.

    Returns:
        True on success; False if ImageMagick is unavailable, the input is
        missing, or the conversion fails (the original file is left as-is).
    """
    # Probe whether wand is backed by a working ImageMagick installation.
    try:
        from wand.version import MAGICK_VERSION_INFO  # noqa: F401
    except (ImportError, ModuleNotFoundError) as e:
        warnings.warn(f"ImageMagick is not properly installed. Skipping conversion: {e}")
        return False
    except Exception as e:
        warnings.warn(f"Error checking ImageMagick installation: {e}")
        return False

    try:
        src = Path(input_path)
        if not src.is_file():
            print(f"Error: Input file '{input_path}' does not exist.")
            return False

        # When the destination is a directory, build the filename from the
        # input stem.
        dst = Path(output_path)
        output_path = str(dst)
        if dst.is_dir():
            output_path = str(dst / f"{src.stem}.png")

        # Perform the conversion through wand / ImageMagick.
        with Image(filename=input_path) as img:
            img.format = "png"
            img.save(filename=output_path)

        print(f"Success: Converted '{input_path}' to '{output_path}'")
        return True

    except Exception as e:
        print(f"Error converting '{input_path}': {e}")
        return False
178
+
179
+
180
def download_tokenizer_from_network(ms=True):
    # Stub: intended to download a tokenizer over the network (ms=True
    # presumably selects ModelScope as the source — TODO confirm).
    # Currently a no-op that returns None.
    pass
@@ -0,0 +1,61 @@
1
+ import os
2
+ import requests
3
+ from typing import Dict
4
+
5
def download_files_to_test_doc() -> Dict[str, str]:
    """Download the test-document fixtures into the ``test_doc`` folder.

    The file list is fixed (hosted on ModelScope); the folder is created
    if it does not exist.

    Returns:
        Dict mapping each filename to the absolute path it was saved at.
        Files that failed to download are omitted.
    """
    file_list = {
        "docx_test.docx": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/docx_test.docx",
        "json_list_test.json": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/json_list_test.json",
        "markdown_test.md": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/markdown_test.md",
        "text_test.txt": "https://modelscope.cn/datasets/yinyabo/omni_split_test_doc/resolve/master/text_test.txt"
    }
    os.makedirs("test_doc", exist_ok=True)

    result = {}

    def download_file(url: str, filename: str) -> str:
        """Download one file; return its absolute path, or None on failure."""
        try:
            # Stream the body and close the connection deterministically
            # (the response was previously never closed).
            with requests.get(url, allow_redirects=True, stream=True) as response:
                response.raise_for_status()
                filepath = os.path.abspath(os.path.join("test_doc", filename))
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            # BUG FIX: these messages previously printed a literal
            # placeholder instead of interpolating the filename.
            print(f"成功下载: {filename}")
            return filepath
        except Exception as e:
            print(f"下载 {filename} 失败: {str(e)}")
            return None

    # Download everything, keeping only the successes.
    for filename, url in file_list.items():
        absolute_path = download_file(url, filename)
        if absolute_path:
            result[filename] = absolute_path

    return result
49
+
50
+ # 使用示例
51
# Usage example: fetch the fixture documents and report where each landed.
if __name__ == "__main__":
    fetched = download_files_to_test_doc()

    # Print the download results.
    print("\n下载结果:")
    for doc_name, doc_path in fetched.items():
        print(f"{doc_name}: {doc_path}")
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: omni_split
3
+ Version: 0.0.1rc0
4
+ Summary: A comprehensive document splitting toolkit
5
+ Home-page: https://github.com/dinobot22/omni_split
6
+ Author: dinobot22
7
+ Author-email: 2802701695yyb@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: mistletoe
15
+ Requires-Dist: transformers
16
+ Requires-Dist: markitdown[docx,pptx,xls,xlsx]
17
+ Requires-Dist: python-docx
18
+ Requires-Dist: loguru
19
+ Requires-Dist: wand
20
+ Dynamic: author
21
+ Dynamic: author-email
22
+ Dynamic: classifier
23
+ Dynamic: description
24
+ Dynamic: description-content-type
25
+ Dynamic: home-page
26
+ Dynamic: license-file
27
+ Dynamic: requires-dist
28
+ Dynamic: requires-python
29
+ Dynamic: summary
30
+
31
+
32
+
33
+
34
+ # omni_split: Split commonly used document formats (md, doc, etc.) for RAG pipelines that feed LLMs.
35
+ ---
36
+ <img src="./docs/icon.png" alt="omni_split" >
37
+
38
+ ---
39
+ ### note: All other text formats are highly recommended to be converted to Markdown, and we focus on optimizing documents for Markdown.
40
+ ---
41
+ # usage
42
+ ### install
43
+ ```bash
44
+ pip install omni_split
45
+ ```
46
+ ### use case
47
+ ```python
48
+ import json
49
+ from omni_split import OmniSplit
50
+ from omni_split import word_preprocessing_and_return_bytesIO
51
+ from omni_split import download_files_to_test_doc
52
+
53
+ ### == step 2: download test_doc file ==
54
+
55
+ doc_dict = download_files_to_test_doc()
56
+ text_doc_file_path = doc_dict["text_test.txt"]
57
+ json_list_doc_file_path = doc_dict["json_list_test.json"]
58
+ markdown_doc_file_path = doc_dict["markdown_test.md"]
59
+ word_doc_file_path = doc_dict["docx_test.docx"]
60
+
61
+
62
+ ### == step 3: split to chunk ==
63
+
64
+ omni_spliter = OmniSplit()
65
+
66
+ ## note: test text split
67
+ test_text = True
68
+ if test_text:
69
+ with open(text_doc_file_path, "r") as f:
70
+ text_content = "".join(f.readlines())
71
+ res = omni_spliter.text_chunk_func(text_content,txt_chunk_size=1000)
72
+ for item in res:
73
+ print(item)
74
+ print("------------")
75
+ print("=" * 10)
76
+
77
+ ## note: test markdown json split
78
+ test_markdown = True
79
+ if test_markdown:
80
+ with open(json_list_doc_file_path, "r") as f:
81
+ md_content_json = json.load(f)
82
+ res = omni_spliter.markdown_json_chunk_func(md_content_json)
83
+ for item in res:
84
+ print(item)
85
+ print("------------")
86
+ print("=" * 10)
87
+
88
+ res = omni_spliter.markdown_json_chunk_func(md_content_json, clear_model=True)
89
+ for item in res:
90
+ print(item)
91
+ print("------------")
92
+ print("=" * 10)
93
+
94
+ ## note: test markdown split
95
+ test_markdown = True
96
+ if test_markdown:
97
+ with open(markdown_doc_file_path, "r") as f:
98
+ md_content = f.read()
99
+ res = omni_spliter.markdown_chunk_func(md_content)
100
+ for item in res:
101
+ print(item)
102
+ print("------------")
103
+ print("=" * 10)
104
+
105
+ res = omni_spliter.markdown_chunk_func(md_content, clear_model=True)
106
+ for item in res:
107
+ print(item)
108
+ print("------------")
109
+ print("=" * 10)
110
+
111
+
112
+ ## note: test word split
113
+ test_document = True
114
+ if test_document:
115
+
116
+ new_doc_io = word_preprocessing_and_return_bytesIO(word_doc_file_path)
117
+ res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False)
118
+ for item in res:
119
+ print(item)
120
+ print("------------")
121
+ print("=" * 10)
122
+
123
+ res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=False, save_local_images_dir="./images")
124
+ for item in res:
125
+ print(item)
126
+ print("------------")
127
+ print("=" * 10)
128
+
129
+ res = omni_spliter.document_chunk_func(new_doc_io, txt_chunk_size=1000, clear_model=True)
130
+ for item in res:
131
+ print(item)
132
+ print("------------")
133
+ print("=" * 10)
134
+
135
+ ```
136
+ # Reminder of dependency:
137
+ To automatically convert binary metafiles (e.g. x-wmf) in Word documents to PNG, you need to install ImageMagick on Linux.
138
+ Try to install:
139
+ https://docs.wand-py.org/en/latest/guide/install.html
@@ -0,0 +1,24 @@
1
+ omni_split/__init__.py,sha256=FjV45dqfcsb95aLuwCMCBssNwQ0gQ-DVOWLpFO8_A3A,401
2
+ omni_split/main.py,sha256=T_yNeKmyqqTVCx93plow3kixUVMI23sxeyAJDFVYtaM,2166
3
+ omni_split/omni_split.py,sha256=TthOj6j4Lj11eH1giJcccegYkRsbBO9TppsolXLqK6c,3898
4
+ omni_split/test.py,sha256=zYQhqmLUF6gfubsUhGn3RMy3M-13soOMgGdQajlTsZQ,2268
5
+ omni_split/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ omni_split/base/chonkie_base.py,sha256=TP1QTGI4Dg6qJVVk0G1bTZWVMJqEQAq2DeVsBlwppEQ,4900
7
+ omni_split/base/chonkie_tokenizer.py,sha256=_oIbCT9Hd8dPXBHaa5SS4Hcu1Maj1eYdqZv6EGqivys,11891
8
+ omni_split/base/chonkie_types.py,sha256=bI0KDNDbcqeuUnYVK9t_qcqEEjW8QuUvZEZIYyG9QPo,19508
9
+ omni_split/base/md2json_list.py,sha256=7bWukl9e4k0ogvMkFUygyxk2YbJ1xNUozjgSBEMbm68,13126
10
+ omni_split/base/md_json_list2chunk.py,sha256=HJZ5ULNx-Hr508tpeHkpLnjWqqKwZe_f5FpcqTvVBhY,12012
11
+ omni_split/base/native_text_split_utils4content2.py,sha256=mychdMgxDAIxJki_dbrETFhY4r4r5EQOv6D0u_5vGhA,11883
12
+ omni_split/model/text_chunker_tokenizer/qwen_tokenizer.json,sha256=wDghF-oynN8JcEETL21zWSS2l5JNb2_DlFcT6WzodTk,7031645
13
+ omni_split/sub_chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ omni_split/sub_chunker/document_split.py,sha256=aPB0AZBESDp9UEnhnOhWQim_hPWwrwTNA0eUbC4nwbA,1287
15
+ omni_split/sub_chunker/markdown_split.py,sha256=JV1qatm5StIxydnZYofhrulveAtkCkhijZFrup5tNhI,2104
16
+ omni_split/sub_chunker/text_split.py,sha256=QahGJRK9K9F4MB-KZzZIW7r-tnvaYYnw7qPo0AeQNLM,14317
17
+ omni_split/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ omni_split/utils/base_utils.py,sha256=DYY_rJb2g8clTze1GwG4_b3ao0vNA6tUYI0gLKflnVI,6089
19
+ omni_split/utils/download_test_doc.py,sha256=ACQZczmRjU47gXNPB_tXnZ2z4k4OmCnjFI0jYbSbRNg,2229
20
+ omni_split-0.0.1rc0.dist-info/licenses/LICENSE,sha256=6QyD4IqK00qF51METr9C3I9IoAvkjb8OgKBiudMIEZg,1081
21
+ omni_split-0.0.1rc0.dist-info/METADATA,sha256=ZrgaC0Hh0bnNdbwl8boCtyoYa0lu5moItmmcB7UICsM,3932
22
+ omni_split-0.0.1rc0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
23
+ omni_split-0.0.1rc0.dist-info/top_level.txt,sha256=pBBPFY-j8ZzTLrwn21qs3kdg8DtquEun20OYWHmdR1M,11
24
+ omni_split-0.0.1rc0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (78.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 M5Stack Technology CO LTD
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ omni_split