pydatamax 0.1.14__tar.gz → 0.1.15.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/LICENSE +21 -21
  2. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/PKG-INFO +117 -5
  3. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/README.md +116 -4
  4. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/__init__.py +1 -1
  5. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/__init__.py +0 -0
  6. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/core.py +118 -118
  7. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/minio_handler.py +171 -171
  8. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/oss_handler.py +191 -191
  9. pydatamax-0.1.15.post2/datamax/parser/__init__.py +2 -0
  10. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/base.py +76 -76
  11. pydatamax-0.1.15.post2/datamax/parser/core.py +406 -0
  12. pydatamax-0.1.15.post2/datamax/parser/csv_parser.py +31 -0
  13. pydatamax-0.1.15.post2/datamax/parser/doc_parser.py +659 -0
  14. pydatamax-0.1.15.post2/datamax/parser/docx_parser.py +662 -0
  15. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/epub_parser.py +41 -41
  16. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/html_parser.py +37 -37
  17. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/image_parser.py +34 -34
  18. pydatamax-0.1.15.post2/datamax/parser/json_parser.py +32 -0
  19. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/md_parser.py +72 -72
  20. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/pdf_parser.py +101 -101
  21. pydatamax-0.1.15.post2/datamax/parser/ppt_parser.py +124 -0
  22. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/pptx_parser.py +45 -45
  23. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/txt_parser.py +45 -45
  24. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/xls_parser.py +26 -26
  25. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/parser/xlsx_parser.py +212 -215
  26. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/__init__.py +23 -2
  27. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/constants.py +58 -58
  28. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/data_cleaner.py +275 -237
  29. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/env_setup.py +79 -79
  30. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/gotocr_pdf.py +265 -265
  31. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/mineru_operator.py +62 -62
  32. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/paddleocr_pdf_operator.py +90 -90
  33. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/ppt_extract.py +140 -140
  34. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/qa_generator.py +369 -376
  35. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/utils/tokenizer.py +21 -21
  36. pydatamax-0.1.15.post2/datamax/utils/uno_handler.py +426 -0
  37. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/PKG-INFO +117 -5
  38. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/SOURCES.txt +5 -2
  39. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/dependency_links.txt +0 -0
  40. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/requires.txt +0 -0
  41. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/pydatamax.egg-info/top_level.txt +0 -1
  42. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/setup.cfg +0 -0
  43. {pydatamax-0.1.14 → pydatamax-0.1.15.post2}/setup.py +58 -58
  44. pydatamax-0.1.15.post2/tests/test_doc_parser.py +247 -0
  45. pydatamax-0.1.15.post2/tests/test_docx_format_analysis.py +340 -0
  46. pydatamax-0.1.15.post2/tests/test_docx_parser.py +310 -0
  47. pydatamax-0.1.15.post2/tests/test_wps_doc.py +138 -0
  48. pydatamax-0.1.14/datamax/parser/__init__.py +0 -4
  49. pydatamax-0.1.14/datamax/parser/core.py +0 -288
  50. pydatamax-0.1.14/datamax/parser/csv_parser.py +0 -10
  51. pydatamax-0.1.14/datamax/parser/doc_parser.py +0 -203
  52. pydatamax-0.1.14/datamax/parser/docx_parser.py +0 -224
  53. pydatamax-0.1.14/datamax/parser/json_parser.py +0 -10
  54. pydatamax-0.1.14/datamax/parser/ppt_parser.py +0 -74
  55. pydatamax-0.1.14/tests/__init__.py +0 -0
  56. pydatamax-0.1.14/tests/test_basic.py +0 -20
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2024 Hi-Dolphin
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2024 Hi-Dolphin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydatamax
-Version: 0.1.14
+Version: 0.1.15.post2
 Summary: A library for parsing and converting various file formats.
 Home-page: https://github.com/Hi-Dolphin/datamax
 Author: ccy
@@ -105,10 +105,15 @@ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
 
 # AI annotation
 qa_data = dm.get_pre_label(
-    api_key="your-api-key",
-    base_url="https://api.openai.com/v1",
-    model_name="gpt-3.5-turbo"
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,      # Text chunk size
+    chunk_overlap=100,   # Overlap length
+    question_number=5,   # Questions generated per chunk
+    max_workers=5        # Number of concurrent workers
 )
+dm.save_label_data(res)
 ```
 
 ## 📖 Detailed Documentation
@@ -138,8 +143,54 @@ dm = DataMax(file_path="document.docx", to_markdown=True)
 # Image OCR
 dm = DataMax(file_path="image.jpg", use_ocr=True)
 ```
+### Batch Processing
+```python
+# Parse multiple files in batch
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    use_mineru=True
+)
+data = dm.get_data()
+```
+
+### Cache parsed results
+```python
+# Cache parsed results to avoid repeated parsing
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    ttl=3600  # Cache duration in seconds, default 3600s, 0 means no caching
+)
+data = dm.get_data()
+```
 
 ### Data Cleaning
+## Exception Handling
+
+- remove_abnormal_chars Remove abnormal characters from text
+- remove_html_tags Remove HTML tags
+- convert_newlines Convert \r to \n and merge multiple \n into single \n
+- single_space Convert multiple spaces (more than 2) to single space
+- tabs_to_spaces Convert tabs to 4 spaces
+- remove_invisible_chars Remove invisible ASCII characters
+- simplify_chinese Convert traditional Chinese to simplified Chinese
+
+## Text Filtering
+
+- filter_by_word_repetition Filter by word repetition rate
+- filter_by_char_count Filter by character count
+- filter_by_numeric_content Filter by numeric content ratio
+
+## Privacy Desensitization
+
+- replace_ip
+- replace_email
+- replace_customer_number Clean hotline numbers like 4008-123-123
+- replace_bank_id
+- replace_phone_number
+- replace_qq
+- replace_id_card
+
+
 
 ```python
 # Three cleaning modes
@@ -148,6 +199,67 @@ dm.clean_data(method_list=[
     "private",  # Privacy information masking
     "filter"    # Text filtering and normalization
 ])
+
+# Custom cleaning mode
+from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
+dm = DataMax(
+    file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
+)
+parsed_data = dm.get_data().get('content')
+# 1. Text filtering
+tf = TextFilter(parsed_data=parsed_data)
+# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
+tf_bool = tf.filter_by_word_repetition(threshold=0.6)
+if tf_bool:
+    print("Text passed word repetition filtering")
+else:
+    print("Text failed word repetition filtering")
+
+# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
+tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
+if tf_bool:
+    print("Text passed character count filtering")
+else:
+    print("Text failed character count filtering")
+
+# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
+tf_bool = tf.filter_by_numeric_content(threshold=0.6)
+if tf_bool:
+    print("Text passed numeric ratio filtering")
+else:
+    print("Text failed numeric ratio filtering")
+
+# 2. Privacy desensitization
+pd = PrivacyDesensitization(parsed_data=parsed_data)
+res = pd.replace_ip(
+    token="MyIP"
+)
+print(res)
+
+# 3. Abnormal character cleaning
+ac = AbnormalCleaner(parsed_data=parsed_data)
+res = ac.remove_abnormal_chars()
+res = ac.remove_html_tags()
+res = ac.convert_newlines()
+res = ac.single_space()
+res = ac.tabs_to_spaces()
+res = ac.remove_invisible_chars()
+res = ac.simplify_chinese()
+print(res)
+```
+# Text Segmentation
+```python
+dm.split_data(
+    chunk_size=500,      # Chunk size
+    chunk_overlap=100,   # Overlap length
+    use_langchain=True   # Use LangChain for text segmentation
+)
+
+# When use_langchain is False, use custom segmentation method
+# Using 。!? as separators, consecutive separators will be merged
+# chunk_size strictly limits the string length
+for chunk in parser.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
+    print(chunk)
 ```
 
 ### AI Annotation
@@ -225,4 +337,4 @@ This project is licensed under the [MIT License](LICENSE).
 
 ---
 
-⭐ If this project helps you, please give us a star!
+⭐ If this project helps you, please give us a star!
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/README.md
@@ -44,10 +44,15 @@ cleaned_data = dm.clean_data(method_list=["abnormal", "private", "filter"])
 
 # AI annotation
 qa_data = dm.get_pre_label(
-    api_key="your-api-key",
-    base_url="https://api.openai.com/v1",
-    model_name="gpt-3.5-turbo"
+    api_key="sk-xxx",
+    base_url="https://api.provider.com/v1",
+    model_name="model-name",
+    chunk_size=500,      # Text chunk size
+    chunk_overlap=100,   # Overlap length
+    question_number=5,   # Questions generated per chunk
+    max_workers=5        # Number of concurrent workers
 )
+dm.save_label_data(res)
 ```
 
 ## 📖 Detailed Documentation
@@ -77,8 +82,54 @@ dm = DataMax(file_path="document.docx", to_markdown=True)
 # Image OCR
 dm = DataMax(file_path="image.jpg", use_ocr=True)
 ```
+### Batch Processing
+```python
+# Parse multiple files in batch
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    use_mineru=True
+)
+data = dm.get_data()
+```
+
+### Cache parsed results
+```python
+# Cache parsed results to avoid repeated parsing
+dm = DataMax(
+    file_path=["file1.pdf", "file2.docx"],
+    ttl=3600  # Cache duration in seconds, default 3600s, 0 means no caching
+)
+data = dm.get_data()
+```
 
 ### Data Cleaning
+## Exception Handling
+
+- remove_abnormal_chars Remove abnormal characters from text
+- remove_html_tags Remove HTML tags
+- convert_newlines Convert \r to \n and merge multiple \n into single \n
+- single_space Convert multiple spaces (more than 2) to single space
+- tabs_to_spaces Convert tabs to 4 spaces
+- remove_invisible_chars Remove invisible ASCII characters
+- simplify_chinese Convert traditional Chinese to simplified Chinese
+
+## Text Filtering
+
+- filter_by_word_repetition Filter by word repetition rate
+- filter_by_char_count Filter by character count
+- filter_by_numeric_content Filter by numeric content ratio
+
+## Privacy Desensitization
+
+- replace_ip
+- replace_email
+- replace_customer_number Clean hotline numbers like 4008-123-123
+- replace_bank_id
+- replace_phone_number
+- replace_qq
+- replace_id_card
+
+
 
 ```python
 # Three cleaning modes
@@ -87,6 +138,67 @@ dm.clean_data(method_list=[
     "private",  # Privacy information masking
     "filter"    # Text filtering and normalization
 ])
+
+# Custom cleaning mode
+from datamax.utils.data_cleaner import TextFilter, PrivacyDesensitization, AbnormalCleaner
+dm = DataMax(
+    file_path=r"C:\Users\cykro\Desktop\HongKongDevMachine.txt"
+)
+parsed_data = dm.get_data().get('content')
+# 1. Text filtering
+tf = TextFilter(parsed_data=parsed_data)
+# Word repetition filtering - default threshold is 0.6 (max 60% of characters can be repeated)
+tf_bool = tf.filter_by_word_repetition(threshold=0.6)
+if tf_bool:
+    print("Text passed word repetition filtering")
+else:
+    print("Text failed word repetition filtering")
+
+# Character count filtering - default min_chars=30 (minimum 30 chars), max_chars=500000 (maximum 500000 chars)
+tf_bool = tf.filter_by_char_count(min_chars=30, max_chars=500000)
+if tf_bool:
+    print("Text passed character count filtering")
+else:
+    print("Text failed character count filtering")
+
+# Numeric content filtering - default threshold=0.6 (max 60% of characters can be digits)
+tf_bool = tf.filter_by_numeric_content(threshold=0.6)
+if tf_bool:
+    print("Text passed numeric ratio filtering")
+else:
+    print("Text failed numeric ratio filtering")
+
+# 2. Privacy desensitization
+pd = PrivacyDesensitization(parsed_data=parsed_data)
+res = pd.replace_ip(
+    token="MyIP"
+)
+print(res)
+
+# 3. Abnormal character cleaning
+ac = AbnormalCleaner(parsed_data=parsed_data)
+res = ac.remove_abnormal_chars()
+res = ac.remove_html_tags()
+res = ac.convert_newlines()
+res = ac.single_space()
+res = ac.tabs_to_spaces()
+res = ac.remove_invisible_chars()
+res = ac.simplify_chinese()
+print(res)
+```
+# Text Segmentation
+```python
+dm.split_data(
+    chunk_size=500,      # Chunk size
+    chunk_overlap=100,   # Overlap length
+    use_langchain=True   # Use LangChain for text segmentation
+)
+
+# When use_langchain is False, use custom segmentation method
+# Using 。!? as separators, consecutive separators will be merged
+# chunk_size strictly limits the string length
+for chunk in parser.split_data(chunk_size=500, chunk_overlap=100, use_langchain=False).get("content"):
+    print(chunk)
 ```
 
 ### AI Annotation
@@ -164,4 +276,4 @@ This project is licensed under the [MIT License](LICENSE).
 
 ---
 
-⭐ If this project helps you, please give us a star!
+⭐ If this project helps you, please give us a star!
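Read together, the README additions above (batch parsing, result caching, the cleaning classes, split_data, and the extended get_pre_label signature) describe the full 0.1.15 workflow. A minimal end-to-end sketch using only the API as documented in those hunks; the file path, API key, and endpoint are placeholders, and it assumes save_label_data accepts the QA list returned by get_pre_label (the README snippet passes a variable named res):

```python
from datamax import DataMax

# Parse a single document to Markdown (placeholder path)
dm = DataMax(file_path="document.docx", to_markdown=True)
data = dm.get_data()

# Built-in cleaning modes documented above
cleaned = dm.clean_data(method_list=["abnormal", "private", "filter"])

# Chunk the text, then generate and persist QA pairs
chunks = dm.split_data(chunk_size=500, chunk_overlap=100, use_langchain=True)
qa_data = dm.get_pre_label(
    api_key="sk-xxx",                        # placeholder credential
    base_url="https://api.provider.com/v1",  # placeholder endpoint
    model_name="model-name",
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
)
dm.save_label_data(qa_data)  # assumption: takes the list returned by get_pre_label
```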
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/__init__.py
@@ -1 +1 @@
-from .parser import DataMax
+from .parser import DataMax
{pydatamax-0.1.14 → pydatamax-0.1.15.post2}/datamax/loader/core.py
@@ -1,119 +1,119 @@
-import os
-from typing import List
-from datamax.loader.minio_handler import MinIOClient
-from datamax.loader.oss_handler import OssClient
-
-
-class DataLoader:
-    def __init__(self, endpoint: str = None, secret_key: str = None, access_key: str = None,
-                 bucket_name: str = None, source: str = None):
-        if source and source == 'Oss':
-            self.oss = OssClient(
-                oss_endpoint=endpoint,
-                oss_access_key_secret=secret_key,
-                oss_access_key_id=access_key,
-                oss_bucket_name=bucket_name
-            )
-        elif source and source == 'MinIO':
-            self.mi = MinIOClient(
-                endpoint=endpoint,
-                secret_key=secret_key,
-                access_key=access_key,
-                bucket_name=bucket_name
-            )
-        self.download_path = str('./download_file')
-        self.source = source
-        self.bucket_name = bucket_name
-
-    @staticmethod
-    def load_from_file(local_file_path) -> List[str]:
-        if os.path.isfile(local_file_path):
-            if os.path.exists(local_file_path):
-                if os.access(local_file_path, os.R_OK):
-                    return [local_file_path]
-                else:
-                    return []
-            else:
-                return []
-        elif os.path.isdir(local_file_path):
-            access_path = []
-            for root, dirs, files in os.walk(local_file_path):
-                for file in files:
-                    file_path = os.path.join(root, file)
-                    if os.path.exists(file_path):
-                        if os.access(file_path, os.R_OK):
-                            access_path.append(file_path)
-                        else:
-                            continue
-                    else:
-                        continue
-            return access_path
-        else:
-            return []
-
-    def load_from_oss_source(self, oss_path: str) -> List[str]:
-        if not os.path.exists(self.download_path):
-            os.makedirs(self.download_path)
-
-        self.download(oss_path=oss_path)
-
-        file_list = []
-        for root, dirs, files in os.walk(self.download_path):
-            for file in files:
-                file_path = os.path.join(self.download_path, file)
-                file_list.append(file_path)
-
-        success_file_list = []
-        for file_path in file_list:
-            if self.load_from_file(file_path):
-                success_file_list.append(file_path)
-
-        return success_file_list
-
-    def download(self, oss_path: str):
-        if self.source == 'MinIO':
-            file_list = self.mi.list_objects(bucket_name=self.bucket_name, prefix=oss_path)
-            for path in file_list:
-                self.mi.download_file(bucket_name=self.bucket_name, object_name=path,
-                                      file_path=f'{self.download_path}/{path.split("/")[-1]}')
-        elif self.source == "Oss":
-            keys = self.oss.get_objects_in_folders(prefix=oss_path)
-            for path in keys:
-                self.oss.get_object_to_file(object_name=path,
-                                            file_path=f'{self.download_path}/{path.split("/")[-1]}')
-
-    def upload(self, local_file_path: str, save_prefix: str):
-        if self.source == 'MinIO':
-            if os.path.isdir(local_file_path):
-                for root, dirs, files in os.walk(local_file_path):
-                    for file in files:
-                        file_path = os.path.join(root, file)
-                        self.mi.upload_file(bucket_name=self.bucket_name, object_name=save_prefix + f'{file}',
-                                            file_path=file_path)
-            elif os.path.isfile(local_file_path):
-                self.mi.upload_file(bucket_name=self.bucket_name,
-                                    object_name=save_prefix + os.path.basename(local_file_path),
-                                    file_path=local_file_path)
-            else:
-                pass
-
-        elif self.source == "Oss":
-            if os.path.isdir(local_file_path):
-                self.oss.put_object_from_folder(object_folder_name=save_prefix, local_folder_path=local_file_path)
-            elif os.path.isfile(local_file_path):
-                self.oss.put_object_from_file(object_name=save_prefix + os.path.basename(local_file_path),
-                                              file_path=local_file_path)
-            else:
-                pass
-
-    def share(self, oss_path: str,
-              expires: int = None,
-              aliyun_oss_url_prefix: str = None,
-              csnt_url_prefix: str = None):
-        if self.source == 'MinIO':
-            return self.mi.get_object_tmp_link(bucket_name=self.bucket_name, object_name=oss_path, expires=expires)
-        elif self.source == "Oss":
-            return self.oss.get_oss_url(object_name=oss_path,
-                                        url_expires_time=expires,
-                                        aliyun_oss_url_prefix=aliyun_oss_url_prefix,
+import os
+from typing import List
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient
+
+
+class DataLoader:
+    def __init__(self, endpoint: str = None, secret_key: str = None, access_key: str = None,
+                 bucket_name: str = None, source: str = None):
+        if source and source == 'Oss':
+            self.oss = OssClient(
+                oss_endpoint=endpoint,
+                oss_access_key_secret=secret_key,
+                oss_access_key_id=access_key,
+                oss_bucket_name=bucket_name
+            )
+        elif source and source == 'MinIO':
+            self.mi = MinIOClient(
+                endpoint=endpoint,
+                secret_key=secret_key,
+                access_key=access_key,
+                bucket_name=bucket_name
+            )
+        self.download_path = str('./download_file')
+        self.source = source
+        self.bucket_name = bucket_name
+
+    @staticmethod
+    def load_from_file(local_file_path) -> List[str]:
+        if os.path.isfile(local_file_path):
+            if os.path.exists(local_file_path):
+                if os.access(local_file_path, os.R_OK):
+                    return [local_file_path]
+                else:
+                    return []
+            else:
+                return []
+        elif os.path.isdir(local_file_path):
+            access_path = []
+            for root, dirs, files in os.walk(local_file_path):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    if os.path.exists(file_path):
+                        if os.access(file_path, os.R_OK):
+                            access_path.append(file_path)
+                        else:
+                            continue
+                    else:
+                        continue
+            return access_path
+        else:
+            return []
+
+    def load_from_oss_source(self, oss_path: str) -> List[str]:
+        if not os.path.exists(self.download_path):
+            os.makedirs(self.download_path)
+
+        self.download(oss_path=oss_path)
+
+        file_list = []
+        for root, dirs, files in os.walk(self.download_path):
+            for file in files:
+                file_path = os.path.join(self.download_path, file)
+                file_list.append(file_path)
+
+        success_file_list = []
+        for file_path in file_list:
+            if self.load_from_file(file_path):
+                success_file_list.append(file_path)
+
+        return success_file_list
+
+    def download(self, oss_path: str):
+        if self.source == 'MinIO':
+            file_list = self.mi.list_objects(bucket_name=self.bucket_name, prefix=oss_path)
+            for path in file_list:
+                self.mi.download_file(bucket_name=self.bucket_name, object_name=path,
+                                      file_path=f'{self.download_path}/{path.split("/")[-1]}')
+        elif self.source == "Oss":
+            keys = self.oss.get_objects_in_folders(prefix=oss_path)
+            for path in keys:
+                self.oss.get_object_to_file(object_name=path,
+                                            file_path=f'{self.download_path}/{path.split("/")[-1]}')
+
+    def upload(self, local_file_path: str, save_prefix: str):
+        if self.source == 'MinIO':
+            if os.path.isdir(local_file_path):
+                for root, dirs, files in os.walk(local_file_path):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        self.mi.upload_file(bucket_name=self.bucket_name, object_name=save_prefix + f'{file}',
+                                            file_path=file_path)
+            elif os.path.isfile(local_file_path):
+                self.mi.upload_file(bucket_name=self.bucket_name,
+                                    object_name=save_prefix + os.path.basename(local_file_path),
+                                    file_path=local_file_path)
+            else:
+                pass
+
+        elif self.source == "Oss":
+            if os.path.isdir(local_file_path):
+                self.oss.put_object_from_folder(object_folder_name=save_prefix, local_folder_path=local_file_path)
+            elif os.path.isfile(local_file_path):
+                self.oss.put_object_from_file(object_name=save_prefix + os.path.basename(local_file_path),
+                                              file_path=local_file_path)
+            else:
+                pass
+
+    def share(self, oss_path: str,
+              expires: int = None,
+              aliyun_oss_url_prefix: str = None,
+              csnt_url_prefix: str = None):
+        if self.source == 'MinIO':
+            return self.mi.get_object_tmp_link(bucket_name=self.bucket_name, object_name=oss_path, expires=expires)
+        elif self.source == "Oss":
+            return self.oss.get_oss_url(object_name=oss_path,
+                                        url_expires_time=expires,
+                                        aliyun_oss_url_prefix=aliyun_oss_url_prefix,
                                         csnt_url_prefix=csnt_url_prefix)
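The loader/core.py hunk above carries the DataLoader class over unchanged apart from line endings. A short usage sketch derived only from the constructor and methods shown in that hunk; the endpoint, keys, bucket, and paths below are placeholders, and purely local loading needs no object-storage client at all:

```python
from datamax.loader.core import DataLoader

# Local loading: load_from_file is a staticmethod that returns readable paths
# for a single file or for every file under a directory.
readable_files = DataLoader.load_from_file("./docs")
print(readable_files)

# Object-storage loading: `source` selects the MinIO or Oss client built in __init__.
loader = DataLoader(
    endpoint="minio.example.com:9000",  # placeholder endpoint
    access_key="YOUR_ACCESS_KEY",       # placeholder credentials
    secret_key="YOUR_SECRET_KEY",
    bucket_name="my-bucket",
    source="MinIO",
)
# Downloads objects under the prefix into ./download_file and returns the readable local paths.
downloaded = loader.load_from_oss_source(oss_path="datasets/")
```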