pydatamax 0.1.5__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/OssHandler.py +85 -51
- datamax/parser/__init__.py +1 -1
- datamax/parser/base.py +2 -2
- datamax/parser/core.py +205 -31
- datamax/parser/doc_parser.py +2 -5
- datamax/parser/docx_parser.py +3 -6
- datamax/parser/epub_parser.py +2 -5
- datamax/parser/html_parser.py +2 -5
- datamax/parser/image_parser.py +18 -14
- datamax/parser/md_parser.py +67 -4
- datamax/parser/pdf_parser.py +59 -20
- datamax/parser/ppt_parser.py +3 -5
- datamax/parser/pptx_parser.py +10 -13
- datamax/parser/txt_parser.py +2 -5
- datamax/parser/xls_parser.py +26 -0
- datamax/parser/xlsx_parser.py +65 -4
- datamax/utils/__init__.py +1 -0
- datamax/utils/constants.py +58 -0
- datamax/utils/data_cleaner.py +45 -28
- datamax/utils/env_setup.py +80 -0
- datamax/utils/gotocr_pdf.py +265 -0
- datamax/utils/mineru_operator.py +62 -0
- datamax/utils/paddleocr_pdf_operator.py +2 -1
- datamax/utils/qa_generator.py +376 -0
- datamax/utils/tokenizer.py +1 -1
- pydatamax-0.1.11.dist-info/METADATA +271 -0
- pydatamax-0.1.11.dist-info/RECORD +39 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info/licenses}/LICENSE +0 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/top_level.txt +1 -0
- tests/__init__.py +0 -0
- tests/test_basic.py +20 -0
- pydatamax-0.1.5.dist-info/METADATA +0 -282
- pydatamax-0.1.5.dist-info/RECORD +0 -31
datamax/parser/image_parser.py
CHANGED
```diff
@@ -1,30 +1,34 @@
+import os
 import pathlib
 import sys
-from
+from datamax.utils import setup_environment
+
+setup_environment(use_gpu=True)
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
 from datamax.parser.base import MarkdownOutputVo
+
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
 sys.path.insert(0, str(ROOT_DIR))
 from datamax.parser.base import BaseLife
-
+from datamax.parser.pdf_parser import PdfParser
+from PIL import Image
 
 class ImageParser(BaseLife):
-    def __init__(self,
+    def __init__(self,file_path: str):
         super().__init__()
         self.file_path = file_path
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
             title = self.get_file_extension(file_path)
-
-
-
-
-
-
-
-
-            output_vo
-            return output_vo.to_dict()
+            output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
+            image = Image.open(file_path)
+            image.save(output_pdf_path, 'PDF', resolution=100.0)
+            pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
+            output_vo = pdf_parser.parse(output_pdf_path)
+            if os.path.exists(output_pdf_path):
+                # shutil.rmtree(f'uploaded_files/markdown')
+                os.remove(output_pdf_path)
+            return output_vo
         except Exception as e:
             raise e
-
```
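The rewritten ImageParser routes images through OCR by saving them as a one-page PDF and delegating to the MinerU-backed PdfParser; note that importing the module now calls setup_environment(use_gpu=True) as a side effect. A minimal usage sketch, assuming a working MinerU/magic-pdf install (the file name is hypothetical):

```python
# Hypothetical usage sketch of the new image -> PDF -> MinerU pipeline.
from datamax.parser.image_parser import ImageParser  # import triggers setup_environment(use_gpu=True)

parser = ImageParser(file_path="scanned_page.png")   # hypothetical input file
result = parser.parse("scanned_page.png")            # dict produced by PdfParser.parse()
```

One caveat worth flagging: os.path.basename(file_path).strip(title) uses str.strip, which removes a character set rather than a suffix, so if get_file_extension returns the literal extension, the temporary PDF name can lose extra leading and trailing characters that happen to occur in it.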
datamax/parser/md_parser.py
CHANGED
```diff
@@ -1,10 +1,73 @@
+import pathlib
+import sys
+from typing import Union
+
+ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+sys.path.insert(0, str(ROOT_DIR))
+from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
+from loguru import logger
 
 
-class
+class MarkdownParser(BaseLife):
+    """
+    Parser for Markdown files that follows the same pattern as PdfParser.
+    Handles .md and .markdown file extensions.
+    """
 
-    def __init__(self,
+    def __init__(self,
+                 file_path: Union[str, list],
+                 ):
+        super().__init__()
         self.file_path = file_path
 
-
-
+    @staticmethod
+    def read_markdown_file(file_path: str) -> str:
+        """
+        Reads the content of a markdown file.
+
+        Args:
+            file_path: Path to the markdown file
+
+        Returns:
+            str: Content of the markdown file
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        except Exception as e:
+            logger.error(f"Error reading markdown file {file_path}: {e}")
+            raise
+
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        """
+        Parses a markdown file and returns a MarkdownOutputVo.
+
+        Args:
+            file_path: Path to the markdown file
+
+        Returns:
+            MarkdownOutputVo: Structured output containing the markdown content
+        """
+        try:
+            title = self.get_file_extension(file_path)
+
+            # Read markdown content
+            md_content = self.read_markdown_file(file_path)
+
+            # Generate lifecycle metadata
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN"
+            )
+
+            # Create and return output VO
+            output_vo = MarkdownOutputVo(title, md_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+
+        except Exception as e:
+            logger.error(f"Failed to parse markdown file {file_path}: {e}")
+            raise
```
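The new MarkdownParser is a read-through parser: no cleaning or token counting, just file content plus lifecycle metadata. A sketch of how it would be called (the file name is hypothetical):

```python
# Minimal sketch, assuming the package layout shown in this diff.
from datamax.parser.md_parser import MarkdownParser

parser = MarkdownParser(file_path="README.md")  # hypothetical input
result = parser.parse("README.md")              # returns MarkdownOutputVo.to_dict()
# result carries the title (here the file extension), the raw markdown text,
# and one lifecycle entry with domain/usage_purpose/life_type metadata.
```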
datamax/parser/pdf_parser.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import os
 import pathlib
 import sys
-import
+import subprocess
 from typing import Union
 
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
@@ -9,18 +9,51 @@ sys.path.insert(0, str(ROOT_DIR))
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
 from langchain_community.document_loaders import PyMuPDFLoader
-from
-from datamax.utils.
+from loguru import logger
+from datamax.utils.mineru_operator import pdf_processor
 
 
 class PdfParser(BaseLife):
 
-    def __init__(self,
+    def __init__(self,
+                 file_path: Union[str, list],
+                 use_mineru: bool = False,
+                 ):
         super().__init__()
+
         self.file_path = file_path
-        self.
-
-
+        self.use_mineru = use_mineru
+
+    def mineru_process(self, input_pdf_filename, output_dir):
+        proc = None
+        try:
+            logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
+            command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
+            proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            # Wait for the command to finish
+            stdout, stderr = proc.communicate()
+            # Check whether the command succeeded
+            if proc.returncode != 0:
+                raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
+
+            logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
+
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            if proc is not None:
+                proc.kill()
+                proc.wait()
+                logger.info("The process was terminated due to an error.")
+            raise  # Re-raise the exception to let the caller handle it
+
+        finally:
+            # Make sure the child process has exited
+            if proc is not None:
+                if proc.poll() is None:
+                    proc.kill()
+                    proc.wait()
+                    logger.info("The process was terminated due to timeout or completion.")
 
     @staticmethod
     def read_pdf_file(file_path) -> str:
@@ -37,23 +70,29 @@ class PdfParser(BaseLife):
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
             title = self.get_file_extension(file_path)
-
-
-
-
+
+            if self.use_mineru:
+                output_dir = 'uploaded_files'
+                output_folder_name = os.path.basename(file_path).replace(".pdf", "")
+                # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
+                # if os.path.exists(output_mineru):
+                #     pass
+                # else:
+                #     self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
+                # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
+
+                # todo: is it necessary to keep this consistent with the API's default save path?
+                output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
+
+                if os.path.exists(output_mineru):
+                    mk_content = open(output_mineru, 'r', encoding='utf-8').read()
                 else:
-
-                output_md_dir = f'./output/{os.path.basename(file_path).replace(".pdf", "_ocr.md")}'
-                docx2markdown.docx_to_markdown(output_docx_dir, output_md_dir)
-                mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
-                token_count = self.tk_client.get_tokenizer(content=mk_content)
+                    mk_content = pdf_processor.process_pdf(file_path)
             else:
                 content = self.read_pdf_file(file_path=file_path)
-
-                mk_content = clean_text
-                token_count = self.tk_client.get_tokenizer(content=mk_content.get('text', ''))
+                mk_content = content
 
-            lifecycle = self.generate_lifecycle(source_file=file_path,
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
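parse() now has three outcomes: with use_mineru=True it reuses uploaded_files/markdown/&lt;name&gt;.md when a cached copy exists, otherwise it calls pdf_processor.process_pdf; with use_mineru=False it returns the PyMuPDF text unmodified (the old cleaning and token counting are gone). A sketch of both paths, assuming the magic-pdf toolchain is installed for the MinerU branch (the file name is hypothetical):

```python
# Minimal sketch of the two parsing paths introduced here.
from datamax.parser.pdf_parser import PdfParser

# Plain text extraction via PyMuPDF:
plain = PdfParser("paper.pdf", use_mineru=False).parse("paper.pdf")

# Layout-aware markdown: cached MinerU output if present,
# else pdf_processor.process_pdf() from datamax.utils.mineru_operator:
ocr = PdfParser("paper.pdf", use_mineru=True).parse("paper.pdf")
```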
datamax/parser/ppt_parser.py
CHANGED
```diff
@@ -7,7 +7,6 @@ from pathlib import Path
 from typing import Union
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text
 from datamax.utils.ppt_extract import PPtExtractor
 
 
@@ -64,10 +63,9 @@ class PPtParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
-            clean_text = clean_original_text(content)
-            mk_content =
-
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            # clean_text = clean_original_text(content)
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
datamax/parser/pptx_parser.py
CHANGED
```diff
@@ -3,7 +3,6 @@ from typing import Union
 from pptx import Presentation
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text
 
 
 class PPtxParser(BaseLife):
@@ -20,14 +19,14 @@ class PPtxParser(BaseLife):
                 for shape in slide.shapes:
                     if shape.has_text_frame:
                         content += shape.text + '\n'
-                    if shape.shape_type == 13:
-
-
-
-
-
-
-
+                    # if shape.shape_type == 13:
+                    #     if not os.path.exists("extracted_images"):
+                    #         os.makedirs("extracted_images")
+                    #     image = shape.image
+                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
+                    #     with open(image_filename, 'wb') as img_file:
+                    #         img_file.write(image.blob)
+                    #     content += ('[' + image_filename + ']')
             return content
         except Exception:
             raise
@@ -36,10 +35,8 @@ class PPtxParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
datamax/parser/txt_parser.py
CHANGED
```diff
@@ -2,7 +2,6 @@ import chardet
 from typing import Union
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text
 
 
 class TxtParser(BaseLife):
@@ -37,10 +36,8 @@ class TxtParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_txt_file(file_path=file_path)  # the real data is loaded via load
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
datamax/parser/xls_parser.py
ADDED
```diff
@@ -0,0 +1,26 @@
+from datamax.parser.base import MarkdownOutputVo
+from datamax.parser.base import BaseLife
+import pandas as pd
+import warnings
+
+warnings.filterwarnings("ignore")
+
+
+class XlsParser(BaseLife):
+    """xlsx or xls table use markitdown from Microsoft so magic for table!"""
+
+    def __init__(self, file_path):
+        super().__init__()
+        self.file_path = file_path
+
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            df = pd.read_excel(file_path)
+            mk_content = df.to_markdown(index=False)
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
+            raise e
```
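Despite the docstring's mention of markitdown, XlsParser is a thin pandas wrapper: the conversion is DataFrame.to_markdown, which relies on the tabulate package. A self-contained look at the markdown it produces; the data is invented for illustration:

```python
import pandas as pd

df = pd.DataFrame({"port": ["Shanghai", "Ningbo"], "teu": [47, 33]})
print(df.to_markdown(index=False))
# Prints a pipe table along these lines:
# | port     |   teu |
# |:---------|------:|
# | Shanghai |    47 |
# | Ningbo   |    33 |
```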
datamax/parser/xlsx_parser.py
CHANGED
```diff
@@ -1,10 +1,71 @@
+import multiprocessing
+import time
+from multiprocessing import Queue
 from datamax.parser.base import MarkdownOutputVo
+from datamax.parser.base import BaseLife
+from openpyxl import load_workbook
+import warnings
+from markitdown import MarkItDown
 
+warnings.filterwarnings("ignore")
 
-class
+class XlsxParser(BaseLife):
+    # singleton
 
-
+    _markitdown_instance = None
+
+    @classmethod
+    def get_markitdown(cls):
+        if cls._markitdown_instance is None:
+            cls._markitdown_instance = MarkItDown()
+        return cls._markitdown_instance
+
+    def __init__(self, file_path, timeout):
+        super().__init__()
         self.file_path = file_path
+        self.timeout = timeout
+        self.markitdown = self.get_markitdown()
+
+    def _parse(self, file_path: str, result_queue: Queue) -> dict:
+        try:
+            wb = load_workbook(
+                filename=file_path,
+                data_only=True,
+                read_only=True
+            )
+            wb.close()
+        except Exception as e:
+            raise e
+
+        mk_content = self.markitdown.convert(file_path).text_content
+        lifecycle = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type="LLM_ORIGIN"
+        )
+        output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+        output_vo.add_lifecycle(lifecycle)
+        result_queue.put(output_vo.to_dict())
+        time.sleep(0.5)
+        return output_vo.to_dict()
+
+    def parse(self, file_path: str) -> dict:
+        import time
+        result_queue = Queue()
+        process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
+        process.start()
+        start_time = time.time()
 
-
-
+        # ttl
+        while time.time() - start_time < self.timeout:
+            print(f"plz waiting...: {int(time.time() - start_time)}")
+            if not process.is_alive():
+                break
+            if not result_queue.empty():
+                return result_queue.get()
+            time.sleep(1)
+        else:
+            # killed
+            process.terminate()
+            process.join()
```
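The interesting change here is the timeout scheme: _parse runs in a child process, posts its result through a multiprocessing.Queue, and parse polls until the deadline, terminating the child if it runs over (and implicitly returning None in that case). A stripped-down sketch of the same pattern:

```python
# Minimal sketch of XlsxParser's run-with-timeout pattern.
import multiprocessing
import time
from multiprocessing import Queue


def work(q):
    time.sleep(2)              # stand-in for the MarkItDown conversion
    q.put({"text": "done"})    # hand the result back to the parent


if __name__ == "__main__":
    q = Queue()
    proc = multiprocessing.Process(target=work, args=(q,))
    proc.start()
    deadline = time.time() + 5  # like XlsxParser's timeout argument
    result = None
    while time.time() < deadline:
        if not q.empty():       # check the queue before liveness, so a result
            result = q.get()    # posted just before exit is not dropped
            break
        if not proc.is_alive():
            break
        time.sleep(0.1)
    else:                       # deadline hit without a break: kill the child
        proc.terminate()
    proc.join()
    print(result)
```

Checking the queue before is_alive sidesteps the race that the time.sleep(0.5) at the end of _parse papers over: without it, a child that posts its result and exits between polls trips the is_alive break before the queue is ever read.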
datamax/utils/constants.py
ADDED
```diff
@@ -0,0 +1,58 @@
+def get_system_prompt(knowledge):
+    system_prompt = f"""
+    You are a precise and efficient assistant for generating shipping QA pairs. Your job is to generate, from the shipping knowledge the user provides, a question and an answer (a QA pair) for each knowledge point, so that the knowledge point is properly tested.
+    You must follow the annotation requirements and the notes exactly. Please read this shipping knowledge in full: {knowledge}
+
+    Objective:
+    Your goal is to generate correct and precise QA pairs from the given material, making sure to include every correct option mentioned in the original text and to spell all technical terms correctly.
+
+    Style:
+    Answer in the style of an official shipping-domain expert.
+
+    Tone:
+    Your tone should be formal.
+
+    Audience:
+    Your audience is data annotators, who will revise their annotations based on your output.
+
+    Response:
+    Return your response as JSON in the following format:
+    ```json
+    {{
+        "instruction": "<generated question>",
+        "input": "",
+        "output": "<answer generated from the knowledge>"
+    }}
+
+    # Annotation requirements
+    1. For nouns that could be ambiguous, add the full qualifier needed to remove the ambiguity.
+        a. "Technical Guidelines for Intelligent Surface Search-and-Rescue Robots"
+            i. The continuous operating time of the search-and-rescue robot must be no less than 30 minutes. -> wrong
+            ii. The continuous operating time of the intelligent surface search-and-rescue robot must be no less than 30 minutes. -> correct
+    2. The answer of a QA pair must be at least 50 characters long; only this lower bound applies, with no upper limit.
+    3. You may cite the chapter number of a rule or regulation, but you must also quote the relevant original text of that chapter.
+        a. The CLEAN notation may continue to be maintained per the requirements of Part 8, Chapter 8 of the originally applicable CCS "Rules for Classification of Sea-going Steel Ships". X
+        b. The CLEAN notation may continue to be maintained per the requirements of Part 8, Chapter 8 [original text] of the originally applicable CCS "Rules for Classification of Sea-going Steel Ships". √
+    4. Use standard markdown to express multi-level structure, e.g. "xxx\n1.1. xxx\n1.1.1. xxx".
+    5. Do not use numbering marks such as (1), 一、①、Ⅰ、壹, so the format stays uniform.
+    6. For technical terms or keywords that need emphasis, use ** ** instead of single or double quotes.
+    7. If the data contains obvious errors, such as grammatical or logical errors, remove them yourself.
+    8. Format the data as markdown and keep the \n line-break markers.
+    9. Numbers with a definite meaning must be 100% accurate.
+    10. The answer of a QA pair must not be a bare option reference such as ① or A.
+    11. The question of a QA pair must not take forms such as "which of the following is wrong" or "which of these is correct".
+    12. The text must be professional; avoid redundant connectives or adverbs such as "because", "therefore", "besides", "first of all".
+    13. Long passages with parallel logic need \n line breaks and ordered-list markers 1. 2. 3.
+    14. Long passages with hierarchical logic must have the hierarchy rearranged sensibly; do not pile up "Chapter 1, Section 2, 1.1.2 xxxxx"-style content as-is, but rewrite the levels into fluent, well-ordered sentences.
+
+    # Notes
+    1. Pick the five most valuable knowledge points and return a single JSON list.
+    2. Keep the format of every JSON object consistent, and make each generated output answer at least 50 characters,
+       expanding professionally and on-topic from the complete shipping knowledge.
+    3. All technical terms must be spelled exactly right.
+    4. The question of each QA pair should target the key point of the provided knowledge.
+    5. Your only source of information is the provided shipping knowledge.
+    6. The JSON list must contain no more than 5 items.
+    7. Return only the result, with no extra markdown format markers such as ```python ```json
+    """
+    return system_prompt
```
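get_system_prompt bakes the source text into a QA-generation template (translated above from the original Chinese) that demands a bare JSON list of at most five QA pairs. A hedged sketch of a consumer follows; the chat-model call is a placeholder, not the API that the new qa_generator.py (not shown in this diff view) actually uses, and the import path assumes the function lands in datamax/utils/constants.py as the file summary indicates:

```python
import json

from datamax.utils.constants import get_system_prompt  # assumed location

knowledge = "Excerpt of a shipping regulation..."       # placeholder input
system_prompt = get_system_prompt(knowledge)

# response_text = chat_model(system=system_prompt, ...) # placeholder call
response_text = '[{"instruction": "...", "input": "", "output": "..."}]'
qa_pairs = json.loads(response_text)  # the prompt requires a bare JSON list, <= 5 items
```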
datamax/utils/data_cleaner.py
CHANGED
```diff
@@ -1,6 +1,26 @@
+import os
 import re
-import
+import sys
 from collections import Counter
+from contextlib import contextmanager
+
+
+@contextmanager
+def suppress_stdout():
+    # Save the original standard output stream
+    original_stdout = sys.stdout
+    # Redirect standard output to an empty device ('nul' on Windows, '/dev/null' on Unix/Linux/MacOS)
+    with open(os.devnull, "w") as devnull:
+        sys.stdout = devnull
+        try:
+            yield
+        finally:
+            # Restore the original standard output stream
+            sys.stdout = original_stdout
+
+
+with suppress_stdout():
+    import jionlp as jio
 
 
 class AbnormalCleaner:
@@ -20,23 +40,25 @@ class AbnormalCleaner:
 
     def convert_newlines(self):
         """Convert \r to \n and multiple \n to a single \n"""
-        self.parsed_data = re.sub(r
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r"\r", "", self.parsed_data)
+        self.parsed_data = re.sub(r"\n+", "\n", self.parsed_data)
         return self.parsed_data
 
     def single_space(self):
         """Convert strings with more than 2 spaces to a single space"""
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r" {2,}", " ", self.parsed_data)
         return self.parsed_data
 
     def tabs_to_spaces(self):
         """Convert tab characters to 4 spaces"""
-        self.parsed_data = self.parsed_data.replace(
+        self.parsed_data = self.parsed_data.replace("\t", "    ")
         return self.parsed_data
 
     def remove_invisible_chars(self):
         """Remove invisible ASCII characters"""
-        self.parsed_data = re.sub(
+        self.parsed_data = re.sub(
+            r"[\x00-\x09\x0b-\x1f\x7f-\xa0]", "", self.parsed_data
+        )
         return self.parsed_data
 
     def simplify_chinese(self):
@@ -50,7 +72,7 @@ class AbnormalCleaner:
 
     def point_conversion(self):
         """Bullet point conversion"""
-        self.parsed_data = self.parsed_data.replace(
+        self.parsed_data = self.parsed_data.replace("\n• ", "\n- ")
         return self.parsed_data
 
     def clean_space(self):
@@ -58,8 +80,9 @@ class AbnormalCleaner:
         return self.parsed_data
 
     def clean_tips(self):
-        self.parsed_data = self.parsed_data.replace(
-
+        self.parsed_data = self.parsed_data.replace(
+            "EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.", ""
+        )
         return self.parsed_data
 
     def markdown_format(self):
@@ -77,9 +100,7 @@ class AbnormalCleaner:
             # After cleaning invisible characters, perform another multi-line merge, remove space operation
             self.convert_newlines()
 
-            result = {
-                "text": self.parsed_data
-            }
+            result = {"text": self.parsed_data}
             return result
 
         except Exception as e:
@@ -99,12 +120,10 @@ class AbnormalCleaner:
             self.remove_invisible_chars()
             # After cleaning invisible characters, perform another multi-line merge, remove space operation
             self.convert_newlines()
-            self.clean_space()
+            # self.clean_space()
            self.clean_tips()
 
-            result = {
-                "text": self.parsed_data
-            }
+            result = {"text": self.parsed_data}
            return result
 
         except Exception as e:
@@ -114,13 +133,13 @@ class AbnormalCleaner:
 
 class TextFilter:
     def __init__(self, parsed_data):
-        self.parsed_data = parsed_data
+        self.parsed_data = parsed_data
 
     def filter_by_word_repetition(self, threshold=0.6):
         """Filter by word repetition rate"""
         text = self.parsed_data
         # Each two characters form a word
-        bi_grams = [text[i:i + 2] for i in range(0, len(text) - 1, 2)]
+        bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
         word_count = len(bi_grams)
         if word_count == 0:
             return False
@@ -146,7 +165,7 @@ class TextFilter:
         """Filter by numeric content"""
         text = self.parsed_data
         total_chars = len(text)
-        numeric_chars = len(re.findall(r
+        numeric_chars = len(re.findall(r"\d", text))
         if numeric_chars / total_chars > threshold:
             return False
         return True
@@ -160,9 +179,7 @@ class TextFilter:
         elif not self.filter_by_numeric_content():
             return {}
         else:
-            result = {
-                "text": self.parsed_data
-            }
+            result = {"text": self.parsed_data}
             return result
 
 
@@ -183,12 +200,12 @@ class PrivacyDesensitization:
 
     def replace_bank_id(self, text, token):
         # Match bank card numbers and replace
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r"\b\d{13,19}\b", token, text)
         return self.parsed_data
 
     def replace_customer_number(self, text, token):
         # Customer service hotlines are not easy to match and are not considered private data
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r"\d+-\d+-\d+", token, text)
         return self.parsed_data
 
     def replace_number(self):
@@ -201,7 +218,9 @@ class PrivacyDesensitization:
         # ID card
         self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
         # Bank card
-        self.parsed_data = self.replace_bank_id(
+        self.parsed_data = self.replace_bank_id(
+            self.parsed_data, token="COSCO_NUMBER"
+        )  # nosec B106 - this is a data-desensitization token, not a password
         # Dash-separated customer service hotlines
         # self.parsed_data = self.replace_customer_number(self.parsed_data, token="COSCO_NUMBER")
 
@@ -213,8 +232,6 @@ class PrivacyDesensitization:
         self.replace_email()
         self.replace_number()
 
-        result = {
-            "text": self.parsed_data
-        }
+        result = {"text": self.parsed_data}
 
         return result
```
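The headline addition is suppress_stdout, used above to swallow jionlp's import-time banner; the rest of the diff reconstructs the truncated regex calls and reformats the module black-style. The context manager is reusable for any chatty import:

```python
# The new context manager, exercised directly; it only redirects Python-level
# sys.stdout, so stderr and output from C extensions still get through.
from datamax.utils.data_cleaner import suppress_stdout

with suppress_stdout():
    print("swallowed: sys.stdout points at os.devnull here")
print("visible again: the original stdout is restored on exit")
```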