pydatamax 0.1.5__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,30 +1,34 @@
+ import os
  import pathlib
  import sys
- from paddleocr import PaddleOCR
+ from datamax.utils import setup_environment
+
+ setup_environment(use_gpu=True)
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
  from datamax.parser.base import MarkdownOutputVo
+
  ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
  sys.path.insert(0, str(ROOT_DIR))
  from datamax.parser.base import BaseLife
-
+ from datamax.parser.pdf_parser import PdfParser
+ from PIL import Image

  class ImageParser(BaseLife):
-     def __init__(self, file_path):
+     def __init__(self, file_path: str):
          super().__init__()
          self.file_path = file_path

      def parse(self, file_path: str) -> MarkdownOutputVo:
          try:
              title = self.get_file_extension(file_path)
-             ocr = PaddleOCR(use_angle_cls=True, lang='ch', show_log=False)
-             result = ocr.ocr(file_path, cls=True)
-             recognized_texts = [l[1][0] for line in result for l in line]
-             mk_content = '\n'.join(recognized_texts)
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
-                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
-             output_vo = MarkdownOutputVo(title, mk_content)
-             output_vo.add_lifecycle(lifecycle)
-             return output_vo.to_dict()
+             output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
+             image = Image.open(file_path)
+             image.save(output_pdf_path, 'PDF', resolution=100.0)
+             pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
+             output_vo = pdf_parser.parse(output_pdf_path)
+             if os.path.exists(output_pdf_path):
+                 # shutil.rmtree(f'uploaded_files/markdown')
+                 os.remove(output_pdf_path)
+             return output_vo
          except Exception as e:
              raise e
-
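
The rewritten ImageParser drops the direct PaddleOCR call: it saves the image as a temporary PDF via Pillow and delegates to PdfParser with MinerU enabled. A minimal usage sketch, assuming the class lives at datamax.parser.image_parser (path assumed by analogy with datamax.parser.pdf_parser) and a hypothetical scan.png:

    from datamax.parser.image_parser import ImageParser  # assumed module path

    parser = ImageParser(file_path='scan.png')
    result = parser.parse('scan.png')  # image -> temp PDF -> MinerU -> dict
    print(result)                      # MarkdownOutputVo.to_dict() payload

The temporary PDF is deleted after parsing, so only the returned dict survives.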
@@ -1,10 +1,73 @@
+ import pathlib
+ import sys
+ from typing import Union
+
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+ sys.path.insert(0, str(ROOT_DIR))
+ from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
+ from loguru import logger

- class Parser:
+ class MarkdownParser(BaseLife):
+     """
+     Parser for Markdown files that follows the same pattern as PdfParser.
+     Handles .md and .markdown file extensions.
+     """

-     def __init__(self, file_path):
+     def __init__(self,
+                  file_path: Union[str, list],
+                  ):
+         super().__init__()
          self.file_path = file_path

-     def parse(self) -> MarkdownOutputVo:
-         pass
+     @staticmethod
+     def read_markdown_file(file_path: str) -> str:
+         """
+         Reads the content of a markdown file.
+
+         Args:
+             file_path: Path to the markdown file
+
+         Returns:
+             str: Content of the markdown file
+         """
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 return f.read()
+         except Exception as e:
+             logger.error(f"Error reading markdown file {file_path}: {e}")
+             raise
+
+     def parse(self, file_path: str) -> MarkdownOutputVo:
+         """
+         Parses a markdown file and returns a MarkdownOutputVo.
+
+         Args:
+             file_path: Path to the markdown file
+
+         Returns:
+             MarkdownOutputVo: Structured output containing the markdown content
+         """
+         try:
+             title = self.get_file_extension(file_path)
+
+             # Read markdown content
+             md_content = self.read_markdown_file(file_path)
+
+             # Generate lifecycle metadata
+             lifecycle = self.generate_lifecycle(
+                 source_file=file_path,
+                 domain="Technology",
+                 usage_purpose="Documentation",
+                 life_type="LLM_ORIGIN"
+             )
+
+             # Create and return output VO
+             output_vo = MarkdownOutputVo(title, md_content)
+             output_vo.add_lifecycle(lifecycle)
+             return output_vo.to_dict()
+
+         except Exception as e:
+             logger.error(f"Failed to parse markdown file {file_path}: {e}")
+             raise
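
MarkdownParser now mirrors the lifecycle flow of the other parsers. A short sketch of the intended call pattern (the module path is an assumption; note that parse() returns a dict even though it is annotated MarkdownOutputVo):

    from datamax.parser.md_parser import MarkdownParser  # assumed module path

    parser = MarkdownParser(file_path='README.md')
    vo_dict = parser.parse('README.md')  # dict with lifecycle metadata attached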
@@ -1,7 +1,7 @@
  import os
  import pathlib
  import sys
- import docx2markdown
+ import subprocess
  from typing import Union

  ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
@@ -9,18 +9,51 @@ sys.path.insert(0, str(ROOT_DIR))
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
  from langchain_community.document_loaders import PyMuPDFLoader
- from datamax.utils import clean_original_text
- from datamax.utils.paddleocr_pdf_operator import use_paddleocr
+ from loguru import logger
+ from datamax.utils.mineru_operator import pdf_processor

  class PdfParser(BaseLife):

-     def __init__(self, file_path: Union[str, list], use_ocr: bool = False, use_gpu: bool = False, gpu_id: int = 6):
+     def __init__(self,
+                  file_path: Union[str, list],
+                  use_mineru: bool = False,
+                  ):
          super().__init__()
+
          self.file_path = file_path
-         self.use_ocr = use_ocr
-         self.use_gpu = use_gpu
-         self.gpu_id = gpu_id
+         self.use_mineru = use_mineru
+
+     def mineru_process(self, input_pdf_filename, output_dir):
+         proc = None
+         try:
+             logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
+             command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
+             proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+             # Wait for the command to finish
+             stdout, stderr = proc.communicate()
+             # Check whether the command executed successfully
+             if proc.returncode != 0:
+                 raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
+
+             logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
+
+         except Exception as e:
+             logger.error(f"Error: {e}")
+             if proc is not None:
+                 proc.kill()
+                 proc.wait()
+                 logger.info("The process was terminated due to an error.")
+             raise  # Re-raise the exception to let the caller handle it
+
+         finally:
+             # Make sure the child process has exited
+             if proc is not None:
+                 if proc.poll() is None:
+                     proc.kill()
+                     proc.wait()
+                     logger.info("The process was terminated due to timeout or completion.")

      @staticmethod
      def read_pdf_file(file_path) -> str:
@@ -37,23 +70,29 @@ class PdfParser(BaseLife):
      def parse(self, file_path: str) -> MarkdownOutputVo:
          try:
              title = self.get_file_extension(file_path)
-             if self.use_ocr:
-                 output_docx_dir = f'./output/{os.path.basename(file_path).replace(".pdf", "_ocr.docx")}'
-                 if os.path.exists(output_docx_dir):
-                     pass
+
+             if self.use_mineru:
+                 output_dir = 'uploaded_files'
+                 output_folder_name = os.path.basename(file_path).replace(".pdf", "")
+                 # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
+                 # if os.path.exists(output_mineru):
+                 #     pass
+                 # else:
+                 #     self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
+                 # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
+
+                 # todo: decide whether this should match the API's default save path
+                 output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
+
+                 if os.path.exists(output_mineru):
+                     mk_content = open(output_mineru, 'r', encoding='utf-8').read()
                  else:
-                     use_paddleocr(file_path, './output', self.use_gpu, self.gpu_id)
-                 output_md_dir = f'./output/{os.path.basename(file_path).replace(".pdf", "_ocr.md")}'
-                 docx2markdown.docx_to_markdown(output_docx_dir, output_md_dir)
-                 mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
-                 token_count = self.tk_client.get_tokenizer(content=mk_content)
+                     mk_content = pdf_processor.process_pdf(file_path)
              else:
                  content = self.read_pdf_file(file_path=file_path)
-                 clean_text = clean_original_text(content)
-                 mk_content = clean_text
-                 token_count = self.tk_client.get_tokenizer(content=mk_content.get('text', ''))
+                 mk_content = content

-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
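
With use_mineru=True the parser prefers a cached uploaded_files/markdown/<name>.md and only invokes pdf_processor.process_pdf() on a cache miss; mineru_process() shells out to the magic-pdf CLI. A hedged usage sketch with a hypothetical report.pdf:

    from datamax.parser.pdf_parser import PdfParser

    parser = PdfParser(file_path='report.pdf', use_mineru=True)
    vo_dict = parser.parse('report.pdf')  # cache hit reads the .md; miss runs MinerU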
@@ -7,7 +7,6 @@ from pathlib import Path
  from typing import Union
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
- from datamax.utils import clean_original_text
  from datamax.utils.ppt_extract import PPtExtractor

@@ -64,10 +63,9 @@ class PPtParser(BaseLife):
          try:
              title = self.get_file_extension(file_path)
              content = self.read_ppt_file(file_path=file_path)
-             clean_text = clean_original_text(content)
-             mk_content = clean_text.get('text', '')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+             # clean_text = clean_original_text(content)
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
@@ -3,7 +3,6 @@ from typing import Union
  from pptx import Presentation
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
- from datamax.utils import clean_original_text

  class PPtxParser(BaseLife):
@@ -20,14 +19,14 @@ class PPtxParser(BaseLife):
                  for shape in slide.shapes:
                      if shape.has_text_frame:
                          content += shape.text + '\n'
-                     if shape.shape_type == 13:
-                         if not os.path.exists("extracted_images"):
-                             os.makedirs("extracted_images")
-                         image = shape.image
-                         image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
-                         with open(image_filename, 'wb') as img_file:
-                             img_file.write(image.blob)
-                         content += ('[' + image_filename + ']')
+                     # if shape.shape_type == 13:
+                     #     if not os.path.exists("extracted_images"):
+                     #         os.makedirs("extracted_images")
+                     #     image = shape.image
+                     #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
+                     #     with open(image_filename, 'wb') as img_file:
+                     #         img_file.write(image.blob)
+                     #     content += ('[' + image_filename + ']')
              return content
          except Exception:
              raise
@@ -36,10 +35,8 @@ class PPtxParser(BaseLife):
          try:
              title = self.get_file_extension(file_path)
              content = self.read_ppt_file(file_path=file_path)
-             clean_text = clean_original_text(content)
-             mk_content = clean_text.get('text', '')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
@@ -2,7 +2,6 @@ import chardet
  from typing import Union
  from datamax.parser.base import BaseLife
  from datamax.parser.base import MarkdownOutputVo
- from datamax.utils import clean_original_text

  class TxtParser(BaseLife):
@@ -37,10 +36,8 @@ class TxtParser(BaseLife):
          try:
              title = self.get_file_extension(file_path)
              content = self.read_txt_file(file_path=file_path)  # the real data is loaded here
-             clean_text = clean_original_text(content)
-             mk_content = clean_text.get('text', '')
-             token_count = self.tk_client.get_tokenizer(content=mk_content)
-             lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+             mk_content = content
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                  usage_purpose="Documentation", life_type="LLM_ORIGIN")
              output_vo = MarkdownOutputVo(title, mk_content)
              output_vo.add_lifecycle(lifecycle)
@@ -0,0 +1,26 @@
+ from datamax.parser.base import MarkdownOutputVo
+ from datamax.parser.base import BaseLife
+ import pandas as pd
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+
+ class XlsParser(BaseLife):
+     """xlsx or xls table use markitdown from Microsoft so magic for table!"""
+
+     def __init__(self, file_path):
+         super().__init__()
+         self.file_path = file_path
+
+     def parse(self, file_path: str) -> MarkdownOutputVo:
+         try:
+             df = pd.read_excel(file_path)
+             mk_content = df.to_markdown(index=False)
+             lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
+             output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+             output_vo.add_lifecycle(lifecycle)
+             return output_vo.to_dict()
+         except Exception as e:
+             raise e
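
XlsParser leans entirely on pandas: read_excel() loads the sheet and DataFrame.to_markdown() renders the table (to_markdown needs the optional tabulate dependency, and .xls files additionally need xlrd). A self-contained illustration with made-up rows:

    import pandas as pd

    df = pd.DataFrame({'vessel': ['A', 'B'], 'dwt': [52000, 81000]})  # illustrative data
    print(df.to_markdown(index=False))  # GitHub-style markdown table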
@@ -1,10 +1,71 @@
+ import multiprocessing
+ import time
+ from multiprocessing import Queue
  from datamax.parser.base import MarkdownOutputVo
+ from datamax.parser.base import BaseLife
+ from openpyxl import load_workbook
+ import warnings
+ from markitdown import MarkItDown

+ warnings.filterwarnings("ignore")

- class Parser:
+ class XlsxParser(BaseLife):
+     # singleton MarkItDown converter shared across instances

-     def __init__(self, file_path):
+     _markitdown_instance = None
+
+     @classmethod
+     def get_markitdown(cls):
+         if cls._markitdown_instance is None:
+             cls._markitdown_instance = MarkItDown()
+         return cls._markitdown_instance
+
+     def __init__(self, file_path, timeout):
+         super().__init__()
          self.file_path = file_path
+         self.timeout = timeout
+         self.markitdown = self.get_markitdown()
+
+     def _parse(self, file_path: str, result_queue: Queue) -> dict:
+         try:
+             wb = load_workbook(
+                 filename=file_path,
+                 data_only=True,
+                 read_only=True
+             )
+             wb.close()
+         except Exception as e:
+             raise e
+
+         mk_content = self.markitdown.convert(file_path).text_content
+         lifecycle = self.generate_lifecycle(
+             source_file=file_path,
+             domain="Technology",
+             usage_purpose="Documentation",
+             life_type="LLM_ORIGIN"
+         )
+         output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+         output_vo.add_lifecycle(lifecycle)
+         result_queue.put(output_vo.to_dict())
+         time.sleep(0.5)
+         return output_vo.to_dict()
+
+     def parse(self, file_path: str) -> dict:
+         import time
+         result_queue = Queue()
+         process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
+         process.start()
+         start_time = time.time()

-     def parse(self) -> MarkdownOutputVo:
-         pass
+         # ttl: poll until the timeout expires
+         while time.time() - start_time < self.timeout:
+             print(f"plz waiting...: {int(time.time() - start_time)}")
+             if not process.is_alive():
+                 break
+             if not result_queue.empty():
+                 return result_queue.get()
+             time.sleep(1)
+         else:
+             # timed out: kill the worker process
+             process.terminate()
+             process.join()
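
The new XlsxParser runs MarkItDown conversion in a child process and polls a Queue so a hung workbook cannot block the caller; note that parse() falls through and returns None when the worker dies or the timeout expires without a queued result. A usage sketch (module path assumed):

    from datamax.parser.xlsx_parser import XlsxParser  # assumed module path

    parser = XlsxParser(file_path='data.xlsx', timeout=60)
    result = parser.parse('data.xlsx')  # dict on success, None on timeout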
datamax/utils/__init__.py CHANGED
@@ -1,4 +1,5 @@
  from datamax.utils.data_cleaner import AbnormalCleaner, TextFilter, PrivacyDesensitization
+ from datamax.utils.env_setup import setup_environment

  def clean_original_text(text):
@@ -0,0 +1,58 @@
+ def get_system_prompt(knowledge):
+     system_prompt = f"""
+     You are a precise and efficient assistant for generating shipping QA pairs. Your job is to generate, from the specific shipping knowledge provided by the user, a question and answer (a "QA pair") for each knowledge point, so that the pair tests that knowledge point.
+     You must follow the annotation requirements and notes below exactly. Please read the shipping knowledge in this format in full: {knowledge}
+
+     Objective:
+     Your goal is to generate correct and precise QA pairs from the given material, making sure to include every correct option mentioned in the source text and to spell all technical terms correctly.
+
+     Style:
+     Answer in the style of an official shipping-domain expert.
+
+     Tone:
+     Your tone should be formal.
+
+     Audience:
+     Your audience is data annotators, who will revise the output based on your annotations.
+
+     Response:
+     Return your response as JSON in the following format:
+     ```json
+     {{
+         "instruction": "<generated question>",
+         "input": "",
+         "output": "<answer generated from the knowledge>"
+     }}
+
+     # Annotation requirements
+     1. For nouns that may be ambiguous, give the full qualifier to remove the ambiguity.
+         a. "Technical Guide for Intelligent Surface Search-and-Rescue Robots"
+             i. The search-and-rescue robot must operate continuously for no less than 30 minutes. -> wrong
+             ii. The intelligent surface search-and-rescue robot must operate continuously for no less than 30 minutes. -> correct
+     2. The answer of a QA pair must be at least 50 characters; only the lower bound is enforced, with no upper limit.
+     3. You may cite the section number of a rule or regulation, but you must also quote the relevant original text of that section.
+         a. The CLEAN notation may continue to be maintained per Part 8 Chapter 8 of the originally applicable CCS Rules for Classification of Sea-going Steel Ships. X
+         b. The CLEAN notation may continue to be maintained per Part 8 Chapter 8 [original text] of the originally applicable CCS Rules for Classification of Sea-going Steel Ships. √
+     4. Use standard markdown to express multi-level structure, e.g. "xxx\n1.1. xxx\n1.1.1. xxx".
+     5. Do not use ordinal markers such as (1), 一、, ①, Ⅰ, 壹, etc., to keep the format uniform.
+     6. For proper nouns or keywords that need emphasis, use ** ** instead of single or double quotes.
+     7. If the data contains obvious errors, such as grammatical or logical mistakes, remove them yourself.
+     8. Format the data as markdown and keep \n newline markers.
+     9. Numbers with a definite meaning must be 100% accurate.
+     10. The answer of a QA pair must not be a bare option reference such as ① or A.
+     11. The question of a QA pair must not take forms like "which of the following is wrong" or "which of these is correct".
+     12. The text must be professional; avoid redundant connectives and adverbs such as "because", "therefore", "in addition", "first of all".
+     13. Long passages with parallel logic need \n line breaks and ordered-list markers 1. 2. 3.
+     14. Long passages with hierarchical logic must have their levels arranged sensibly; do not pile up "Chapter 1 Section 2 1.1.2 xxxxx" content directly, but rephrase the hierarchy into fluent sentences.
+
+     # Notes
+     1. Select the five most valuable knowledge points and return a single JSON list.
+     2. Every JSON object must keep the same format, and the answer in "output" must be at least 50 characters; expand it professionally and on-topic, based on the complete shipping knowledge.
+     3. All technical terms must be spelled exactly right.
+     4. The question of each QA pair should target the key point of the provided knowledge.
+     5. Your only source of information is the provided shipping knowledge.
+     6. The JSON list must contain at most 5 items.
+     7. Return only the result, with no markdown format markers such as ```python or ```json.
+     """
+     return system_prompt
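
The prompt builder is a plain function of the knowledge chunk; this diff does not show how datamax feeds it to a model, but a typical OpenAI-style call might look like this sketch (client setup, model name, and knowledge value are all placeholders):

    from openai import OpenAI

    client = OpenAI()  # hypothetical client configuration
    resp = client.chat.completions.create(
        model='gpt-4o-mini',  # placeholder model
        messages=[{'role': 'system', 'content': get_system_prompt('<shipping knowledge chunk>')}],
    )
    print(resp.choices[0].message.content)  # expected: a JSON list of QA pairs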
@@ -1,6 +1,26 @@
+ import os
  import re
- import jionlp as jio
+ import sys
  from collections import Counter
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def suppress_stdout():
+     # Save the original standard output stream
+     original_stdout = sys.stdout
+     # Redirect standard output to the null device ('nul' on Windows, '/dev/null' on Unix/Linux/macOS)
+     with open(os.devnull, "w") as devnull:
+         sys.stdout = devnull
+         try:
+             yield
+         finally:
+             # Restore the original standard output stream
+             sys.stdout = original_stdout
+
+
+ with suppress_stdout():
+     import jionlp as jio

  class AbnormalCleaner:
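
suppress_stdout() exists here to mute jionlp's import-time banner, but it is generic: anything printed inside the with-block goes to os.devnull. A quick check:

    with suppress_stdout():
        print('swallowed')   # redirected to os.devnull
    print('visible again')   # stdout restored by the finally block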
@@ -20,23 +40,25 @@ class AbnormalCleaner:

      def convert_newlines(self):
          """Convert \r to \n and multiple \n to a single \n"""
-         self.parsed_data = re.sub(r'\r', '', self.parsed_data)
-         self.parsed_data = re.sub(r'\n+', '\n', self.parsed_data)
+         self.parsed_data = re.sub(r"\r", "", self.parsed_data)
+         self.parsed_data = re.sub(r"\n+", "\n", self.parsed_data)
          return self.parsed_data

      def single_space(self):
          """Convert strings with more than 2 spaces to a single space"""
-         self.parsed_data = re.sub(r' {2,}', ' ', self.parsed_data)
+         self.parsed_data = re.sub(r" {2,}", " ", self.parsed_data)
          return self.parsed_data

      def tabs_to_spaces(self):
          """Convert tab characters to 4 spaces"""
-         self.parsed_data = self.parsed_data.replace('\t', '    ')
+         self.parsed_data = self.parsed_data.replace("\t", "    ")
          return self.parsed_data

      def remove_invisible_chars(self):
          """Remove invisible ASCII characters"""
-         self.parsed_data = re.sub(r'[\x00-\x09\x0b-\x1f\x7f-\xa0]', '', self.parsed_data)
+         self.parsed_data = re.sub(
+             r"[\x00-\x09\x0b-\x1f\x7f-\xa0]", "", self.parsed_data
+         )
          return self.parsed_data

      def simplify_chinese(self):
@@ -50,7 +72,7 @@ class AbnormalCleaner:

      def point_conversion(self):
          """Bullet point conversion"""
-         self.parsed_data = self.parsed_data.replace('\n• ', '\n- ')
+         self.parsed_data = self.parsed_data.replace("\n• ", "\n- ")
          return self.parsed_data

      def clean_space(self):
@@ -58,8 +80,9 @@ class AbnormalCleaner:
          return self.parsed_data

      def clean_tips(self):
-         self.parsed_data = self.parsed_data.replace("EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.",
-                                                     "")
+         self.parsed_data = self.parsed_data.replace(
+             "EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.", ""
+         )
          return self.parsed_data

      def markdown_format(self):
@@ -77,9 +100,7 @@ class AbnormalCleaner:
              # After cleaning invisible characters, perform another multi-line merge, remove space operation
              self.convert_newlines()

-             result = {
-                 "text": self.parsed_data
-             }
+             result = {"text": self.parsed_data}
              return result

          except Exception as e:
@@ -99,12 +120,10 @@ class AbnormalCleaner:
              self.remove_invisible_chars()
              # After cleaning invisible characters, perform another multi-line merge, remove space operation
              self.convert_newlines()
-             self.clean_space()
+             # self.clean_space()
              self.clean_tips()

-             result = {
-                 "text": self.parsed_data
-             }
+             result = {"text": self.parsed_data}
              return result

          except Exception as e:
@@ -114,13 +133,13 @@ class AbnormalCleaner:

  class TextFilter:
      def __init__(self, parsed_data):
-         self.parsed_data = parsed_data.get('text', '')
+         self.parsed_data = parsed_data

      def filter_by_word_repetition(self, threshold=0.6):
          """Filter by word repetition rate"""
          text = self.parsed_data
          # Each two characters form a word
-         bi_grams = [text[i:i + 2] for i in range(0, len(text) - 1, 2)]
+         bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
          word_count = len(bi_grams)
          if word_count == 0:
              return False
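
The repetition filter works on non-overlapping two-character chunks, so a trailing odd character is simply dropped. A quick check of the slicing:

    text = 'abcde'
    bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
    print(bi_grams)  # ['ab', 'cd'] -- the lone 'e' is discarded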
@@ -146,7 +165,7 @@ class TextFilter:
          """Filter by numeric content"""
          text = self.parsed_data
          total_chars = len(text)
-         numeric_chars = len(re.findall(r'\d', text))
+         numeric_chars = len(re.findall(r"\d", text))
          if numeric_chars / total_chars > threshold:
              return False
          return True
@@ -160,9 +179,7 @@ class TextFilter:
          elif not self.filter_by_numeric_content():
              return {}
          else:
-             result = {
-                 "text": self.parsed_data
-             }
+             result = {"text": self.parsed_data}
              return result

@@ -183,12 +200,12 @@ class PrivacyDesensitization:

      def replace_bank_id(self, text, token):
          # Match bank card numbers and replace
-         self.parsed_data = re.sub(r'\b\d{13,19}\b', token, text)
+         self.parsed_data = re.sub(r"\b\d{13,19}\b", token, text)
          return self.parsed_data

      def replace_customer_number(self, text, token):
          # Customer service hotlines are not easy to match and are not considered private data
-         self.parsed_data = re.sub(r'\d+-\d+-\d+', token, text)
+         self.parsed_data = re.sub(r"\d+-\d+-\d+", token, text)
          return self.parsed_data

      def replace_number(self):
@@ -201,7 +218,9 @@ class PrivacyDesensitization:
          # ID card
          self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
          # Bank card
-         self.parsed_data = self.replace_bank_id(self.parsed_data, token="COSCO_NUMBER")
+         self.parsed_data = self.replace_bank_id(
+             self.parsed_data, token="COSCO_NUMBER"
+         )  # nosec B106 - this is a desensitization token, not a password
          # Dash-separated customer service hotlines
          # self.parsed_data = self.replace_customer_number(self.parsed_data, token="COSCO_NUMBER")

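
The bank-card rule is a bare regex: any standalone run of 13 to 19 digits becomes the token, while the commented-out hotline rule would have caught dash-separated numbers. A quick demonstration with a fabricated string:

    import re

    text = 'card 6222020200112233445, hotline 0571-8888-7777'
    print(re.sub(r"\b\d{13,19}\b", 'COSCO_NUMBER', text))
    # -> 'card COSCO_NUMBER, hotline 0571-8888-7777' (hotline untouched)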
@@ -213,8 +232,6 @@ class PrivacyDesensitization:
          self.replace_email()
          self.replace_number()

-         result = {
-             "text": self.parsed_data
-         }
+         result = {"text": self.parsed_data}

          return result