pydatamax 0.1.5__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/OssHandler.py +85 -51
- datamax/parser/__init__.py +1 -1
- datamax/parser/base.py +2 -2
- datamax/parser/core.py +205 -31
- datamax/parser/doc_parser.py +2 -5
- datamax/parser/docx_parser.py +3 -6
- datamax/parser/epub_parser.py +2 -5
- datamax/parser/html_parser.py +2 -5
- datamax/parser/image_parser.py +18 -14
- datamax/parser/md_parser.py +67 -4
- datamax/parser/pdf_parser.py +59 -20
- datamax/parser/ppt_parser.py +3 -5
- datamax/parser/pptx_parser.py +10 -13
- datamax/parser/txt_parser.py +2 -5
- datamax/parser/xls_parser.py +26 -0
- datamax/parser/xlsx_parser.py +65 -4
- datamax/utils/__init__.py +1 -0
- datamax/utils/constants.py +58 -0
- datamax/utils/data_cleaner.py +45 -28
- datamax/utils/env_setup.py +80 -0
- datamax/utils/gotocr_pdf.py +265 -0
- datamax/utils/mineru_operator.py +62 -0
- datamax/utils/paddleocr_pdf_operator.py +2 -1
- datamax/utils/qa_generator.py +376 -0
- datamax/utils/tokenizer.py +1 -1
- pydatamax-0.1.11.dist-info/METADATA +271 -0
- pydatamax-0.1.11.dist-info/RECORD +39 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info/licenses}/LICENSE +0 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.11.dist-info}/top_level.txt +1 -0
- tests/__init__.py +0 -0
- tests/test_basic.py +20 -0
- pydatamax-0.1.5.dist-info/METADATA +0 -282
- pydatamax-0.1.5.dist-info/RECORD +0 -31
datamax/parser/image_parser.py
CHANGED
```diff
@@ -1,30 +1,34 @@
+import os
 import pathlib
 import sys
-from
+from datamax.utils import setup_environment
+
+setup_environment(use_gpu=True)
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
 from datamax.parser.base import MarkdownOutputVo
+
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
 sys.path.insert(0, str(ROOT_DIR))
 from datamax.parser.base import BaseLife
-
+from datamax.parser.pdf_parser import PdfParser
+from PIL import Image
 
 class ImageParser(BaseLife):
-    def __init__(self,
+    def __init__(self,file_path: str):
         super().__init__()
         self.file_path = file_path
 
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
             title = self.get_file_extension(file_path)
-
-
-
-
-
-
-
-
-            output_vo
-            return output_vo.to_dict()
+            output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
+            image = Image.open(file_path)
+            image.save(output_pdf_path, 'PDF', resolution=100.0)
+            pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
+            output_vo = pdf_parser.parse(output_pdf_path)
+            if os.path.exists(output_pdf_path):
+                # shutil.rmtree(f'uploaded_files/markdown')
+                os.remove(output_pdf_path)
+            return output_vo
         except Exception as e:
             raise e
-
```
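The rewritten ImageParser routes images through OCR by saving them as a one-page PDF and delegating to the MinerU-backed PdfParser; note that importing the module now calls setup_environment(use_gpu=True) as a side effect. A minimal usage sketch, assuming a working MinerU/magic-pdf install (the file name is hypothetical):

```python
# Hypothetical usage sketch of the new image -> PDF -> MinerU pipeline.
from datamax.parser.image_parser import ImageParser  # import triggers setup_environment(use_gpu=True)

parser = ImageParser(file_path="scanned_page.png")   # hypothetical input file
result = parser.parse("scanned_page.png")            # dict produced by PdfParser.parse()
```

One caveat worth flagging: os.path.basename(file_path).strip(title) uses str.strip, which removes a character set rather than a suffix, so if get_file_extension returns the literal extension, the temporary PDF name can lose extra leading and trailing characters that happen to occur in it.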
datamax/parser/md_parser.py
CHANGED
```diff
@@ -1,10 +1,73 @@
+import pathlib
+import sys
+from typing import Union
+
+ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+sys.path.insert(0, str(ROOT_DIR))
+from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
+from loguru import logger
 
 
-class
+class MarkdownParser(BaseLife):
+    """
+    Parser for Markdown files that follows the same pattern as PdfParser.
+    Handles .md and .markdown file extensions.
+    """
 
-    def __init__(self,
+    def __init__(self,
+                 file_path: Union[str, list],
+                 ):
+        super().__init__()
         self.file_path = file_path
 
-
-
+    @staticmethod
+    def read_markdown_file(file_path: str) -> str:
+        """
+        Reads the content of a markdown file.
+
+        Args:
+            file_path: Path to the markdown file
+
+        Returns:
+            str: Content of the markdown file
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        except Exception as e:
+            logger.error(f"Error reading markdown file {file_path}: {e}")
+            raise
+
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        """
+        Parses a markdown file and returns a MarkdownOutputVo.
+
+        Args:
+            file_path: Path to the markdown file
+
+        Returns:
+            MarkdownOutputVo: Structured output containing the markdown content
+        """
+        try:
+            title = self.get_file_extension(file_path)
+
+            # Read markdown content
+            md_content = self.read_markdown_file(file_path)
+
+            # Generate lifecycle metadata
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN"
+            )
+
+            # Create and return output VO
+            output_vo = MarkdownOutputVo(title, md_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+
+        except Exception as e:
+            logger.error(f"Failed to parse markdown file {file_path}: {e}")
+            raise
```
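The new MarkdownParser is a read-through parser: no cleaning or token counting, just file content plus lifecycle metadata. A sketch of how it would be called (the file name is hypothetical):

```python
# Minimal sketch, assuming the package layout shown in this diff.
from datamax.parser.md_parser import MarkdownParser

parser = MarkdownParser(file_path="README.md")  # hypothetical input
result = parser.parse("README.md")              # returns MarkdownOutputVo.to_dict()
# result carries the title (here the file extension), the raw markdown text,
# and one lifecycle entry with domain/usage_purpose/life_type metadata.
```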
datamax/parser/pdf_parser.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import os
 import pathlib
 import sys
-import
+import subprocess
 from typing import Union
 
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
@@ -9,18 +9,51 @@ sys.path.insert(0, str(ROOT_DIR))
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
 from langchain_community.document_loaders import PyMuPDFLoader
-from
-from datamax.utils.
+from loguru import logger
+from datamax.utils.mineru_operator import pdf_processor
 
 
 class PdfParser(BaseLife):
 
-    def __init__(self,
+    def __init__(self,
+                 file_path: Union[str, list],
+                 use_mineru: bool = False,
+                 ):
         super().__init__()
+
         self.file_path = file_path
-        self.
-
-
+        self.use_mineru = use_mineru
+
+    def mineru_process(self, input_pdf_filename, output_dir):
+        proc = None
+        try:
+            logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
+            command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
+            proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            # Wait for the command to finish
+            stdout, stderr = proc.communicate()
+            # Check whether the command succeeded
+            if proc.returncode != 0:
+                raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
+
+            logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
+
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            if proc is not None:
+                proc.kill()
+                proc.wait()
+                logger.info("The process was terminated due to an error.")
+            raise  # Re-raise the exception to let the caller handle it
+
+        finally:
+            # Make sure the child process has exited
+            if proc is not None:
+                if proc.poll() is None:
+                    proc.kill()
+                    proc.wait()
+                    logger.info("The process was terminated due to timeout or completion.")
 
     @staticmethod
     def read_pdf_file(file_path) -> str:
@@ -37,23 +70,29 @@ class PdfParser(BaseLife):
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
             title = self.get_file_extension(file_path)
-
-
-
-
+
+            if self.use_mineru:
+                output_dir = 'uploaded_files'
+                output_folder_name = os.path.basename(file_path).replace(".pdf", "")
+                # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
+                # if os.path.exists(output_mineru):
+                #     pass
+                # else:
+                #     self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
+                # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
+
+                # todo: is it necessary to keep this consistent with the API's default save path?
+                output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
+
+                if os.path.exists(output_mineru):
+                    mk_content = open(output_mineru, 'r', encoding='utf-8').read()
                 else:
-
-                output_md_dir = f'./output/{os.path.basename(file_path).replace(".pdf", "_ocr.md")}'
-                docx2markdown.docx_to_markdown(output_docx_dir, output_md_dir)
-                mk_content = open(output_md_dir, 'r', encoding='utf-8').read()
-                token_count = self.tk_client.get_tokenizer(content=mk_content)
+                    mk_content = pdf_processor.process_pdf(file_path)
             else:
                 content = self.read_pdf_file(file_path=file_path)
-
-                mk_content = clean_text
-                token_count = self.tk_client.get_tokenizer(content=mk_content.get('text', ''))
+                mk_content = content
 
-            lifecycle = self.generate_lifecycle(source_file=file_path,
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
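parse() now has three outcomes: with use_mineru=True it reuses uploaded_files/markdown/&lt;name&gt;.md when a cached copy exists, otherwise it calls pdf_processor.process_pdf; with use_mineru=False it returns the PyMuPDF text unmodified (the old cleaning and token counting are gone). A sketch of both paths, assuming the magic-pdf toolchain is installed for the MinerU branch (the file name is hypothetical):

```python
# Minimal sketch of the two parsing paths introduced here.
from datamax.parser.pdf_parser import PdfParser

# Plain text extraction via PyMuPDF:
plain = PdfParser("paper.pdf", use_mineru=False).parse("paper.pdf")

# Layout-aware markdown: cached MinerU output if present,
# else pdf_processor.process_pdf() from datamax.utils.mineru_operator:
ocr = PdfParser("paper.pdf", use_mineru=True).parse("paper.pdf")
```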
datamax/parser/ppt_parser.py
CHANGED
```diff
@@ -7,7 +7,6 @@ from pathlib import Path
 from typing import Union
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text
 from datamax.utils.ppt_extract import PPtExtractor
 
 
@@ -64,10 +63,9 @@ class PPtParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
-            clean_text = clean_original_text(content)
-            mk_content =
-
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            # clean_text = clean_original_text(content)
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
datamax/parser/pptx_parser.py
CHANGED
```diff
@@ -3,7 +3,6 @@ from typing import Union
 from pptx import Presentation
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text
 
 
 class PPtxParser(BaseLife):
@@ -20,14 +19,14 @@ class PPtxParser(BaseLife):
                 for shape in slide.shapes:
                     if shape.has_text_frame:
                         content += shape.text + '\n'
-                    if shape.shape_type == 13:
-
-
-
-
-
-
-
+                    # if shape.shape_type == 13:
+                    #     if not os.path.exists("extracted_images"):
+                    #         os.makedirs("extracted_images")
+                    #     image = shape.image
+                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
+                    #     with open(image_filename, 'wb') as img_file:
+                    #         img_file.write(image.blob)
+                    #     content += ('[' + image_filename + ']')
             return content
         except Exception:
             raise
@@ -36,10 +35,8 @@ class PPtxParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
datamax/parser/txt_parser.py
CHANGED
```diff
@@ -2,7 +2,6 @@ import chardet
 from typing import Union
 from datamax.parser.base import BaseLife
 from datamax.parser.base import MarkdownOutputVo
-from datamax.utils import clean_original_text
 
 
 class TxtParser(BaseLife):
@@ -37,10 +36,8 @@ class TxtParser(BaseLife):
         try:
             title = self.get_file_extension(file_path)
             content = self.read_txt_file(file_path=file_path)  # the real data is loaded via load
-
-
-            token_count = self.tk_client.get_tokenizer(content=mk_content)
-            lifecycle = self.generate_lifecycle(source_file=file_path, token_count=token_count, domain="Technology",
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
                                                 usage_purpose="Documentation", life_type="LLM_ORIGIN")
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
```
datamax/parser/xls_parser.py
ADDED
```diff
@@ -0,0 +1,26 @@
+from datamax.parser.base import MarkdownOutputVo
+from datamax.parser.base import BaseLife
+import pandas as pd
+import warnings
+
+warnings.filterwarnings("ignore")
+
+
+class XlsParser(BaseLife):
+    """xlsx or xls table use markitdown from Microsoft so magic for table!"""
+
+    def __init__(self, file_path):
+        super().__init__()
+        self.file_path = file_path
+
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            df = pd.read_excel(file_path)
+            mk_content = df.to_markdown(index=False)
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
+            raise e
```
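Despite the docstring's mention of markitdown, XlsParser is a thin pandas wrapper: the conversion is DataFrame.to_markdown, which relies on the tabulate package. A self-contained look at the markdown it produces; the data is invented for illustration:

```python
import pandas as pd

df = pd.DataFrame({"port": ["Shanghai", "Ningbo"], "teu": [47, 33]})
print(df.to_markdown(index=False))
# Prints a pipe table along these lines:
# | port     |   teu |
# |:---------|------:|
# | Shanghai |    47 |
# | Ningbo   |    33 |
```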
datamax/parser/xlsx_parser.py
CHANGED
```diff
@@ -1,10 +1,71 @@
+import multiprocessing
+import time
+from multiprocessing import Queue
 from datamax.parser.base import MarkdownOutputVo
+from datamax.parser.base import BaseLife
+from openpyxl import load_workbook
+import warnings
+from markitdown import MarkItDown
 
+warnings.filterwarnings("ignore")
 
-class
+class XlsxParser(BaseLife):
+    # singleton
 
-
+    _markitdown_instance = None
+
+    @classmethod
+    def get_markitdown(cls):
+        if cls._markitdown_instance is None:
+            cls._markitdown_instance = MarkItDown()
+        return cls._markitdown_instance
+
+    def __init__(self, file_path, timeout):
+        super().__init__()
         self.file_path = file_path
+        self.timeout = timeout
+        self.markitdown = self.get_markitdown()
+
+    def _parse(self, file_path: str, result_queue: Queue) -> dict:
+        try:
+            wb = load_workbook(
+                filename=file_path,
+                data_only=True,
+                read_only=True
+            )
+            wb.close()
+        except Exception as e:
+            raise e
+
+        mk_content = self.markitdown.convert(file_path).text_content
+        lifecycle = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type="LLM_ORIGIN"
+        )
+        output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+        output_vo.add_lifecycle(lifecycle)
+        result_queue.put(output_vo.to_dict())
+        time.sleep(0.5)
+        return output_vo.to_dict()
+
+    def parse(self, file_path: str) -> dict:
+        import time
+        result_queue = Queue()
+        process = multiprocessing.Process(target=self._parse, args=(file_path, result_queue))
+        process.start()
+        start_time = time.time()
 
-
-
+        # ttl
+        while time.time() - start_time < self.timeout:
+            print(f"plz waiting...: {int(time.time() - start_time)}")
+            if not process.is_alive():
+                break
+            if not result_queue.empty():
+                return result_queue.get()
+            time.sleep(1)
+        else:
+            # killed
+            process.terminate()
+            process.join()
```
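The interesting change here is the timeout scheme: _parse runs in a child process, posts its result through a multiprocessing.Queue, and parse polls until the deadline, terminating the child if it runs over (and implicitly returning None in that case). A stripped-down sketch of the same pattern:

```python
# Minimal sketch of XlsxParser's run-with-timeout pattern.
import multiprocessing
import time
from multiprocessing import Queue


def work(q):
    time.sleep(2)              # stand-in for the MarkItDown conversion
    q.put({"text": "done"})    # hand the result back to the parent


if __name__ == "__main__":
    q = Queue()
    proc = multiprocessing.Process(target=work, args=(q,))
    proc.start()
    deadline = time.time() + 5  # like XlsxParser's timeout argument
    result = None
    while time.time() < deadline:
        if not q.empty():       # check the queue before liveness, so a result
            result = q.get()    # posted just before exit is not dropped
            break
        if not proc.is_alive():
            break
        time.sleep(0.1)
    else:                       # deadline hit without a break: kill the child
        proc.terminate()
    proc.join()
    print(result)
```

Checking the queue before is_alive sidesteps the race that the time.sleep(0.5) at the end of _parse papers over: without it, a child that posts its result and exits between polls trips the is_alive break before the queue is ever read.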
datamax/utils/constants.py
ADDED
```diff
@@ -0,0 +1,58 @@
+def get_system_prompt(knowledge):
+    system_prompt = f"""
+    You are a precise and efficient assistant for generating shipping QA pairs. Your job is to generate, from the shipping knowledge the user provides, a question and an answer (a QA pair) for each knowledge point, so that the knowledge point is properly tested.
+    You must follow the annotation requirements and the notes exactly. Please read this shipping knowledge in full: {knowledge}
+
+    Objective:
+    Your goal is to generate correct and precise QA pairs from the given material, making sure to include every correct option mentioned in the original text and to spell all technical terms correctly.
+
+    Style:
+    Answer in the style of an official shipping-domain expert.
+
+    Tone:
+    Your tone should be formal.
+
+    Audience:
+    Your audience is data annotators, who will revise their annotations based on your output.
+
+    Response:
+    Return your response as JSON in the following format:
+    ```json
+    {{
+        "instruction": "<generated question>",
+        "input": "",
+        "output": "<answer generated from the knowledge>"
+    }}
+
+    # Annotation requirements
+    1. For nouns that could be ambiguous, add the full qualifier needed to remove the ambiguity.
+        a. "Technical Guidelines for Intelligent Surface Search-and-Rescue Robots"
+            i. The continuous operating time of the search-and-rescue robot must be no less than 30 minutes. -> wrong
+            ii. The continuous operating time of the intelligent surface search-and-rescue robot must be no less than 30 minutes. -> correct
+    2. The answer of a QA pair must be at least 50 characters long; only this lower bound applies, with no upper limit.
+    3. You may cite the chapter number of a rule or regulation, but you must also quote the relevant original text of that chapter.
+        a. The CLEAN notation may continue to be maintained per the requirements of Part 8, Chapter 8 of the originally applicable CCS "Rules for Classification of Sea-going Steel Ships". X
+        b. The CLEAN notation may continue to be maintained per the requirements of Part 8, Chapter 8 [original text] of the originally applicable CCS "Rules for Classification of Sea-going Steel Ships". √
+    4. Use standard markdown to express multi-level structure, e.g. "xxx\n1.1. xxx\n1.1.1. xxx".
+    5. Do not use numbering marks such as (1), 一、①、Ⅰ、壹, so the format stays uniform.
+    6. For technical terms or keywords that need emphasis, use ** ** instead of single or double quotes.
+    7. If the data contains obvious errors, such as grammatical or logical errors, remove them yourself.
+    8. Format the data as markdown and keep the \n line-break markers.
+    9. Numbers with a definite meaning must be 100% accurate.
+    10. The answer of a QA pair must not be a bare option reference such as ① or A.
+    11. The question of a QA pair must not take forms such as "which of the following is wrong" or "which of these is correct".
+    12. The text must be professional; avoid redundant connectives or adverbs such as "because", "therefore", "besides", "first of all".
+    13. Long passages with parallel logic need \n line breaks and ordered-list markers 1. 2. 3.
+    14. Long passages with hierarchical logic must have the hierarchy rearranged sensibly; do not pile up "Chapter 1, Section 2, 1.1.2 xxxxx"-style content as-is, but rewrite the levels into fluent, well-ordered sentences.
+
+    # Notes
+    1. Pick the five most valuable knowledge points and return a single JSON list.
+    2. Keep the format of every JSON object consistent, and make each generated output answer at least 50 characters,
+       expanding professionally and on-topic from the complete shipping knowledge.
+    3. All technical terms must be spelled exactly right.
+    4. The question of each QA pair should target the key point of the provided knowledge.
+    5. Your only source of information is the provided shipping knowledge.
+    6. The JSON list must contain no more than 5 items.
+    7. Return only the result, with no extra markdown format markers such as ```python ```json
+    """
+    return system_prompt
```
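get_system_prompt bakes the source text into a QA-generation template (translated above from the original Chinese) that demands a bare JSON list of at most five QA pairs. A hedged sketch of a consumer follows; the chat-model call is a placeholder, not the API that the new qa_generator.py (not shown in this diff view) actually uses, and the import path assumes the function lands in datamax/utils/constants.py as the file summary indicates:

```python
import json

from datamax.utils.constants import get_system_prompt  # assumed location

knowledge = "Excerpt of a shipping regulation..."       # placeholder input
system_prompt = get_system_prompt(knowledge)

# response_text = chat_model(system=system_prompt, ...) # placeholder call
response_text = '[{"instruction": "...", "input": "", "output": "..."}]'
qa_pairs = json.loads(response_text)  # the prompt requires a bare JSON list, <= 5 items
```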
datamax/utils/data_cleaner.py
CHANGED
```diff
@@ -1,6 +1,26 @@
+import os
 import re
-import
+import sys
 from collections import Counter
+from contextlib import contextmanager
+
+
+@contextmanager
+def suppress_stdout():
+    # Save the original standard output stream
+    original_stdout = sys.stdout
+    # Redirect standard output to an empty device ('nul' on Windows, '/dev/null' on Unix/Linux/MacOS)
+    with open(os.devnull, "w") as devnull:
+        sys.stdout = devnull
+        try:
+            yield
+        finally:
+            # Restore the original standard output stream
+            sys.stdout = original_stdout
+
+
+with suppress_stdout():
+    import jionlp as jio
 
 
 class AbnormalCleaner:
@@ -20,23 +40,25 @@ class AbnormalCleaner:
 
     def convert_newlines(self):
         """Convert \r to \n and multiple \n to a single \n"""
-        self.parsed_data = re.sub(r
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r"\r", "", self.parsed_data)
+        self.parsed_data = re.sub(r"\n+", "\n", self.parsed_data)
         return self.parsed_data
 
     def single_space(self):
         """Convert strings with more than 2 spaces to a single space"""
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r" {2,}", " ", self.parsed_data)
         return self.parsed_data
 
     def tabs_to_spaces(self):
         """Convert tab characters to 4 spaces"""
-        self.parsed_data = self.parsed_data.replace(
+        self.parsed_data = self.parsed_data.replace("\t", "    ")
         return self.parsed_data
 
     def remove_invisible_chars(self):
         """Remove invisible ASCII characters"""
-        self.parsed_data = re.sub(
+        self.parsed_data = re.sub(
+            r"[\x00-\x09\x0b-\x1f\x7f-\xa0]", "", self.parsed_data
+        )
         return self.parsed_data
 
     def simplify_chinese(self):
@@ -50,7 +72,7 @@ class AbnormalCleaner:
 
     def point_conversion(self):
         """Bullet point conversion"""
-        self.parsed_data = self.parsed_data.replace(
+        self.parsed_data = self.parsed_data.replace("\n• ", "\n- ")
         return self.parsed_data
 
     def clean_space(self):
@@ -58,8 +80,9 @@ class AbnormalCleaner:
         return self.parsed_data
 
     def clean_tips(self):
-        self.parsed_data = self.parsed_data.replace(
-
+        self.parsed_data = self.parsed_data.replace(
+            "EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.", ""
+        )
         return self.parsed_data
 
     def markdown_format(self):
@@ -77,9 +100,7 @@ class AbnormalCleaner:
             # After cleaning invisible characters, perform another multi-line merge, remove space operation
             self.convert_newlines()
 
-            result = {
-                "text": self.parsed_data
-            }
+            result = {"text": self.parsed_data}
             return result
 
         except Exception as e:
@@ -99,12 +120,10 @@ class AbnormalCleaner:
             self.remove_invisible_chars()
             # After cleaning invisible characters, perform another multi-line merge, remove space operation
             self.convert_newlines()
-            self.clean_space()
+            # self.clean_space()
            self.clean_tips()
 
-            result = {
-                "text": self.parsed_data
-            }
+            result = {"text": self.parsed_data}
            return result
 
         except Exception as e:
@@ -114,13 +133,13 @@ class AbnormalCleaner:
 
 class TextFilter:
     def __init__(self, parsed_data):
-        self.parsed_data = parsed_data
+        self.parsed_data = parsed_data
 
     def filter_by_word_repetition(self, threshold=0.6):
         """Filter by word repetition rate"""
         text = self.parsed_data
         # Each two characters form a word
-        bi_grams = [text[i:i + 2] for i in range(0, len(text) - 1, 2)]
+        bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
         word_count = len(bi_grams)
         if word_count == 0:
             return False
@@ -146,7 +165,7 @@ class TextFilter:
         """Filter by numeric content"""
         text = self.parsed_data
         total_chars = len(text)
-        numeric_chars = len(re.findall(r
+        numeric_chars = len(re.findall(r"\d", text))
         if numeric_chars / total_chars > threshold:
             return False
         return True
@@ -160,9 +179,7 @@ class TextFilter:
         elif not self.filter_by_numeric_content():
             return {}
         else:
-            result = {
-                "text": self.parsed_data
-            }
+            result = {"text": self.parsed_data}
             return result
 
 
@@ -183,12 +200,12 @@ class PrivacyDesensitization:
 
     def replace_bank_id(self, text, token):
         # Match bank card numbers and replace
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r"\b\d{13,19}\b", token, text)
         return self.parsed_data
 
     def replace_customer_number(self, text, token):
         # Customer service hotlines are not easy to match and are not considered private data
-        self.parsed_data = re.sub(r
+        self.parsed_data = re.sub(r"\d+-\d+-\d+", token, text)
         return self.parsed_data
 
     def replace_number(self):
@@ -201,7 +218,9 @@ class PrivacyDesensitization:
         # ID card
         self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
         # Bank card
-        self.parsed_data = self.replace_bank_id(
+        self.parsed_data = self.replace_bank_id(
+            self.parsed_data, token="COSCO_NUMBER"
+        )  # nosec B106 - this is a data-desensitization token, not a password
         # Dash-separated customer service hotlines
         # self.parsed_data = self.replace_customer_number(self.parsed_data, token="COSCO_NUMBER")
 
@@ -213,8 +232,6 @@ class PrivacyDesensitization:
         self.replace_email()
         self.replace_number()
 
-        result = {
-            "text": self.parsed_data
-        }
+        result = {"text": self.parsed_data}
 
         return result
```
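The headline addition is suppress_stdout, used above to swallow jionlp's import-time banner; the rest of the diff reconstructs the truncated regex calls and reformats the module black-style. The context manager is reusable for any chatty import:

```python
# The new context manager, exercised directly; it only redirects Python-level
# sys.stdout, so stderr and output from C extensions still get through.
from datamax.utils.data_cleaner import suppress_stdout

with suppress_stdout():
    print("swallowed: sys.stdout points at os.devnull here")
print("visible again: the original stdout is restored on exit")
```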