pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
- datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +525 -61
- datamax/parser/docx_parser.py +512 -62
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -208
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15.dist-info/METADATA +340 -0
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- pydatamax-0.1.13.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/utils/data_cleaner.py
CHANGED
@@ -1,237 +1,275 @@
+import os
+import re
+import sys
+from collections import Counter
+from contextlib import contextmanager
+
+
+@contextmanager
+def suppress_stdout():
+    # Save the original standard output stream
+    original_stdout = sys.stdout
+    # Redirect standard output to the null device ('nul' on Windows, '/dev/null' on Unix/Linux/macOS)
+    with open(os.devnull, "w") as devnull:
+        sys.stdout = devnull
+        try:
+            yield
+        finally:
+            # Restore the original standard output stream
+            sys.stdout = original_stdout
+
+
+with suppress_stdout():
+    import jionlp as jio
+
+
+class AbnormalCleaner:
+    def __init__(self, parsed_data):
+        self.parsed_data = parsed_data
+
+    def extract_references(self) -> str:
+        """
+        Extract reference entries and assign to self.parsed_data
+        (Original text will be replaced with extracted references, each item on a separate line)
+
+        Returns:
+            str: Extracted reference text (same as self.parsed_data)
+        """
+        patterns = [
+            r'([A-Z][a-z]+(?:, [A-Z](?:\.[a-z]*)?)+(?: et al\.)? \(\d{4}\)[^\n]+)',  # APA format
+            r'(\[\d+\][^\n]+)',  # Numbered references like [1]
+            r'(DOI:\s?\S+|https?://\S+)',  # DOI/URL
+            r'([A-Z][a-z]+, [A-Z]\.?,? & [A-Z][a-z]+, [A-Z]\. \d{4}[^\n]+)'  # Multi-author APA
+        ]
+        references = []
+        for pattern in patterns:
+            try:
+                references.extend(re.findall(pattern, self.parsed_data))
+            except re.error as e:
+                print(f"Regex error {pattern}: {e}")
+
+        # Assign extraction results to parsed_data (each item on a separate line)
+        self.parsed_data = "\n".join(list(set(references)))  # Deduplicate and merge into string
+        return self.parsed_data
+
+    # Exception cleaning class
+    def remove_abnormal_chars(self):
+        """Remove abnormal characters from text"""
+        self.parsed_data = jio.remove_exception_char(self.parsed_data)
+        return self.parsed_data
+
+    def remove_html_tags(self):
+        """Remove HTML tags"""
+        self.parsed_data = jio.remove_html_tag(self.parsed_data)
+        return self.parsed_data
+
+    def convert_newlines(self):
+        """Convert \r to \n and multiple \n to a single \n"""
+        self.parsed_data = re.sub(r"\r", "", self.parsed_data)
+        self.parsed_data = re.sub(r"\n+", "\n", self.parsed_data)
+        return self.parsed_data
+
+    def single_space(self):
+        """Convert strings with more than 2 spaces to a single space"""
+        self.parsed_data = re.sub(r" {2,}", " ", self.parsed_data)
+        return self.parsed_data
+
+    def tabs_to_spaces(self):
+        """Convert tab characters to 4 spaces"""
+        self.parsed_data = self.parsed_data.replace("\t", "    ")
+        return self.parsed_data
+
+    def remove_invisible_chars(self):
+        """Remove invisible ASCII characters"""
+        self.parsed_data = re.sub(
+            r"[\x00-\x09\x0b-\x1f\x7f-\xa0]", "", self.parsed_data
+        )
+        return self.parsed_data
+
+    def simplify_chinese(self):
+        """Convert traditional Chinese characters to simplified Chinese"""
+        self.parsed_data = jio.tra2sim(self.parsed_data, mode="word")
+        return self.parsed_data
+
+    def nlp_clean(self):
+        # jio nlp rough text cleaning
+        return jio.clean_text(self.parsed_data)
+
+    def point_conversion(self):
+        """Bullet point conversion"""
+        self.parsed_data = self.parsed_data.replace("\n• ", "\n- ")
+        return self.parsed_data
+
+    def clean_space(self):
+        self.parsed_data = self.parsed_data.replace(" ", "")
+        return self.parsed_data
+
+    def clean_tips(self):
+        self.parsed_data = self.parsed_data.replace(
+            "EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.", ""
+        )
+        return self.parsed_data
+
+    def markdown_format(self):
+        pass
+
+    def no_html_clean(self):
+        """Perform cleaning without executing HTML cleaning"""
+        try:
+            self.convert_newlines()
+            self.single_space()
+            self.tabs_to_spaces()
+            self.simplify_chinese()
+
+            self.remove_invisible_chars()
+            # After cleaning invisible characters, perform another multi-line merge, remove space operation
+            self.convert_newlines()
+
+            result = {"text": self.parsed_data}
+            return result
+
+        except Exception as e:
+            print(f"Error: {e}, line: {e.__traceback__.tb_lineno}")
+            return {}
+
+    def to_clean(self):
+        """Perform all cleaning operations"""
+        try:
+            self.point_conversion()
+            self.remove_html_tags()
+            self.convert_newlines()
+            self.single_space()
+            self.tabs_to_spaces()
+            self.simplify_chinese()
+
+            self.remove_invisible_chars()
+            # After cleaning invisible characters, perform another multi-line merge, remove space operation
+            self.convert_newlines()
+            # self.clean_space()
+            self.clean_tips()
+
+            result = {"text": self.parsed_data}
+            return result
+
+        except Exception as e:
+            print(f"Error: {e}, line: {e.__traceback__.tb_lineno}")
+            return {}
+
+
+class TextFilter:
+    def __init__(self, parsed_data):
+        self.parsed_data = parsed_data
+
+    def filter_by_word_repetition(self, threshold=0.6):
+        """Filter by word repetition rate"""
+        if not isinstance(self.parsed_data, str):
+            return False
+
+        text = str(self.parsed_data)
+        bi_grams = [text[i:i+2] for i in range(0, len(text)-1, 2)]
+        word_count = len(bi_grams)
+        if word_count == 0:
+            print("No words found.")
+            return False
+
+        word_freq = Counter(bi_grams)
+        most_common_word, most_common_count = word_freq.most_common(1)[0]
+        repetition_rate = most_common_count / word_count
+        print(f"Word repetition rate: {repetition_rate}")
+
+        return repetition_rate <= threshold
+
+    def filter_by_char_count(self, min_chars=30, max_chars=500000):
+        """Filter by character count"""
+        char_count = len(self.parsed_data)
+        if char_count < min_chars or char_count > max_chars:
+            return False
+        return True
+
+    def filter_by_numeric_content(self, threshold=0.6):
+        """Filter by numeric content"""
+        text = self.parsed_data
+        total_chars = len(text)
+        numeric_chars = len(re.findall(r"\d", text))
+        if numeric_chars / total_chars > threshold:
+            return False
+        return True
+
+    def to_filter(self):
+        """Perform all filtering operations and filter out texts that do not meet the conditions"""
+        if not self.filter_by_word_repetition():
+            return {}
+        elif not self.filter_by_char_count():
+            return {}
+        elif not self.filter_by_numeric_content():
+            return {}
+        else:
+            result = {"text": self.parsed_data}
+            return result
+
+
+class PrivacyDesensitization:
+    def __init__(self, parsed_data):
+        self.parsed_data = parsed_data
+
+    # Privacy data replacement class
+    def replace_ip(self, token="COSCO_IP"):
+        # Replace IP addresses
+        self.parsed_data = jio.replace_ip_address(self.parsed_data, token)
+        return self.parsed_data
+
+    def replace_email(self, token="COSCO_EMAIL"):
+        # Replace email addresses
+        self.parsed_data = jio.replace_email(self.parsed_data, token)
+        return self.parsed_data
+
+    def replace_customer_number(self, token="COSCO_NUMBER"):
+        # Customer service hotlines are not easy to match and are not considered private data
+        self.parsed_data = re.sub(r"\d+-\d+-\d+", token, self.parsed_data)
+        return self.parsed_data
+
+    def replace_bank_id(self, token="COSCO_NUMBER"):
+        # Match bank card numbers and replace
+        self.parsed_data = self.replace_bank_id(
+            self.parsed_data, token=token
+        )
+        return self.parsed_data
+
+    def replace_phone_number(self, token="COSCO_NUMBER"):
+        # Match phone numbers and replace
+        self.parsed_data = jio.replace_phone_number(self.parsed_data, token)
+        return self.parsed_data
+
+    def replace_qq(self, token="COSCO_NUMBER"):
+        # Match QQ numbers and replace
+        self.parsed_data = jio.replace_qq(self.parsed_data, token)
+        return self.parsed_data
+
+    def replace_id_card(self, token="COSCO_NUMBER"):
+        # Match ID card numbers and replace
+        self.parsed_data = jio.replace_id_card(self.parsed_data, token)
+        return self.parsed_data
+
+    def replace_number(self):
+        # Replace all types of numeric private data
+
+        # Landline + mobile phone
+        self.parsed_data = jio.replace_phone_number(self.parsed_data, "COSCO_NUMBER")
+        # QQ
+        self.parsed_data = jio.replace_qq(self.parsed_data, "COSCO_NUMBER")
+        # ID card
+        self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
+        # Bank card
+        self.parsed_data = self.replace_bank_id(
+            self.parsed_data, token="COSCO_NUMBER"
+        )  # nosec B106 - data-masking token, not a password
+
+        return self.parsed_data
+
+    def to_private(self):
+        """Perform all privacy data replacement operations"""
+        self.replace_ip()
+        self.replace_email()
+        self.replace_number()
+        result = {"text": self.parsed_data}
+        return result
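
For orientation, here is a minimal usage sketch of the cleaner and filter classes in the new data_cleaner.py. It assumes jionlp is installed; the class and method names come from the file shown above, while the sample text and variable names are illustrative, not an official example from the package.

```python
# Minimal sketch: run the cleaning and filtering classes from data_cleaner.py on a string.
# Assumes `jionlp` is importable; the input text below is made up for illustration.
from datamax.utils.data_cleaner import AbnormalCleaner, TextFilter

raw = "Contact:  support@example.com\r\n\r\nSome\ttext   with   irregular   spacing."

cleaned = AbnormalCleaner(raw).to_clean()            # {"text": "..."} on success, {} on error
if cleaned:
    kept = TextFilter(cleaned["text"]).to_filter()   # {} if any filter rejects the text
    if kept:
        print(kept["text"])
```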