pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.post2.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/WHEEL +0 -0
datamax/utils/data_cleaner.py
@@ -1,237 +1,275 @@
- import os
- import re
- import sys
- from collections import Counter
- from contextlib import contextmanager
-
-
- @contextmanager
- def suppress_stdout():
-     # Save the original standard output stream
-     original_stdout = sys.stdout
-     # Redirect standard output to an empty device ('nul' on Windows, '/dev/null' on Unix/Linux/MacOS)
-     with open(os.devnull, "w") as devnull:
-         sys.stdout = devnull
-         try:
-             yield
-         finally:
-             # Restore the original standard output stream
-             sys.stdout = original_stdout
-
-
- with suppress_stdout():
-     import jionlp as jio
-
-
- class AbnormalCleaner:
-     def __init__(self, parsed_data):
-         self.parsed_data = parsed_data
-
-     # Exception cleaning class
-     def remove_abnormal_chars(self):
-         """Remove abnormal characters from text"""
-         self.parsed_data = jio.remove_exception_char(self.parsed_data)
-         return self.parsed_data
-
-     def remove_html_tags(self):
-         """Remove HTML tags"""
-         self.parsed_data = jio.remove_html_tag(self.parsed_data)
-         return self.parsed_data
-
-     def convert_newlines(self):
-         """Convert \r to \n and multiple \n to a single \n"""
-         self.parsed_data = re.sub(r"\r", "", self.parsed_data)
-         self.parsed_data = re.sub(r"\n+", "\n", self.parsed_data)
-         return self.parsed_data
-
-     def single_space(self):
-         """Convert strings with more than 2 spaces to a single space"""
-         self.parsed_data = re.sub(r" {2,}", " ", self.parsed_data)
-         return self.parsed_data
-
-     def tabs_to_spaces(self):
-         """Convert tab characters to 4 spaces"""
-         self.parsed_data = self.parsed_data.replace("\t", "    ")
-         return self.parsed_data
-
-     def remove_invisible_chars(self):
-         """Remove invisible ASCII characters"""
-         self.parsed_data = re.sub(
-             r"[\x00-\x09\x0b-\x1f\x7f-\xa0]", "", self.parsed_data
-         )
-         return self.parsed_data
-
-     def simplify_chinese(self):
-         """Convert traditional Chinese characters to simplified Chinese"""
-         self.parsed_data = jio.tra2sim(self.parsed_data, mode="word")
-         return self.parsed_data
-
-     def nlp_clean(self):
-         # jio nlp rough text cleaning
-         return jio.clean_text(self.parsed_data)
-
-     def point_conversion(self):
-         """Bullet point conversion"""
-         self.parsed_data = self.parsed_data.replace("\n• ", "\n- ")
-         return self.parsed_data
-
-     def clean_space(self):
-         self.parsed_data = self.parsed_data.replace(" ", "")
-         return self.parsed_data
-
-     def clean_tips(self):
-         self.parsed_data = self.parsed_data.replace(
-             "EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.", ""
-         )
-         return self.parsed_data
-
-     def markdown_format(self):
-         pass
-
-     def no_html_clean(self):
-         """Perform cleaning without executing HTML cleaning"""
-         try:
-             self.convert_newlines()
-             self.single_space()
-             self.tabs_to_spaces()
-             self.simplify_chinese()
-
-             self.remove_invisible_chars()
-             # After cleaning invisible characters, perform another multi-line merge, remove space operation
-             self.convert_newlines()
-
-             result = {"text": self.parsed_data}
-             return result
-
-         except Exception as e:
-             print(f"Error: {e}, line: {e.__traceback__.tb_lineno}")
-             return {}
-
-     def to_clean(self):
-         """Perform all cleaning operations"""
-         try:
-             self.point_conversion()
-             self.remove_html_tags()
-             self.convert_newlines()
-             self.single_space()
-             self.tabs_to_spaces()
-             self.simplify_chinese()
-
-             self.remove_invisible_chars()
-             # After cleaning invisible characters, perform another multi-line merge, remove space operation
-             self.convert_newlines()
-             # self.clean_space()
-             self.clean_tips()
-
-             result = {"text": self.parsed_data}
-             return result
-
-         except Exception as e:
-             print(f"Error: {e}, line: {e.__traceback__.tb_lineno}")
-             return {}
-
-
- class TextFilter:
-     def __init__(self, parsed_data):
-         self.parsed_data = parsed_data
-
-     def filter_by_word_repetition(self, threshold=0.6):
-         """Filter by word repetition rate"""
-         text = self.parsed_data
-         # Each two characters form a word
-         bi_grams = [text[i : i + 2] for i in range(0, len(text) - 1, 2)]
-         word_count = len(bi_grams)
-         if word_count == 0:
-             return False
-
-         word_freq = Counter(bi_grams)
-
-         most_common_word, most_common_count = word_freq.most_common(1)[0]
-         repetition_rate = most_common_count / word_count
-
-         if repetition_rate > threshold:
-             return False
-
-         return True
-
-     def filter_by_char_count(self, min_chars=30, max_chars=500000):
-         """Filter by character count"""
-         char_count = len(self.parsed_data)
-         if char_count < min_chars or char_count > max_chars:
-             return False
-         return True
-
-     def filter_by_numeric_content(self, threshold=0.6):
-         """Filter by numeric content"""
-         text = self.parsed_data
-         total_chars = len(text)
-         numeric_chars = len(re.findall(r"\d", text))
-         if numeric_chars / total_chars > threshold:
-             return False
-         return True
-
-     def to_filter(self):
-         """Perform all filtering operations and filter out texts that do not meet the conditions"""
-         if not self.filter_by_word_repetition():
-             return {}
-         elif not self.filter_by_char_count():
-             return {}
-         elif not self.filter_by_numeric_content():
-             return {}
-         else:
-             result = {"text": self.parsed_data}
-             return result
-
-
- class PrivacyDesensitization:
-     def __init__(self, parsed_data):
-         self.parsed_data = parsed_data
-
-     # Privacy data replacement class
-     def replace_ip(self):
-         # Replace IP addresses
-         self.parsed_data = jio.replace_ip_address(self.parsed_data, "COSCO_IP")
-         return self.parsed_data
-
-     def replace_email(self):
-         # Replace email addresses
-         self.parsed_data = jio.replace_email(self.parsed_data, "COSCO_EMAIL")
-         return self.parsed_data
-
-     def replace_bank_id(self, text, token):
-         # Match bank card numbers and replace
-         self.parsed_data = re.sub(r"\b\d{13,19}\b", token, text)
-         return self.parsed_data
-
-     def replace_customer_number(self, text, token):
-         # Customer service hotlines are not easy to match and are not considered private data
-         self.parsed_data = re.sub(r"\d+-\d+-\d+", token, text)
-         return self.parsed_data
-
-     def replace_number(self):
-         # Replace all types of numeric private data
-
-         # Landline + mobile phone
-         self.parsed_data = jio.replace_phone_number(self.parsed_data, "COSCO_NUMBER")
-         # QQ
-         self.parsed_data = jio.replace_qq(self.parsed_data, "COSCO_NUMBER")
-         # ID card
-         self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
-         # Bank card
-         self.parsed_data = self.replace_bank_id(
-             self.parsed_data, token="COSCO_NUMBER"
-         )  # nosec B106 - this is a data-desensitization token, not a password
-         # Dash-separated customer service hotlines
-         # self.parsed_data = self.replace_customer_number(self.parsed_data, token="COSCO_NUMBER")
-
-         return self.parsed_data
-
-     def to_private(self):
-         """Perform all privacy data replacement operations"""
-         self.replace_ip()
-         self.replace_email()
-         self.replace_number()
-
-         result = {"text": self.parsed_data}
-
-         return result
+ import os
+ import re
+ import sys
+ from collections import Counter
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def suppress_stdout():
+     # Save the original standard output stream
+     original_stdout = sys.stdout
+     # Redirect standard output to an empty device ('nul' on Windows, '/dev/null' on Unix/Linux/MacOS)
+     with open(os.devnull, "w") as devnull:
+         sys.stdout = devnull
+         try:
+             yield
+         finally:
+             # Restore the original standard output stream
+             sys.stdout = original_stdout
+
+
+ with suppress_stdout():
+     import jionlp as jio
+
+
+ class AbnormalCleaner:
+     def __init__(self, parsed_data):
+         self.parsed_data = parsed_data
+
+     def extract_references(self) -> str:
+         """
+         Extract reference entries and assign to self.parsed_data
+         (Original text will be replaced with extracted references, each item on a separate line)
+
+         Returns:
+             str: Extracted reference text (same as self.parsed_data)
+         """
+         patterns = [
+             r'([A-Z][a-z]+(?:, [A-Z](?:\.[a-z]*)?)+(?: et al\.)? \(\d{4}\)[^\n]+)',  # APA format
+             r'(\[\d+\][^\n]+)',  # Numbered references like [1]
+             r'(DOI:\s?\S+|https?://\S+)',  # DOI/URL
+             r'([A-Z][a-z]+, [A-Z]\.?,? & [A-Z][a-z]+, [A-Z]\. \d{4}[^\n]+)'  # Multi-author APA
+         ]
+         references = []
+         for pattern in patterns:
+             try:
+                 references.extend(re.findall(pattern, self.parsed_data))
+             except re.error as e:
+                 print(f"Regex error {pattern}: {e}")
+
+         # Assign extraction results to parsed_data (each item on a separate line)
+         self.parsed_data = "\n".join(list(set(references)))  # Deduplicate and merge into string
+         return self.parsed_data
+
+     # Exception cleaning class
+     def remove_abnormal_chars(self):
+         """Remove abnormal characters from text"""
+         self.parsed_data = jio.remove_exception_char(self.parsed_data)
+         return self.parsed_data
+
+     def remove_html_tags(self):
+         """Remove HTML tags"""
+         self.parsed_data = jio.remove_html_tag(self.parsed_data)
+         return self.parsed_data
+
+     def convert_newlines(self):
+         """Convert \r to \n and multiple \n to a single \n"""
+         self.parsed_data = re.sub(r"\r", "", self.parsed_data)
+         self.parsed_data = re.sub(r"\n+", "\n", self.parsed_data)
+         return self.parsed_data
+
+     def single_space(self):
+         """Convert strings with more than 2 spaces to a single space"""
+         self.parsed_data = re.sub(r" {2,}", " ", self.parsed_data)
+         return self.parsed_data
+
+     def tabs_to_spaces(self):
+         """Convert tab characters to 4 spaces"""
+         self.parsed_data = self.parsed_data.replace("\t", "    ")
+         return self.parsed_data
+
+     def remove_invisible_chars(self):
+         """Remove invisible ASCII characters"""
+         self.parsed_data = re.sub(
+             r"[\x00-\x09\x0b-\x1f\x7f-\xa0]", "", self.parsed_data
+         )
+         return self.parsed_data
+
+     def simplify_chinese(self):
+         """Convert traditional Chinese characters to simplified Chinese"""
+         self.parsed_data = jio.tra2sim(self.parsed_data, mode="word")
+         return self.parsed_data
+
+     def nlp_clean(self):
+         # jio nlp rough text cleaning
+         return jio.clean_text(self.parsed_data)
+
+     def point_conversion(self):
+         """Bullet point conversion"""
+         self.parsed_data = self.parsed_data.replace("\n• ", "\n- ")
+         return self.parsed_data
+
+     def clean_space(self):
+         self.parsed_data = self.parsed_data.replace(" ", "")
+         return self.parsed_data
+
+     def clean_tips(self):
+         self.parsed_data = self.parsed_data.replace(
+             "EvaluationWarning:ThedocumentwascreatedwithSpire.DocforPython.", ""
+         )
+         return self.parsed_data
+
+     def markdown_format(self):
+         pass
+
+     def no_html_clean(self):
+         """Perform cleaning without executing HTML cleaning"""
+         try:
+             self.convert_newlines()
+             self.single_space()
+             self.tabs_to_spaces()
+             self.simplify_chinese()
+
+             self.remove_invisible_chars()
+             # After cleaning invisible characters, perform another multi-line merge, remove space operation
+             self.convert_newlines()
+
+             result = {"text": self.parsed_data}
+             return result
+
+         except Exception as e:
+             print(f"Error: {e}, line: {e.__traceback__.tb_lineno}")
+             return {}
+
+     def to_clean(self):
+         """Perform all cleaning operations"""
+         try:
+             self.point_conversion()
+             self.remove_html_tags()
+             self.convert_newlines()
+             self.single_space()
+             self.tabs_to_spaces()
+             self.simplify_chinese()
+
+             self.remove_invisible_chars()
+             # After cleaning invisible characters, perform another multi-line merge, remove space operation
+             self.convert_newlines()
+             # self.clean_space()
+             self.clean_tips()
+
+             result = {"text": self.parsed_data}
+             return result
+
+         except Exception as e:
+             print(f"Error: {e}, line: {e.__traceback__.tb_lineno}")
+             return {}
+
+
+ class TextFilter:
+     def __init__(self, parsed_data):
+         self.parsed_data = parsed_data
+
+     def filter_by_word_repetition(self, threshold=0.6):
+         """Filter by word repetition rate"""
+         if not isinstance(self.parsed_data, str):
+             return False
+
+         text = str(self.parsed_data)
+         bi_grams = [text[i:i + 2] for i in range(0, len(text) - 1, 2)]
+         word_count = len(bi_grams)
+         if word_count == 0:
+             print("No words found.")
+             return False
+
+         word_freq = Counter(bi_grams)
+         most_common_word, most_common_count = word_freq.most_common(1)[0]
+         repetition_rate = most_common_count / word_count
+         print(f"Word repetition rate: {repetition_rate}")
+
+         return repetition_rate <= threshold
+
+     def filter_by_char_count(self, min_chars=30, max_chars=500000):
+         """Filter by character count"""
+         char_count = len(self.parsed_data)
+         if char_count < min_chars or char_count > max_chars:
+             return False
+         return True
+
+     def filter_by_numeric_content(self, threshold=0.6):
+         """Filter by numeric content"""
+         text = self.parsed_data
+         total_chars = len(text)
+         numeric_chars = len(re.findall(r"\d", text))
+         if numeric_chars / total_chars > threshold:
+             return False
+         return True
+
+     def to_filter(self):
+         """Perform all filtering operations and filter out texts that do not meet the conditions"""
+         if not self.filter_by_word_repetition():
+             return {}
+         elif not self.filter_by_char_count():
+             return {}
+         elif not self.filter_by_numeric_content():
+             return {}
+         else:
+             result = {"text": self.parsed_data}
+             return result
+
+
+ class PrivacyDesensitization:
+     def __init__(self, parsed_data):
+         self.parsed_data = parsed_data
+
+     # Privacy data replacement class
+     def replace_ip(self, token="COSCO_IP"):
+         # Replace IP addresses
+         self.parsed_data = jio.replace_ip_address(self.parsed_data, token)
+         return self.parsed_data
+
+     def replace_email(self, token="COSCO_EMAIL"):
+         # Replace email addresses
+         self.parsed_data = jio.replace_email(self.parsed_data, token)
+         return self.parsed_data
+
+     def replace_customer_number(self, token="COSCO_NUMBER"):
+         # Customer service hotlines are not easy to match and are not considered private data
+         self.parsed_data = re.sub(r"\d+-\d+-\d+", token, self.parsed_data)
+         return self.parsed_data
+
+     def replace_bank_id(self, token="COSCO_NUMBER"):
+         # Match bank card numbers and replace
+         self.parsed_data = re.sub(
+             r"\b\d{13,19}\b", token, self.parsed_data
+         )
+         return self.parsed_data
+
+     def replace_phone_number(self, token="COSCO_NUMBER"):
+         # Match phone numbers and replace
+         self.parsed_data = jio.replace_phone_number(self.parsed_data, token)
+         return self.parsed_data
+
+     def replace_qq(self, token="COSCO_NUMBER"):
+         # Match QQ numbers and replace
+         self.parsed_data = jio.replace_qq(self.parsed_data, token)
+         return self.parsed_data
+
+     def replace_id_card(self, token="COSCO_NUMBER"):
+         # Match ID card numbers and replace
+         self.parsed_data = jio.replace_id_card(self.parsed_data, token)
+         return self.parsed_data
+
+     def replace_number(self):
+         # Replace all types of numeric private data
+
+         # Landline + mobile phone
+         self.parsed_data = jio.replace_phone_number(self.parsed_data, "COSCO_NUMBER")
+         # QQ
+         self.parsed_data = jio.replace_qq(self.parsed_data, "COSCO_NUMBER")
+         # ID card
+         self.parsed_data = jio.replace_id_card(self.parsed_data, "COSCO_NUMBER")
+         # Bank card
+         self.parsed_data = self.replace_bank_id(
+             token="COSCO_NUMBER"
+         )  # nosec B106 - this is a data-desensitization token, not a password
+
+         return self.parsed_data
+
+     def to_private(self):
+         """Perform all privacy data replacement operations"""
+         self.replace_ip()
+         self.replace_email()
+         self.replace_number()
+         result = {"text": self.parsed_data}
+         return result
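
Usage sketch: the diff above reworks the cleaning, filtering, and desensitization classes in data_cleaner.py, and a minimal end-to-end driver might look like the following. This is an illustrative sketch, assuming jionlp is installed and the datamax.utils.data_cleaner import path shown in the file list; the sample text and values are invented, and each stage returns {"text": ...} on success or an empty dict otherwise, as in the code above.

from datamax.utils.data_cleaner import AbnormalCleaner, PrivacyDesensitization, TextFilter

# Hypothetical input, repeated so it clears the 30-character minimum in filter_by_char_count
raw = (
    "Contact us at support@example.com or 010-1234-5678.\r\n"
    "See [1] Smith, J. (2020) A hypothetical study of text cleaning.\n\n"
    "Body text with   extra   spaces and a bullet:\n• item one\n"
) * 3

cleaned = AbnormalCleaner(raw).to_clean()                    # normalize newlines, spaces, tags, vendor tips
filtered = TextFilter(cleaned.get("text", "")).to_filter()   # drop repetitive, too-short/long, or numeric-heavy text
if filtered:
    masked = PrivacyDesensitization(filtered["text"]).to_private()  # mask emails and numbers with COSCO_* tokens
    print(masked["text"])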