deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
# you may not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
#
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
|
|
20
|
+
from markdown import markdown
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RAGFlowMarkdownParser:
    """Separate the tables found in Markdown text from the surrounding text.

    Three table flavours are recognised: pipe tables with leading/trailing
    borders, pipe tables without borders, and raw HTML ``<table>`` blocks
    (optionally wrapped in ``<body>`` / ``<html>``).
    """

    # Tags whose attributes are stripped before HTML tables are searched for,
    # e.g. ``<table border="1">`` becomes ``<table>``.
    _TABLE_TAGS = ("table", "td", "tr", "th", "tbody", "thead", "div")

    # ``\b`` keeps the match on the exact tag names above, so unrelated tags
    # that merely share a prefix (``<track ...>``, ``<divider ...>``) keep
    # their attributes.
    _TAG_WITH_ATTRIBUTES_RE = re.compile(
        r"<(" + "|".join(_TABLE_TAGS) + r")\b[^>]*>",
        re.IGNORECASE,
    )

    # Standard Markdown table: header row, separator row, one or more body
    # rows. Every row must end with a newline to be matched.
    _BORDER_TABLE_RE = re.compile(
        r"""
        (?:\n|^)
        (?:\|.*?\|.*?\|.*?\n)
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
        (?:\|.*?\|.*?\|.*?\n)+
        """,
        re.VERBOSE,
    )

    # Borderless Markdown table: rows start with a non-space character and
    # contain at least one pipe.
    _NO_BORDER_TABLE_RE = re.compile(
        r"""
        (?:\n|^)
        (?:\S.*?\|.*?\n)
        (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
        (?:\S.*?\|.*?\n)+
        """,
        re.VERBOSE,
    )

    # HTML table extraction - handle possible html/body wrapper tags
    _HTML_TABLE_RE = re.compile(
        r"""
        (?:\n|^)
        \s*
        (?:
            # case1: <html><body><table>...</table></body></html>
            (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
            |
            # case2: <body><table>...</table></body>
            (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
            |
            # case3: only<table>...</table>
            (?:<table[^>]*>.*?</table>)
        )
        \s*
        (?=\n|$)
        """,
        re.VERBOSE | re.DOTALL | re.IGNORECASE,
    )

    def __init__(self, chunk_token_num=128):
        """Store the chunk size.

        Args:
            chunk_token_num: Chunk size in tokens; coerced with ``int()``.
                It is only stored here, not used by this class's methods.
        """
        self.chunk_token_num = int(chunk_token_num)

    @staticmethod
    def _pull_tables(text, pattern, tables, separate_tables, render):
        """Collect every match of ``pattern`` in ``text`` into ``tables``.

        Args:
            text: Text to scan.
            pattern: Compiled regex describing one table.
            tables: List that receives the raw matched table strings
                (mutated in place).
            separate_tables: When true the table is removed from the text;
                otherwise it is kept in place.
            render: Only relevant when the table is kept: render it to HTML
                with the Markdown ``tables`` extension instead of keeping
                the raw match.

        Returns:
            The rewritten text. Each table position is followed by a blank
            line (``"\\n\\n"``), whether the table was removed or kept.
        """
        pieces = []
        last_end = 0
        for match in pattern.finditer(text):
            raw_table = match.group()
            tables.append(raw_table)
            pieces.append(text[last_end : match.start()])
            if not separate_tables:
                # Keep the table in place, rendered to HTML when requested.
                pieces.append(markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table)
            pieces.append("\n\n")
            last_end = match.end()
        pieces.append(text[last_end:])
        # Joining once avoids quadratic string concatenation on large inputs.
        return "".join(pieces)

    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Extract tables from ``markdown_text``.

        Args:
            markdown_text: The Markdown source.
            separate_tables: If true (default) tables are cut out of the
                returned text. If false they stay in the text: Markdown
                tables are replaced by rendered HTML, HTML tables are kept
                as matched.

        Returns:
            A ``(remaining_text, tables)`` tuple, where ``tables`` holds the
            raw matched table strings in discovery order (bordered Markdown
            tables, then borderless ones, then HTML tables).
        """
        tables = []
        working_text = markdown_text

        if "|" in working_text:  # cheap guard: no pipe means no Markdown table
            working_text = self._pull_tables(working_text, self._BORDER_TABLE_RE, tables, separate_tables, True)
            working_text = self._pull_tables(working_text, self._NO_BORDER_TABLE_RE, tables, separate_tables, True)

        # Replace any table-related tag carrying attributes with its bare
        # form, e.g. <table ...> to <table>; the original letter case is kept.
        working_text = self._TAG_WITH_ATTRIBUTES_RE.sub(lambda m: "<{}>".format(m.group(1)), working_text)

        if "<table>" in working_text.lower():  # cheap guard before the DOTALL regex
            working_text = self._pull_tables(working_text, self._HTML_TABLE_RE, tables, separate_tables, False)

        return working_text, tables
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class MarkdownElementExtractor:
    """Cut Markdown text into block-level pieces.

    The text is either split on caller-supplied delimiters or scanned line
    by line into headers, fenced code blocks, list blocks, blockquotes and
    plain text blocks.
    """

    _HEADER_RE = re.compile(r"^#{1,6}\s+.*$")
    _BULLET_RE = re.compile(r"^\s*[-*+]\s+.*$")
    _NUMBERED_RE = re.compile(r"^\s*\d+\.\s+.*$")
    # The two nested-item patterns are already covered by the generic list
    # patterns above; they are kept so the checks mirror the full rule set.
    _NESTED_BULLET_RE = re.compile(r"^\s{2,}[-*+]\s+.*$")
    _NESTED_NUMBERED_RE = re.compile(r"^\s{2,}\d+\.\s+.*$")
    _INDENTED_TEXT_RE = re.compile(r"^\s+\w+.*$")

    def __init__(self, markdown_content):
        """Keep the raw content and its lines (split on ``"\\n"``)."""
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")

    def get_delimiters(self, delimiters):
        """Turn a delimiter spec into a regex alternation.

        Delimiters are the non-empty substrings enclosed in backticks inside
        ``delimiters``. Duplicates are dropped, longer delimiters come first,
        and each one is regex-escaped before being joined with ``|``.
        An empty string is returned when no backtick-quoted token exists.
        """
        quoted = re.findall(r"`([^`]+)`", delimiters)
        longest_first = sorted(set(quoted), key=len, reverse=True)
        escaped = [re.escape(token) for token in longest_first if token]
        return "|".join(escaped)

    def extract_elements(self, delimiter=None, include_meta=False):
        """Extract individual elements (headers, code blocks, lists, etc.)

        Args:
            delimiter: Optional delimiter spec (see ``get_delimiters``). When
                it yields at least one delimiter, the text is split on those
                delimiters instead of being parsed structurally.
            include_meta: Return dicts carrying ``start_line``/``end_line``
                (plus ``type`` for structural parsing) instead of bare
                content strings.

        Returns:
            A list of content strings, or of dicts when ``include_meta`` is
            true. Entries whose content is only whitespace are dropped.
        """
        splitter = self.get_delimiters(delimiter) if delimiter else ""
        if splitter:
            return self._split_on_delimiters(splitter, include_meta)

        found = []
        cursor = 0
        while cursor < len(self.lines):
            current = self.lines[cursor]
            stripped = current.strip()

            # Pick the reader for the kind of block starting on this line.
            if self._HEADER_RE.match(current):
                reader = self._extract_header
            elif stripped.startswith("```"):
                reader = self._extract_code_block
            elif self._is_list_item(current):
                reader = self._extract_list_block
            elif stripped.startswith(">"):
                reader = self._extract_blockquote
            elif stripped:
                reader = self._extract_text_block
            else:
                # Blank line between blocks.
                cursor += 1
                continue

            element = reader(cursor)
            found.append(element if include_meta else element["content"])
            cursor = element["end_line"] + 1

        if include_meta:
            return [entry for entry in found if entry["content"].strip()]
        return [entry for entry in found if entry.strip()]

    def _split_on_delimiters(self, splitter, include_meta):
        """Split the joined lines on the ``splitter`` regex."""
        text = "\n".join(self.lines)

        if not include_meta:
            trimmed = (piece.strip() for piece in re.split(splitter, text) if piece)
            return [piece for piece in trimmed if piece]

        # Character spans of the text between consecutive delimiter matches.
        spans = []
        resume_at = 0
        for hit in re.compile(splitter).finditer(text):
            spans.append((resume_at, hit.start()))
            resume_at = hit.end()
        spans.append((resume_at, len(text)))

        pieces = []
        for begin, finish in spans:
            chunk = text[begin:finish].strip()
            if not chunk:
                continue
            pieces.append(
                {
                    "content": chunk,
                    # Line numbers are the newline counts before each offset.
                    "start_line": text.count("\n", 0, begin),
                    "end_line": text.count("\n", 0, finish),
                }
            )
        return pieces

    @classmethod
    def _is_list_item(cls, line):
        """Tell whether ``line`` starts with a bullet or ``N.`` list marker."""
        return bool(cls._BULLET_RE.match(line) or cls._NUMBERED_RE.match(line))

    @classmethod
    def _opens_block(cls, line):
        """Tell whether ``line`` begins a header, fence, list item or quote."""
        stripped = line.strip()
        return bool(cls._HEADER_RE.match(line) or stripped.startswith("```") or cls._is_list_item(line) or stripped.startswith(">"))

    @staticmethod
    def _element(kind, content_lines, first, last):
        """Build the element dict shared by all ``_extract_*`` methods."""
        return {
            "type": kind,
            "content": "\n".join(content_lines),
            "start_line": first,
            "end_line": last,
        }

    def _extract_header(self, start_pos):
        """Return the single header line at ``start_pos`` as an element."""
        return self._element("header", [self.lines[start_pos]], start_pos, start_pos)

    def _extract_code_block(self, start_pos):
        """Return the fenced block opening at ``start_pos``.

        The block runs through the next line starting with a fence, or to
        the last line when the fence is never closed.
        """
        idx = start_pos
        body = [self.lines[idx]]
        while idx + 1 < len(self.lines):
            idx += 1
            body.append(self.lines[idx])
            if self.lines[idx].strip().startswith("```"):
                break
        return self._element("code_block", body, start_pos, idx)

    def _extract_list_block(self, start_pos):
        """Return the run of list lines beginning at ``start_pos``.

        After the first line, blank lines and indented continuation lines
        are also treated as part of the list.
        """
        body = []
        idx = start_pos
        while idx < len(self.lines):
            candidate = self.lines[idx]
            belongs = self._is_list_item(candidate)
            if not belongs and idx > start_pos:
                belongs = bool(
                    not candidate.strip()
                    or self._NESTED_BULLET_RE.match(candidate)
                    or self._NESTED_NUMBERED_RE.match(candidate)
                    or self._INDENTED_TEXT_RE.match(candidate)
                )
            if not belongs:
                break
            body.append(candidate)
            idx += 1
        last = idx - 1 if body else start_pos
        return self._element("list_block", body, start_pos, last)

    def _extract_blockquote(self, start_pos):
        """Return the run of ``>`` lines from ``start_pos``.

        Blank lines after the first line are included in the run.
        """
        body = []
        idx = start_pos
        while idx < len(self.lines):
            candidate = self.lines[idx]
            quoted = candidate.strip().startswith(">")
            if not quoted and not (idx > start_pos and not candidate.strip()):
                break
            body.append(candidate)
            idx += 1
        last = idx - 1 if body else start_pos
        return self._element("blockquote", body, start_pos, last)

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element"""
        body = [self.lines[start_pos]]
        idx = start_pos + 1
        while idx < len(self.lines):
            candidate = self.lines[idx]
            if self._opens_block(candidate):
                break
            # A blank line ends the block only when a block element follows.
            if not candidate.strip() and idx + 1 < len(self.lines) and self._opens_block(self.lines[idx + 1]):
                break
            body.append(candidate)
            idx += 1
        return self._element("text_block", body, start_pos, idx - 1)
|