deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,321 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ import re
19
+
20
+ from markdown import markdown
21
+
22
+
23
class RAGFlowMarkdownParser:
    """Separate tables (Markdown and HTML) from the rest of a Markdown text."""

    # Pipe-bordered Markdown table: a header row, a separator row and one or
    # more body rows, each starting with "|". The last body row may end at the
    # end of the text instead of a newline, so a table that closes the
    # document without a trailing "\n" is still captured completely.
    _BORDER_TABLE_RE = re.compile(
        r"""
        (?:\n|^)
        (?:\|.*?\|.*?\|.*?\n)
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
        (?:\|.*?\|.*?\|.*?(?:\n|\Z))+
        """,
        re.VERBOSE,
    )

    # Borderless Markdown table: rows do not start with "|" but contain one.
    # Same end-of-text allowance for the last body row as above.
    _NO_BORDER_TABLE_RE = re.compile(
        r"""
        (?:\n|^)
        (?:\S.*?\|.*?\n)
        (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
        (?:\S.*?\|.*?(?:\n|\Z))+
        """,
        re.VERBOSE,
    )

    # Opening tags whose attributes are stripped (e.g. <table ...> -> <table>).
    _STRIPPED_TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
    _TAG_WITH_ATTRIBUTES_RE = re.compile(rf"<(?:{'|'.join(_STRIPPED_TAGS)})[^>]*>", re.IGNORECASE)
    _TAG_NAME_RE = re.compile(r"<(\w+)")

    # HTML table, optionally wrapped in <body> or <html><body>.
    _HTML_TABLE_RE = re.compile(
        r"""
        (?:\n|^)
        \s*
        (?:
            # case1: <html><body><table>...</table></body></html>
            (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
            |
            # case2: <body><table>...</table></body>
            (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
            |
            # case3: only<table>...</table>
            (?:<table[^>]*>.*?</table>)
        )
        \s*
        (?=\n|$)
        """,
        re.VERBOSE | re.DOTALL | re.IGNORECASE,
    )

    def __init__(self, chunk_token_num=128):
        """Store ``chunk_token_num`` coerced to ``int``.

        The value is only stored here; nothing in this class reads it.
        """
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Find tables in ``markdown_text`` and return the text plus the tables.

        Three passes run in order: pipe-bordered Markdown tables, borderless
        Markdown tables, then HTML tables. The two Markdown passes are skipped
        when the text contains no ``|``; the HTML pass is skipped when no
        ``<table>`` remains after tag normalisation.

        Before the HTML pass, every opening ``table``/``td``/``tr``/``th``/
        ``tbody``/``thead``/``div`` tag in the text has its attributes removed.
        This affects the returned text even when no table is extracted.

        Args:
            markdown_text: The Markdown source to scan.
            separate_tables: When true, each matched table is cut out of the
                text and replaced by a blank line. When false, Markdown tables
                are replaced in place by their rendered HTML and HTML tables
                are kept in place; both are followed by a blank line.

        Returns:
            A ``(text, tables)`` tuple. ``tables`` holds the raw matched text
            of every table in the order found. Each entry includes whatever
            surrounding newline or whitespace the pattern consumed.
        """
        tables = []

        def replace_tables(text, pattern, render):
            # Rebuild ``text`` around every match of ``pattern``, recording
            # each raw match in ``tables``. Pieces are joined once at the end
            # to avoid repeated string concatenation.
            pieces = []
            last_end = 0
            for match in pattern.finditer(text):
                raw_table = match.group()
                tables.append(raw_table)
                pieces.append(text[last_end : match.start()])
                if not separate_tables:
                    if render:
                        pieces.append(markdown(raw_table, extensions=["markdown.extensions.tables"]))
                    else:
                        pieces.append(raw_table)
                pieces.append("\n\n")
                last_end = match.end()
            pieces.append(text[last_end:])
            return "".join(pieces)

        working_text = markdown_text

        if "|" in markdown_text:  # cheap guard: no pipe means no Markdown table
            working_text = replace_tables(working_text, self._BORDER_TABLE_RE, render=True)
            working_text = replace_tables(working_text, self._NO_BORDER_TABLE_RE, render=True)

        def strip_attributes(m):
            # The tag name is re-read with \w+ rather than taken from the
            # alternation, so "<thead ...>" becomes "<thead>" and not "<th>".
            tag_name = self._TAG_NAME_RE.match(m.group()).group(1)
            return "<{}>".format(tag_name)

        working_text = self._TAG_WITH_ATTRIBUTES_RE.sub(strip_attributes, working_text)

        if "<table>" in working_text.lower():  # cheap guard before the DOTALL scan
            working_text = replace_tables(working_text, self._HTML_TABLE_RE, render=False)

        return working_text, tables
123
+
124
+
125
class MarkdownElementExtractor:
    """Break Markdown text into block-level elements or delimiter-separated pieces."""

    # Line-level patterns shared by the dispatcher and the block extractors.
    _HEADER_RE = re.compile(r"^#{1,6}\s+.*$")
    _BULLET_RE = re.compile(r"^\s*[-*+]\s+.*$")
    _ORDERED_RE = re.compile(r"^\s*\d+\.\s+.*$")
    _INDENTED_TEXT_RE = re.compile(r"^\s+\w+.*$")

    def __init__(self, markdown_content):
        """Keep the raw text and its lines (split on ``"\\n"``)."""
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")

    def get_delimiters(self, delimiters):
        """Build a regex alternation from the backtick-quoted tokens in ``delimiters``.

        Tokens are de-duplicated, ordered longest first and regex-escaped.
        Returns an empty string when no backtick-quoted token is present.
        """
        unique_tokens = set(re.findall(r"`([^`]+)`", delimiters))
        longest_first = sorted(unique_tokens, key=len, reverse=True)
        return "|".join(re.escape(token) for token in longest_first if token)

    def extract_elements(self, delimiter=None, include_meta=False):
        """Extract individual elements (headers, code blocks, lists, etc.).

        When ``delimiter`` yields at least one backtick-quoted token, the text
        is split on those tokens instead of being parsed into blocks.

        Returns a list of content strings, or, with ``include_meta``, a list
        of dicts carrying ``content``, ``start_line`` and ``end_line`` (plus
        ``type`` for block-parsed elements).
        """
        split_regex = self.get_delimiters(delimiter) if delimiter else ""
        if split_regex:
            return self._split_on_delimiters(split_regex, include_meta)

        collected = []
        cursor = 0
        while cursor < len(self.lines):
            extractor = self._select_extractor(self.lines[cursor])
            if extractor is None:
                # blank line between elements
                cursor += 1
                continue
            element = extractor(cursor)
            collected.append(element if include_meta else element["content"])
            cursor = element["end_line"] + 1

        if include_meta:
            return [item for item in collected if item["content"].strip()]
        return [item for item in collected if item.strip()]

    def _split_on_delimiters(self, split_regex, include_meta):
        """Split the joined lines on ``split_regex``, dropping blank pieces."""
        text = "\n".join(self.lines)
        if not include_meta:
            return [piece.strip() for piece in re.split(split_regex, text) if piece and piece.strip()]

        # Each delimiter match closes the piece before it; a zero-width
        # sentinel at the end of the text closes the final piece.
        boundaries = [(m.start(), m.end()) for m in re.compile(split_regex).finditer(text)]
        boundaries.append((len(text), len(text)))

        sections = []
        piece_start = 0
        for boundary_start, boundary_end in boundaries:
            piece = text[piece_start:boundary_start]
            if piece and piece.strip():
                sections.append(
                    {
                        "content": piece.strip(),
                        "start_line": text.count("\n", 0, piece_start),
                        "end_line": text.count("\n", 0, boundary_start),
                    }
                )
            piece_start = boundary_end
        return sections

    def _is_list_item(self, line):
        """Return True when ``line`` is a bullet or numbered list item."""
        return bool(self._BULLET_RE.match(line) or self._ORDERED_RE.match(line))

    def _starts_block(self, line):
        """Return True when ``line`` opens a header, code fence, list or blockquote."""
        if self._HEADER_RE.match(line):
            return True
        stripped = line.strip()
        if stripped.startswith("```"):
            return True
        if self._is_list_item(line):
            return True
        return stripped.startswith(">")

    def _select_extractor(self, line):
        """Pick the extractor for the element starting at ``line`` (None for a blank line)."""
        if self._HEADER_RE.match(line):
            return self._extract_header
        stripped = line.strip()
        if stripped.startswith("```"):
            return self._extract_code_block
        if self._is_list_item(line):
            return self._extract_list_block
        if stripped.startswith(">"):
            return self._extract_blockquote
        if stripped:
            # paragraphs and inline content up to the next block element
            return self._extract_text_block
        return None

    def _extract_header(self, start_pos):
        """Return the single-line header element at ``start_pos``."""
        element = {"type": "header", "content": self.lines[start_pos]}
        element["start_line"] = start_pos
        element["end_line"] = start_pos
        return element

    def _extract_code_block(self, start_pos):
        """Return the fenced code block opening at ``start_pos``.

        The block runs through the next line that starts with a fence, or to
        the last line when no closing fence follows.
        """
        body = [self.lines[start_pos]]
        last_index = start_pos
        for index in range(start_pos + 1, len(self.lines)):
            candidate = self.lines[index]
            body.append(candidate)
            last_index = index
            if candidate.strip().startswith("```"):
                break

        return {
            "type": "code_block",
            "content": "\n".join(body),
            "start_line": start_pos,
            "end_line": last_index,
        }

    def _extract_list_block(self, start_pos):
        """Return the list block starting at ``start_pos``.

        After the first line, blank lines and indented lines beginning with a
        word character are kept as part of the list.
        """
        kept = []
        last_index = start_pos
        for index in range(start_pos, len(self.lines)):
            line = self.lines[index]
            belongs = self._is_list_item(line)
            if not belongs and index > start_pos:
                belongs = (not line.strip()) or bool(self._INDENTED_TEXT_RE.match(line))
            if not belongs:
                break
            kept.append(line)
            last_index = index

        return {
            "type": "list_block",
            "content": "\n".join(kept),
            "start_line": start_pos,
            "end_line": last_index,
        }

    def _extract_blockquote(self, start_pos):
        """Return the blockquote starting at ``start_pos``.

        Lines starting with ``>`` are collected; blank lines after the first
        line are kept as part of the quote.
        """
        kept = []
        last_index = start_pos
        for index in range(start_pos, len(self.lines)):
            line = self.lines[index]
            quoted = line.strip().startswith(">")
            blank_after_first = index > start_pos and not line.strip()
            if not (quoted or blank_after_first):
                break
            kept.append(line)
            last_index = index

        return {
            "type": "blockquote",
            "content": "\n".join(kept),
            "start_line": start_pos,
            "end_line": last_index,
        }

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element."""
        total = len(self.lines)
        kept = [self.lines[start_pos]]
        last_index = start_pos

        for index in range(start_pos + 1, total):
            line = self.lines[index]
            if self._starts_block(line):
                break
            if not line.strip():
                # a blank line ends the block only when a block element follows it
                following = index + 1
                if following < total and self._starts_block(self.lines[following]):
                    break
            kept.append(line)
            last_index = index

        return {
            "type": "text_block",
            "content": "\n".join(kept),
            "start_line": start_pos,
            "end_line": last_index,
        }