deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,221 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ import html
19
+ import uuid
20
+
21
+ import chardet
22
+ from bs4 import BeautifulSoup, NavigableString, Tag, Comment
23
+
24
+ from ..config import TokenizerConfig
25
+ from ..depend.find_codec import find_codec
26
+ from ..depend.rag_tokenizer import RagTokenizer
27
+
28
def get_encoding(file):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the ``'encoding'`` entry of the detection result is returned as-is.

    Args:
        file: Path of the file to inspect.

    Returns:
        Whatever ``chardet.detect`` reports under the ``'encoding'`` key.
    """
    with open(file, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
32
+
33
# Tags that open a new text block: every text node found underneath one of
# these elements is tagged with the same freshly generated block id.
BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption",
]

# Heading tag -> Markdown ATX heading prefix of the same level
# (h1 -> "#", ..., h6 -> "######").
TITLE_TAGS = {
    "h1": "#",
    "h2": "##",
    "h3": "###",
    "h4": "####",
    "h5": "#####",
    "h6": "######",
}
41
+
42
+
43
class RAGFlowHtmlParser:
    """Turn an HTML document into a list of text chunks plus raw table HTML.

    The document is cleaned (scripts, styles, inline ``style`` attributes and
    comments removed), walked recursively to collect text nodes and tables,
    merged per block-level element and finally packed into chunks bounded by
    a token budget measured with ``self.tokenizer``.
    """

    def __init__(self, tokenizer_cfg: TokenizerConfig | None = None):
        """Create the parser and its tokenizer.

        Args:
            tokenizer_cfg: Tokenizer configuration. When ``None`` it is
                obtained from ``TokenizerConfig.from_env()``.
        """
        if tokenizer_cfg is None:
            tokenizer_cfg = TokenizerConfig.from_env()
        self.tokenizer_cfg = tokenizer_cfg
        self.tokenizer = RagTokenizer(
            dict_prefix=tokenizer_cfg.resolve_dict_prefix(),
            offline=tokenizer_cfg.offline,
            nltk_data_dir=tokenizer_cfg.nltk_data_dir,
        )

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse HTML given either as raw bytes or as a file path.

        Args:
            fnm: Path of the HTML file; only read when ``binary`` is falsy.
            binary: Raw document bytes. When truthy they take precedence over
                ``fnm`` and are decoded with the codec reported by
                ``find_codec`` (undecodable bytes are dropped).
            chunk_token_num: Token budget per chunk, forwarded to
                ``parser_txt``.

        Returns:
            The list of sections produced by ``parser_txt``.
        """
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                txt = f.read()
        return self.parser_txt(txt, chunk_token_num)

    def parser_txt(self, txt, chunk_token_num):
        """Clean an HTML string and split it into sections.

        Args:
            txt: HTML markup.
            chunk_token_num: Token budget per text chunk.

        Returns:
            A list of strings: the text chunks first, followed by the HTML of
            every table found in the document.

        Raises:
            TypeError: If ``txt`` is not a ``str``.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be string!")

        temp_sections = []
        soup = BeautifulSoup(txt, "html5lib")
        # Drop every <style> and <script> element, wherever it is nested.
        # (A second pass over <div> descendants is unnecessary: no <script>
        # element survives this loop.)
        for unwanted_tag in soup.find_all(["style", "script"]):
            unwanted_tag.decompose()
        # Drop inline styles.
        for tag in soup.find_all(True):
            tag.attrs.pop("style", None)
        # Drop HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        self.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
        block_txt_list, table_list = self.merge_block_text(temp_sections)
        sections = self.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
        sections.extend(table.get("content", "") for table in table_list)
        return sections

    def split_table(self, html_table, chunk_token_num=512):
        """Split one HTML table into several tables bounded by a token budget.

        Rows (``<tr>``) are packed greedily: a new table is started as soon as
        adding the next row would exceed ``chunk_token_num``. A single row
        larger than the budget is kept whole in a table of its own.

        Args:
            html_table: HTML markup containing the table rows.
            chunk_token_num: Maximum number of tokens per output table, as
                counted by splitting ``self.tokenizer.tokenize`` output on
                spaces.

        Returns:
            A list of ``<table>...</table>`` strings; empty when the markup
            contains no ``<tr>``.
        """
        soup = BeautifulSoup(html_table, "html.parser")
        tables = []
        current_table = []
        current_count = 0
        for row in soup.find_all("tr"):
            tks_str = self.tokenizer.tokenize(str(row))
            token_count = len(tks_str.split(" ")) if tks_str else 0
            # Only flush a non-empty group; otherwise an oversized first row
            # would produce an empty <table></table> chunk.
            if current_table and current_count + token_count > chunk_token_num:
                tables.append(current_table)
                current_table = []
                current_count = 0
            current_table.append(row)
            current_count += token_count
        if current_table:
            tables.append(current_table)

        table_str_list = []
        for table_rows in tables:
            new_table = soup.new_tag("table")
            for row in table_rows:
                new_table.append(row)
            table_str_list.append(str(new_table))
        return table_str_list

    def read_text_recursively(self, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
        """Walk a parsed element and collect its text nodes and tables.

        Args:
            element: A BeautifulSoup node (``NavigableString`` or ``Tag``).
                Anything else yields an empty result.
            parser_result: Output list; items found below a ``Tag`` are
                appended to it in document order.
            chunk_token_num: Passed through unchanged to recursive calls.
            parent_name: Name of the enclosing tag, recorded as ``tag_name``
                on text items (``"inner_text"`` when falsy).
            block_id: Id of the enclosing block-level element, if any.

        Returns:
            The items produced by ``element`` itself, which the caller is
            expected to add to ``parser_result``: one dict for a plain text
            node, one dict for a ``<table>``, and ``[]`` otherwise.
        """
        if isinstance(element, NavigableString):
            content = element.strip()
            if not content:
                return []

            # A text node may itself contain (previously escaped) markup.
            # Parse it once; any parser failure means "treat as plain text".
            try:
                inner_soup = BeautifulSoup(content, "html.parser")
                has_markup = inner_soup.find() is not None
            except Exception:
                has_markup = False

            if has_markup:
                # The Tag branch ignores the parent name it receives, so the
                # enclosing tag's name is forwarded instead of reading
                # ``.name`` on the string node.
                child_info = self.read_text_recursively(
                    inner_soup, parser_result, chunk_token_num, parent_name, block_id
                )
                parser_result.extend(child_info)
                return []

            return [{
                "content": content,
                "tag_name": parent_name if parent_name else "inner_text",
                "metadata": {"block_id": block_id},
            }]

        if isinstance(element, Tag):
            tag_name = element.name.lower()
            if tag_name == "table":
                # Tables are kept as one unit of HTML and are not descended into.
                return [{
                    "content": html.unescape(str(element)),
                    "tag_name": "table",
                    "metadata": {"table_id": str(uuid.uuid1()), "index": 0},
                }]

            if tag_name in BLOCK_TAGS:
                # Every block-level element starts a new block; nested text
                # inherits the innermost block id.
                block_id = str(uuid.uuid1())
            for child in element.children:
                child_info = self.read_text_recursively(
                    child, parser_result, chunk_token_num, element.name, block_id
                )
                parser_result.extend(child_info)
        return []

    def merge_block_text(self, parser_result):
        """Merge collected text items into one string per block.

        Consecutive items sharing a ``block_id`` are joined with a space.
        Items whose ``tag_name`` is a heading tag get the matching Markdown
        prefix from ``TITLE_TAGS``. Text without a ``block_id`` is appended to
        the text currently being accumulated; table items are set aside.

        Args:
            parser_result: Items produced by ``read_text_recursively``.

        Returns:
            A tuple ``(block_content, table_info_list)``: the merged block
            strings and the untouched table items.
        """
        block_content = []
        table_info_list = []
        current_content = ""
        last_block_id = None
        for item in parser_result:
            content = item.get("content")
            tag_name = item.get("tag_name")
            block_id = item.get("metadata", {}).get("block_id")

            if not block_id:
                if tag_name == "table":
                    table_info_list.append(item)
                else:
                    current_content += (" " if current_content else "") + content
                continue

            if tag_name in TITLE_TAGS:
                content = f"{TITLE_TAGS[tag_name]} {content}"

            if block_id != last_block_id:
                # Flush whatever has been accumulated so far. This also keeps
                # loose text that precedes the first block-level element,
                # which would otherwise be overwritten and lost.
                if current_content:
                    block_content.append(current_content)
                current_content = content
                last_block_id = block_id
            else:
                current_content += (" " if current_content else "") + content

        if current_content:
            block_content.append(current_content)
        return block_content, table_info_list

    def chunk_block(self, block_txt_list, chunk_token_num=512):
        """Pack block strings into chunks of at most ``chunk_token_num`` tokens.

        Token counts come from splitting ``self.tokenizer.tokenize(block)`` on
        single spaces. Blocks that fit are joined with newlines. A block that
        exceeds the budget on its own is emitted as consecutive slices of its
        *tokenized* form (tokens re-joined with spaces), not of its original
        text.

        Args:
            block_txt_list: Block strings, in document order.
            chunk_token_num: Token budget per chunk. Values below 1 are
                treated as 1 when slicing an oversized block, so the slicing
                always terminates.

        Returns:
            The list of chunk strings.
        """
        chunks = []
        current_block = ""
        current_token_count = 0
        step = max(chunk_token_num, 1)

        for block in block_txt_list:
            tks_str = self.tokenizer.tokenize(block)
            tokens = tks_str.split(" ") if tks_str else []
            block_token_count = len(tokens)

            if block_token_count > chunk_token_num:
                # Oversized block: flush pending text, then slice the tokens.
                if current_block:
                    chunks.append(current_block)
                chunks.extend(
                    " ".join(tokens[start:start + step])
                    for start in range(0, block_token_count, step)
                )
                current_block = ""
                current_token_count = 0
            elif current_token_count + block_token_count <= chunk_token_num:
                current_block += ("\n" if current_block else "") + block
                current_token_count += block_token_count
            else:
                chunks.append(current_block)
                current_block = block
                current_token_count = block_token_count

        if current_block:
            chunks.append(current_block)

        return chunks
@@ -0,0 +1,179 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ # This implementation is adapted, with only minor modifications, from LangChain's JSON text splitter:
19
+ # https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
20
+
21
+ import json
22
+ from typing import Any
23
+
24
+ from ..depend.find_codec import find_codec
25
+
26
+
27
+ class RAGFlowJsonParser:
28
+ def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
29
+ super().__init__()
30
+ self.max_chunk_size = max_chunk_size * 2
31
+ self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)
32
+
33
+ def __call__(self, binary):
34
+ encoding = find_codec(binary)
35
+ txt = binary.decode(encoding, errors="ignore")
36
+
37
+ if self.is_jsonl_format(txt):
38
+ sections = self._parse_jsonl(txt)
39
+ else:
40
+ sections = self._parse_json(txt)
41
+ return sections
42
+
43
+ @staticmethod
44
+ def _json_size(data: dict) -> int:
45
+ """Calculate the size of the serialized JSON object."""
46
+ return len(json.dumps(data, ensure_ascii=False))
47
+
48
+ @staticmethod
49
+ def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
50
+ """Set a value in a nested dictionary based on the given path."""
51
+ for key in path[:-1]:
52
+ d = d.setdefault(key, {})
53
+ d[path[-1]] = value
54
+
55
+ def _list_to_dict_preprocessing(self, data: Any) -> Any:
56
+ if isinstance(data, dict):
57
+ # Process each key-value pair in the dictionary
58
+ return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
59
+ elif isinstance(data, list):
60
+ # Convert the list to a dictionary with index-based keys
61
+ return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
62
+ else:
63
+ # Base case: the item is neither a dict nor a list, so return it unchanged
64
+ return data
65
+
66
+ def _json_split(
67
+ self,
68
+ data,
69
+ current_path: list[str] | None,
70
+ chunks: list[dict] | None,
71
+ ) -> list[dict]:
72
+ """
73
+ Split json into maximum size dictionaries while preserving structure.
74
+ """
75
+ current_path = current_path or []
76
+ chunks = chunks or [{}]
77
+ if isinstance(data, dict):
78
+ for key, value in data.items():
79
+ new_path = current_path + [key]
80
+ chunk_size = self._json_size(chunks[-1])
81
+ size = self._json_size({key: value})
82
+ remaining = self.max_chunk_size - chunk_size
83
+
84
+ if size < remaining:
85
+ # Add item to current chunk
86
+ self._set_nested_dict(chunks[-1], new_path, value)
87
+ else:
88
+ if chunk_size >= self.min_chunk_size:
89
+ # Chunk is big enough, start a new chunk
90
+ chunks.append({})
91
+
92
+ # Iterate
93
+ self._json_split(value, new_path, chunks)
94
+ else:
95
+ # handle single item
96
+ self._set_nested_dict(chunks[-1], current_path, data)
97
+ return chunks
98
+
99
+ def split_json(
100
+ self,
101
+ json_data,
102
+ convert_lists: bool = False,
103
+ ) -> list[dict]:
104
+ """Splits JSON into a list of JSON chunks"""
105
+
106
+ if convert_lists:
107
+ preprocessed_data = self._list_to_dict_preprocessing(json_data)
108
+ chunks = self._json_split(preprocessed_data, None, None)
109
+ else:
110
+ chunks = self._json_split(json_data, None, None)
111
+
112
+ # Remove the last chunk if it's empty
113
+ if not chunks[-1]:
114
+ chunks.pop()
115
+ return chunks
116
+
117
+ def split_text(
118
+ self,
119
+ json_data: dict[str, Any],
120
+ convert_lists: bool = False,
121
+ ensure_ascii: bool = True,
122
+ ) -> list[str]:
123
+ """Splits JSON into a list of JSON formatted strings"""
124
+
125
+ chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
126
+
127
+ # Convert to string
128
+ return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
129
+
130
+ def _parse_json(self, content: str) -> list[str]:
131
+ sections = []
132
+ try:
133
+ json_data = json.loads(content)
134
+ chunks = self.split_json(json_data, True)
135
+ sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
136
+ except json.JSONDecodeError:
137
+ pass
138
+ return sections
139
+
140
+ def _parse_jsonl(self, content: str) -> list[str]:
141
+ lines = content.strip().splitlines()
142
+ all_chunks = []
143
+ for line in lines:
144
+ if not line.strip():
145
+ continue
146
+ try:
147
+ data = json.loads(line)
148
+ chunks = self.split_json(data, convert_lists=True)
149
+ all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
150
+ except json.JSONDecodeError:
151
+ continue
152
+ return all_chunks
153
+
154
+ def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
155
+ lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
156
+ if not lines:
157
+ return False
158
+
159
+ try:
160
+ json.loads(txt)
161
+ return False
162
+ except json.JSONDecodeError:
163
+ pass
164
+
165
+ sample_limit = min(len(lines), sample_limit)
166
+ sample_lines = lines[:sample_limit]
167
+ valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))
168
+
169
+ if not valid_lines:
170
+ return False
171
+
172
+ return (valid_lines / len(sample_lines)) >= threshold
173
+
174
+ def _is_valid_json(self, line: str) -> bool:
175
+ try:
176
+ json.loads(line)
177
+ return True
178
+ except json.JSONDecodeError:
179
+ return False