deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,150 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from docx import Document
18
+ import re
19
+ import pandas as pd
20
+ from collections import Counter
21
+ from io import BytesIO
22
+
23
+ from ..config import TokenizerConfig
24
+ from ..depend.rag_tokenizer import RagTokenizer
25
+
26
+
27
class RAGFlowDocxParser:
    """Parse a .docx document into paragraph sections and flattened tables.

    Calling an instance returns ``(secs, tbls)``: ``secs`` holds one
    ``(text, style_name)`` tuple per paragraph and ``tbls`` holds one entry
    per table, each a list of strings built by the table composer.
    """

    # Ordered (compiled regex, label) pairs used to classify a table cell; the
    # first matching pattern wins. Compiled once instead of once per cell.
    # The year+letter pattern used to be labelled "DT", which split date votes
    # between two labels; it now shares "Dt" with the other date patterns.
    _CELL_PATTERNS = tuple(
        (re.compile(pattern), label)
        for pattern, label in (
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
            ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
            ("^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg"),
        )
    )

    def __init__(self, tokenizer_cfg: "TokenizerConfig | None" = None):
        """Build the tokenizer used for table-cell classification.

        Args:
            tokenizer_cfg: Tokenizer settings; when ``None`` they are loaded
                with ``TokenizerConfig.from_env()``.
        """
        if tokenizer_cfg is None:
            tokenizer_cfg = TokenizerConfig.from_env()
        self.tokenizer_cfg = tokenizer_cfg
        self.tokenizer = RagTokenizer(
            dict_prefix=tokenizer_cfg.resolve_dict_prefix(),
            offline=tokenizer_cfg.offline,
            nltk_data_dir=tokenizer_cfg.nltk_data_dir,
        )

    def __extract_table_content(self, tb):
        """Collect the cell texts of table ``tb`` row by row and compose them."""
        rows = [[c.text for c in row.cells] for row in tb.rows]
        return self.__compose_table_content(pd.DataFrame(rows))

    def _block_type(self, text):
        """Return the type label of one cell's text.

        The regex table is tried first; otherwise the text is tokenized and
        labelled by token count ("Tx" for 4-11 tokens, "Lx" for 12 or more),
        "Nr" when the single remaining token is tagged "nr", else "Ot".
        """
        for regex, label in self._CELL_PATTERNS:
            if regex.search(text):
                return label
        tks = [t for t in self.tokenizer.tokenize(text).split() if len(t) > 1]
        if len(tks) > 3:
            return "Tx" if len(tks) < 12 else "Lx"
        if len(tks) == 1 and self.tokenizer.tag(tks[0]) == "nr":
            return "Nr"
        return "Ot"

    def __compose_table_content(self, df):
        """Flatten a table into "header: value" text.

        Row 0 is always a header. When the dominant cell type of the body is
        numeric ("Nu"), any body row whose own dominant type differs is also
        treated as a header row. Every non-header row becomes one line of
        ``header: value`` cells joined by ";", using the closest contiguous
        run of header rows above it.

        Args:
            df: DataFrame of cell values, one row per table row.

        Returns:
            ``[]`` for tables with fewer than two rows or no columns; the list
            of lines when the table has more than three columns; otherwise a
            one-element list holding all lines joined by newlines.
        """
        if len(df) < 2 or df.shape[1] == 0:
            # No body rows, or nothing to classify (max() of an empty counter
            # would raise).
            return []

        n_rows, n_cols = df.shape
        # Stringify every cell once. Missing values (pandas pads ragged rows
        # with None) become empty strings instead of the literal "None".
        grid = [
            ["" if pd.isna(df.iloc[i, j]) else str(df.iloc[i, j]) for j in range(n_cols)]
            for i in range(n_rows)
        ]
        # Type labels of the body rows only; body_types[r - 1] belongs to row r.
        body_types = [[self._block_type(cell) for cell in row] for row in grid[1:]]

        def dominant(labels):
            # Most frequent label; ties go to the label seen first.
            return max(Counter(labels).items(), key=lambda x: x[1])[0]

        max_type = dominant(label for row in body_types for label in row)

        hdrows = [0]  # the header does not necessarily appear only in the first line
        if max_type == "Nu":
            for r in range(1, n_rows):
                if dominant(body_types[r - 1]) != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, n_rows):
            if i in hdrows:
                continue
            # Offsets (negative) of the header rows located above row i.
            hr = [r - i for r in hdrows if r - i < 0]
            # Keep only the contiguous run of header rows closest to row i.
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1

            headers = []
            for j in range(n_cols):
                names = []
                for h in hr:
                    x = grid[i + h][j].strip()
                    if x in names:
                        continue
                    names.append(x)
                label = ",".join(names)
                if label:
                    label += ": "
                headers.append(label)

            cells = [headers[j] + grid[i][j] for j in range(n_cols) if grid[i][j]]
            lines.append(";".join(cells))

        if n_cols > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse a document and return ``(secs, tbls)``.

        Args:
            fnm: Path of the .docx file (``str``) or its raw bytes.
            from_page: First page (0-based) whose runs are kept.
            to_page: Page at which run collection stops (exclusive).

        Returns:
            ``secs``: one ``(text, style_name)`` tuple per visited paragraph;
            the text is empty when none of its runs fall in the page range.
            ``tbls``: composed content of every table in the document.
        """
        self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
        pn = 0  # parsed page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            kept_runs = []  # run texts that fall inside the page range
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    kept_runs.append(run.text)

                # The page counter advances on Word's rendered page-break marker.
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            style_name = p.style.name if hasattr(p.style, 'name') else ''
            secs.append(("".join(kept_runs), style_name))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
@@ -0,0 +1,270 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+
14
+ import logging
15
+ import re
16
+ import sys
17
+ from io import BytesIO
18
+
19
+ import pandas as pd
20
+ from openpyxl import Workbook, load_workbook
21
+ from PIL import Image
22
+
23
+ from ..depend.find_codec import find_codec
24
+
25
# copied from `/openpyxl/cell/cell.py`
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")


class RAGFlowExcelParser:
    """Read spreadsheets (or CSV data) and render them as text, HTML or Markdown.

    Every public entry point accepts raw bytes, a filesystem path (``str``) or
    an already opened seekable binary stream.
    """

    @staticmethod
    def _to_stream(source):
        """Return a seekable binary stream for ``source``.

        Bytes-like input is wrapped in ``BytesIO``; a ``str`` is treated as a
        path and read fully into memory; anything else is assumed to already
        be a seekable binary stream and is returned unchanged.
        """
        if isinstance(source, (bytes, bytearray, memoryview)):
            return BytesIO(source)
        if isinstance(source, str):
            with open(source, "rb") as fh:
                return BytesIO(fh.read())
        return source

    @staticmethod
    def _load_excel_to_workbook(file_like_object):
        """Load ``file_like_object`` into an openpyxl ``Workbook``.

        Content that does not start with a ZIP or OLE2 signature is parsed as
        CSV. Otherwise openpyxl is tried first, then pandas with its default
        engine, then pandas with the "calamine" engine.

        Raises:
            Exception: when every applicable loader fails.
        """
        file_like_object = RAGFlowExcelParser._to_stream(file_like_object)

        # Read first 4 bytes to determine file type
        file_like_object.seek(0)
        file_head = file_like_object.read(4)
        file_like_object.seek(0)

        if not file_head.startswith((b"PK\x03\x04", b"\xd0\xcf\x11\xe0")):
            logging.info("Not an Excel file, converting CSV to Excel Workbook")
            try:
                df = pd.read_csv(file_like_object, on_bad_lines='skip')
                return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_csv:
                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}") from e_csv

        try:
            return load_workbook(file_like_object, data_only=True)
        except Exception as e:
            logging.info(f"openpyxl load error: {e}, try pandas instead")
            try:
                file_like_object.seek(0)
                try:
                    dfs = pd.read_excel(file_like_object, sheet_name=None)
                    return RAGFlowExcelParser._dataframe_to_workbook(dfs)
                except Exception as ex:
                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
                    file_like_object.seek(0)
                    df = pd.read_excel(file_like_object, engine="calamine")
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_pandas:
                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") from e_pandas

    @staticmethod
    def _clean_dataframe(df: pd.DataFrame):
        """Return a copy of ``df`` whose string cells have illegal control characters replaced by spaces."""
        def clean_string(s):
            if isinstance(s, str):
                return ILLEGAL_CHARACTERS_RE.sub(" ", s)
            return s

        return df.apply(lambda col: col.map(clean_string))

    @staticmethod
    def _write_dataframe(ws, df):
        """Write ``df`` into worksheet ``ws``: column names in row 1, values from row 2 on."""
        df = RAGFlowExcelParser._clean_dataframe(df)
        for col_num, column_name in enumerate(df.columns, 1):
            ws.cell(row=1, column=col_num, value=column_name)
        for row_num, row in enumerate(df.values, 2):
            for col_num, value in enumerate(row, 1):
                ws.cell(row=row_num, column=col_num, value=value)

    @staticmethod
    def _dataframe_to_workbook(df):
        """Convert a DataFrame, or a dict of sheet name -> DataFrame, to a Workbook.

        A single DataFrame is written to a sheet titled "Data". Any dict goes
        through ``_dataframes_to_workbook``; this includes the one-sheet dict
        returned by ``pd.read_excel(sheet_name=None)``, which previously was
        treated as a DataFrame and failed.
        """
        if isinstance(df, dict):
            return RAGFlowExcelParser._dataframes_to_workbook(df)

        wb = Workbook()
        ws = wb.active
        ws.title = "Data"
        RAGFlowExcelParser._write_dataframe(ws, df)
        return wb

    @staticmethod
    def _dataframes_to_workbook(dfs: dict):
        """Convert a dict of sheet name -> DataFrame into a Workbook with one sheet per entry."""
        wb = Workbook()
        default_sheet = wb.active
        wb.remove(default_sheet)

        for sheet_name, df in dfs.items():
            ws = wb.create_sheet(title=sheet_name)
            RAGFlowExcelParser._write_dataframe(ws, df)
        return wb

    @staticmethod
    def _extract_images_from_worksheet(ws, sheetname=None):
        """
        Extract the images embedded in a worksheet.

        Each returned dict holds the sheet name, the image as an RGB PIL image,
        an empty "image_description" placeholder, the 1-based anchor cell range
        ("row_from", "col_from", "row_to", "col_to") and a "span_type" of
        "single_cell" or "multi_cell". Images that cannot be read are skipped
        with a warning.

        Returns: List[dict]
        """
        images = getattr(ws, "_images", [])
        if not images:
            return []

        raw_items = []
        for img in images:
            try:
                pil_img = Image.open(BytesIO(img._data())).convert("RGB")

                anchor = img.anchor
                r1, c1 = anchor._from.row + 1, anchor._from.col + 1
                if hasattr(anchor, "_to"):
                    r2, c2 = anchor._to.row + 1, anchor._to.col + 1
                else:
                    # Anchors without an end marker cover only their start cell.
                    r2, c2 = r1, c1
                span = "single_cell" if (r1, c1) == (r2, c2) else "multi_cell"

                raw_items.append({
                    "sheet": sheetname or ws.title,
                    "image": pil_img,
                    "image_description": "",
                    "row_from": r1,
                    "col_from": c1,
                    "row_to": r2,
                    "col_to": c2,
                    "span_type": span,
                })
            except Exception as e:
                # Best effort: one unreadable image must not abort the sheet.
                logging.warning(f"Skip image in sheet '{sheetname or getattr(ws, 'title', '')}': {e}")
                continue
        return raw_items

    def html(self, fnm, chunk_rows=256):
        """Render every sheet as HTML tables of at most ``chunk_rows`` body rows.

        The first row of a sheet is repeated as the ``<th>`` header of each
        chunk and the sheet name is used as the caption. Cell text and the
        caption are HTML-escaped.

        Returns:
            A list of ``<table>...</table>`` strings.
        """
        from html import escape

        wb = RAGFlowExcelParser._load_excel_to_workbook(fnm)
        tb_chunks = []

        def _fmt(v):
            return "" if v is None else str(v).strip()

        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            try:
                rows = list(ws.rows)
            except Exception as e:
                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                continue

            if not rows:
                continue

            header = "<tr>" + "".join(f"<th>{escape(_fmt(t.value))}</th>" for t in rows[0]) + "</tr>"
            opening = f"<table><caption>{escape(sheetname)}</caption>"

            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
                start = 1 + chunk_i * chunk_rows
                parts = [opening, header]
                for r in rows[start:start + chunk_rows]:
                    parts.append("<tr>" + "".join(f"<td>{escape(_fmt(c.value))}</td>" for c in r) + "</tr>")
                parts.append("</table>\n")
                tb_chunks.append("".join(parts))

        return tb_chunks

    def markdown(self, fnm):
        """Render the first sheet (or, failing that, the CSV content) as a Markdown table."""
        file_like_object = RAGFlowExcelParser._to_stream(fnm)
        try:
            file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
        except Exception as e:
            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
            file_like_object.seek(0)
            df = pd.read_csv(file_like_object, on_bad_lines='skip')
        # Whitespace-only cells are normalised to empty strings.
        df = df.replace(r"^\s*$", "", regex=True)
        return df.to_markdown(index=False)

    def __call__(self, fnm):
        """Flatten every sheet into one text line per data row.

        Each line is ``title:value`` pairs joined by "; ", with titles taken
        from the sheet's first row. Empty cells (``None`` or ``""``) are
        skipped; ``0`` and ``False`` are kept. Unless the sheet name contains
        "sheet" (case-insensitive), " ——<sheet name>" is appended to the line.

        Returns:
            The list of lines for all sheets, in workbook order.
        """
        wb = RAGFlowExcelParser._load_excel_to_workbook(fnm)

        res = []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            try:
                rows = list(ws.rows)
            except Exception as e:
                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                continue
            if not rows:
                continue

            # An empty header cell yields no title rather than the text "None".
            titles = ["" if c.value is None else str(c.value) for c in rows[0]]
            suffix = "" if "sheet" in sheetname.lower() else " ——" + sheetname

            for r in rows[1:]:
                fields = []
                for i, c in enumerate(r):
                    if c.value is None or c.value == "":
                        continue
                    title = titles[i] if i < len(titles) else ""
                    fields.append((title + ":" if title else "") + str(c.value))
                res.append("; ".join(fields) + suffix)
        return res

    @staticmethod
    def row_number(fnm, binary):
        """Count the rows of a spreadsheet or delimited text file.

        Args:
            fnm: File name; only its extension is inspected.
            binary: Raw file content.

        Returns:
            The total number of rows over all sheets for extensions containing
            "xls"; the number of newline-separated pieces for "csv"/"txt";
            ``None`` for any other extension.
        """
        ext = fnm.split(".")[-1].lower()

        if "xls" in ext:
            wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
            total = 0
            for sheetname in wb.sheetnames:
                try:
                    total += len(list(wb[sheetname].rows))
                except Exception as e:
                    logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
                    continue
            return total

        if ext in ("csv", "txt"):
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
            return len(txt.split("\n"))

        return None
266
+
267
+
268
if __name__ == "__main__":
    # Manual smoke test: print the flattened rows of the given spreadsheet.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <spreadsheet>")
    # The parser is given bytes, so the file is read here rather than passing
    # the path string through.
    with open(sys.argv[1], "rb") as fh:
        content = fh.read()
    psr = RAGFlowExcelParser()
    for line in psr(content):
        print(line)
@@ -0,0 +1,182 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ from concurrent.futures import ThreadPoolExecutor, as_completed
17
+
18
+ from PIL import Image
19
+
20
+ from ..llm_adapter import LLMType, LLMAdapter
21
+ from ..llm_adapter.vision import vision_llm_chunk as picture_vision_llm_chunk
22
+ from ..depend.prompts import vision_llm_figure_describe_prompt
23
+
24
+ # Try to import timeout from common, fallback to local
25
+ try:
26
+ from ..common.connection_utils import timeout
27
+ except ImportError:
28
+ from ..depend.timeout import timeout
29
+
30
+
31
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
    """Wrap position-less figure entries into the positioned figure format.

    Each input entry is indexed as ``entry[0]`` (description) and ``entry[1]``
    (image). Entries whose second element is a PIL image become
    ``((image, [description]), [(0, 0, 0, 0, 0)])``; the 5-tuple is a dummy
    position. All other entries are dropped.

    Returns:
        The list of wrapped entries, or ``[]`` for empty/``None`` input.
    """
    wrapped = []
    if not figures_data_without_positions:
        return wrapped

    for entry in figures_data_without_positions:
        picture = entry[1]
        if not isinstance(picture, Image.Image):
            continue
        figure_and_desc = (picture, [entry[0]])
        dummy_positions = [(0, 0, 0, 0, 0)]
        wrapped.append((figure_and_desc, dummy_positions))
    return wrapped
42
+
43
+
44
def vision_figure_parser_docx_wrapper(sections, tbls, callback=None, **kwargs):
    """Describe DOCX figures with a vision model and append them to ``tbls``.

    Args:
        sections: Figure entries accepted by
            ``vision_figure_parser_figure_data_wrapper``.
        tbls: List that the described figures are appended to (in place).
        callback: Optional ``callback(progress, message)`` progress reporter.
        **kwargs: ``tenant_id`` is used to build the vision model; everything
            is also forwarded to ``VisionFigureParser``.

    Returns:
        ``tbls``; unchanged when ``sections`` is empty, the vision model is
        unavailable, or figure parsing fails.
    """
    if not sections:
        return tbls

    if callback is None:
        # With callback=None the progress call below raised TypeError inside
        # the try block, which silently disabled the vision model.
        def callback(prog, msg):
            return None

    try:
        vision_model = LLMAdapter(kwargs.get("tenant_id"), LLMType.IMAGE2TEXT)
        callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        # Best effort: any failure here means "no vision enhancement".
        vision_model = None

    if vision_model:
        figures_data = vision_figure_parser_figure_data_wrapper(sections)
        try:
            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            boosted_figures = docx_vision_parser(callback=callback)
            tbls.extend(boosted_figures)
        except Exception as e:
            callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
    return tbls
61
+
62
def vision_figure_parser_figure_xlsx_wrapper(images, callback=None, **kwargs):
    """Describe images extracted from a spreadsheet with a vision model.

    Args:
        images: Dicts with at least the keys "image" (PIL image) and
            "image_description".
        callback: Optional ``callback(progress, message)`` progress reporter.
        **kwargs: ``tenant_id`` is used to build the vision model; everything
            is also forwarded to ``VisionFigureParser``.

    Returns:
        The described figures, or ``[]`` when there are no images, the vision
        model is unavailable, or parsing fails.
    """
    tbls = []
    if not images:
        return []

    if callback is None:
        # With callback=None the progress call below raised TypeError inside
        # the try block, which silently disabled the vision model.
        def callback(prog, msg):
            return None

    try:
        vision_model = LLMAdapter(kwargs.get("tenant_id"), LLMType.IMAGE2TEXT)
        callback(0.2, "Visual model detected. Attempting to enhance Excel image extraction...")
    except Exception:
        # Best effort: any failure here means "no vision enhancement".
        vision_model = None

    if vision_model:
        figures_data = [
            (
                (img["image"], [img["image_description"]]),  # (Image.Image, description list)
                [(0, 0, 0, 0, 0)],  # dummy position
            )
            for img in images
        ]
        try:
            parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            callback(0.22, "Parsing images...")
            boosted_figures = parser(callback=callback)
            tbls.extend(boosted_figures)
        except Exception as e:
            callback(0.25, f"Excel visual model error: {e}. Skipping vision enhancement.")
    return tbls
87
+
88
def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
    """Re-describe the figure items of ``tbls`` with a vision model.

    An item counts as a figure when ``item[0][0]`` is a PIL image and
    ``item[0][1]`` is a list. On success a new list is returned in which the
    figure items are replaced by their vision-enhanced versions (appended
    after the non-figure items).

    Args:
        tbls: Table/figure items produced by the PDF parser.
        callback: Optional ``callback(progress, message)`` progress reporter.
        **kwargs: ``tenant_id`` is used to build the vision model; everything
            is also forwarded to ``VisionFigureParser``.

    Returns:
        ``[]`` for empty input; otherwise the enhanced list, or ``tbls``
        unchanged when the vision model is unavailable or parsing fails.
    """
    if not tbls:
        return []

    if callback is None:
        # With callback=None the progress call below raised TypeError inside
        # the try block, which silently disabled the vision model.
        def callback(prog, msg):
            return None

    try:
        vision_model = LLMAdapter(kwargs.get("tenant_id"), LLMType.IMAGE2TEXT)
        callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        # Best effort: any failure here means "no vision enhancement".
        vision_model = None

    if vision_model:
        def is_figure_item(item):
            return (
                isinstance(item[0][0], Image.Image) and
                isinstance(item[0][1], list)
            )

        figures_data = [item for item in tbls if is_figure_item(item)]
        try:
            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            boosted_figures = docx_vision_parser(callback=callback)
            tbls = [item for item in tbls if not is_figure_item(item)]
            tbls.extend(boosted_figures)
        except Exception as e:
            callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
    return tbls
111
+
112
+
113
# Thread pool shared by every VisionFigureParser call in this module.
shared_executor = ThreadPoolExecutor(max_workers=10)


class VisionFigureParser:
    """Attach vision-model descriptions to a batch of figures.

    ``figures_data`` items come in one of two shapes:

    * ``((image, [description, ...]), [position, ...])`` where each position
      is a 5-tuple, or
    * ``(image, [description, ...])`` without positions.

    Calling the instance asks the vision model for a description of every
    figure and returns the assembled items.
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        """Store the model and split ``figures_data`` into parallel lists.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    def _extract_figures_info(self, figures_data):
        """Fill ``self.figures``, ``self.descriptions`` and ``self.positions``."""
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            # An item carries positions when it is ((image, descs), [5-tuple, ...]).
            # The emptiness check keeps an empty position list from raising
            # IndexError; such an item is rejected by the assertion below.
            has_position = (
                len(item) == 2
                and isinstance(item[0], tuple)
                and len(item[0]) == 2
                and isinstance(item[1], list)
                and len(item[1]) > 0
                and isinstance(item[1][0], tuple)
                and len(item[1][0]) == 5
            )
            if has_position:
                img_desc = item[0]
                assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        """Rebuild the output items from the parallel lists.

        Returns:
            ``[((figure, description), positions), ...]`` when positions were
            supplied, otherwise ``[((figure, description),), ...]``.
        """
        self.has_positions = len(self.positions) != 0
        self.assembled = []
        for i, figure in enumerate(self.figures):
            figure_desc = (figure, self.descriptions[i])
            if self.has_positions:
                self.assembled.append((figure_desc, self.positions[i]))
            else:
                self.assembled.append((figure_desc,))
        return self.assembled

    def __call__(self, **kwargs):
        """Describe every figure with the vision model and return the items.

        Keyword Args:
            callback: Optional ``callback(progress, message)``; a missing or
                ``None`` callback is replaced by a no-op.

        Returns:
            The list built by ``_assemble``. For each figure that received a
            non-empty description, the stored description becomes that text
            followed by the previous descriptions joined with newlines.

        Raises:
            Whatever a worker raises; the first failing figure aborts the call.
        """
        callback = kwargs.get("callback") or (lambda prog, msg: None)

        if not self.figures:
            # Nothing to describe: skip the worker setup entirely.
            return self._assemble()

        @timeout(30, 3)
        def process(figure_idx, figure_binary):
            description_text = picture_vision_llm_chunk(
                binary=figure_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )
            return figure_idx, description_text

        futures = [
            shared_executor.submit(process, idx, img_binary)
            for idx, img_binary in enumerate(self.figures)
        ]

        for future in as_completed(futures):
            figure_num, txt = future.result()
            if txt:
                self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])

        self._assemble()

        return self.assembled