deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,1591 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import asyncio
18
+ import logging
19
+ import math
20
+ import os
21
+ import queue
22
+ import random
23
+ import re
24
+ import sys
25
+ import threading
26
+ from collections import Counter, defaultdict
27
+ from copy import deepcopy
28
+ from io import BytesIO
29
+ from timeit import default_timer as timer
30
+
31
+ import numpy as np
32
+ import pdfplumber
33
+ import xgboost as xgb
34
+ from PIL import Image
35
+ from pypdf import PdfReader as pdf2_read
36
+ from sklearn.cluster import KMeans
37
+ from sklearn.metrics import silhouette_score
38
+
39
+ from ..common.misc_utils import pip_install_torch
40
+ from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
41
+ from ..config import PdfModelConfig, TokenizerConfig
42
+ from ..depend.rag_tokenizer import RagTokenizer, is_chinese
43
+ from ..depend.prompts import vision_llm_describe_prompt
44
+ from ..common import settings
45
+
46
# A single threading.Lock is parked in ``sys.modules`` under a fixed key so the
# same lock object is found again if this module body is executed more than once.
# NOTE(review): presumably used to serialize pdfplumber calls - confirm at the call sites.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
try:
    sys.modules[LOCK_KEY_pdfplumber]
except KeyError:
    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
49
+
50
+
51
+ class RAGFlowPdfParser:
52
def __init__(
    self,
    model_cfg: PdfModelConfig | None = None,
    tokenizer_cfg: TokenizerConfig | None = None,
):
    """Create the tokenizer, OCR, layout, table-structure and XGBoost models.

    Args:
        model_cfg: Model configuration. When ``None`` it is built with
            ``PdfModelConfig.from_env()``.
        tokenizer_cfg: Tokenizer configuration. When ``None`` it is built
            with ``TokenizerConfig.from_env()``.

    Raises:
        RuntimeError: If the ``LAYOUT_RECOGNIZER_TYPE`` environment variable
            is neither ``onnx`` nor ``ascend`` (case-insensitive).
        ValueError: If the Ascend recognizer is selected but no Ascend model
            directory is resolved.
    """
    # Configs are optional: the env-based factories keep callers that only
    # set DEEPDOC_* environment variables working.
    self.model_cfg = PdfModelConfig.from_env() if model_cfg is None else model_cfg
    self.tokenizer_cfg = TokenizerConfig.from_env() if tokenizer_cfg is None else tokenizer_cfg
    model_cfg, tokenizer_cfg = self.model_cfg, self.tokenizer_cfg

    provider = model_cfg.normalized_provider()
    # The "local" provider is the only one treated as offline.
    model_offline = provider == "local"
    self.tokenizer = RagTokenizer(
        dict_prefix=tokenizer_cfg.resolve_dict_prefix(),
        offline=tokenizer_cfg.offline,
        nltk_data_dir=tokenizer_cfg.nltk_data_dir,
    )

    vision_model_dir = model_cfg.resolve_vision_model_dir()
    xgb_model_dir = model_cfg.resolve_xgb_model_dir()
    ascend_model_dir = model_cfg.resolve_ascend_model_dir()

    # Keyword arguments shared by every vision model loaded from vision_model_dir.
    vision_kwargs = {
        "model_dir": vision_model_dir,
        "model_home": model_cfg.model_home,
        "model_provider": provider,
        "offline": model_offline,
    }

    self.ocr = OCR(**vision_kwargs)

    # One semaphore per device, only when more than one device is configured.
    self.parallel_limiter = None
    if settings.PARALLEL_DEVICES > 1:
        self.parallel_limiter = [asyncio.Semaphore(1) for _ in range(settings.PARALLEL_DEVICES)]

    layout_recognizer_type = os.getenv("LAYOUT_RECOGNIZER_TYPE", "onnx").lower()
    if layout_recognizer_type not in ("onnx", "ascend"):
        raise RuntimeError("Unsupported layout recognizer type.")

    # An instance (e.g. a subclass) may carry ``model_speciess`` to pick a
    # specialised layout model; otherwise the generic "layout" domain is used.
    recognizer_domain = "layout"
    if hasattr(self, "model_speciess"):
        recognizer_domain = "layout." + self.model_speciess

    if layout_recognizer_type == "ascend":
        logging.debug("Using Ascend LayoutRecognizer")
        if not ascend_model_dir:
            raise ValueError("ascend_model_dir is required when LAYOUT_RECOGNIZER_TYPE=ascend")
        self.layouter = AscendLayoutRecognizer(recognizer_domain, model_dir=ascend_model_dir)
    else:  # onnx
        logging.debug("Using Onnx LayoutRecognizer")
        self.layouter = LayoutRecognizer(recognizer_domain, **vision_kwargs)

    self.tbl_det = TableStructureRecognizer(**vision_kwargs)

    self.updown_cnt_mdl = xgb.Booster()
    try:
        # Best effort: switch the booster to CUDA only when torch reports a GPU.
        pip_install_torch()
        import torch.cuda

        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
    except Exception:
        logging.info("No torch found.")

    self.updown_cnt_mdl.load_model(os.path.join(xgb_model_dir, "updown_concat_xgb.model"))

    self.page_from = 0
    self.column_num = 1
133
+
134
+ def __char_width(self, c):
135
+ return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)
136
+
137
+ def __height(self, c):
138
+ return c["bottom"] - c["top"]
139
+
140
+ def _x_dis(self, a, b):
141
+ return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]), abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
142
+
143
+ def _y_dis(self, a, b):
144
+ return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
145
+
146
+ def _match_proj(self, b):
147
+ proj_patt = [
148
+ r"第[零一二三四五六七八九十百]+章",
149
+ r"第[零一二三四五六七八九十百]+[条节]",
150
+ r"[零一二三四五六七八九十百]+[、是  ]",
151
+ r"[\((][零一二三四五六七八九十百]+[)\)]",
152
+ r"[\((][0-9]+[)\)]",
153
+ r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
154
+ r"[0-9]+\.[0-9.]+(、|\.[  ])",
155
+ r"[⚫•➢①② ]",
156
+ ]
157
+ return any([re.match(p, b["text"]) for p in proj_patt])
158
+
159
def _updown_concat_features(self, up, down):
    """Build the feature vector describing the pair (``up``, ``down``).

    The list is what ``self.updown_cnt_mdl`` is fed elsewhere in this class,
    so the number and order of entries must not change (one regex feature is
    intentionally present twice).

    Args:
        up: Upper box dict. Reads ``text``, ``layout_type``, ``page_number``,
            ``in_row``, the geometry keys, and optionally ``R``.
        down: Lower box dict with the same keys.

    Returns:
        A list of 32 booleans/numbers.
    """
    up_txt, down_txt = up["text"], down["text"]
    up_core, down_core = up_txt.strip(), down_txt.strip()
    up_h, down_h = self.__height(up), self.__height(down)

    w = max(self.__char_width(up), self.__char_width(down))
    h = max(up_h, down_h)
    y_dis = self._y_dis(up, down)

    # Only the LEN characters on either side of the seam are tokenized.
    LEN = 6
    tks_down = self.tokenizer.tokenize(down_txt[:LEN]).split()
    tks_up = self.tokenizer.tokenize(up_txt[-LEN:]).split()
    # Put a space at the seam when both seam characters are ASCII alphanumerics.
    glue = " " if re.match(r"[a-zA-Z0-9]+", up_txt[-1] + down_txt[0]) else ""
    tks_all = self.tokenizer.tokenize(up_txt[-LEN:].strip() + glue + down_txt[:LEN].strip()).split()

    same_tail = up_core[-2:] == down_core[-2:] if len(up_core) > 1 and len(down_core) > 1 else False
    open_paren_closed_below = bool(re.search(r"[\((][^\))]+$", up_txt) and re.search(r"[\))]", down_txt))

    return [
        up.get("R", -1) == down.get("R", -1),
        y_dis / h,
        down["page_number"] - up["page_number"],
        up["layout_type"] == down["layout_type"],
        up["layout_type"] == "text",
        down["layout_type"] == "text",
        up["layout_type"] == "table",
        down["layout_type"] == "table",
        bool(re.search(r"([。?!;!?;+))]|[a-z]\.)$", up_txt)),
        bool(re.search(r"[,:‘“、0-9(+-]$", up_txt)),
        bool(re.search(r"(^.?[/,?;:\],。;:’”?!》】)-])", down_txt)),
        bool(re.match(r"[\((][^\(\)()]+[)\)]$", up_txt)),
        bool(re.search(r"[,,][^。.]+$", up_txt)),
        # Same test as the previous entry; kept so the vector length is unchanged.
        bool(re.search(r"[,,][^。.]+$", up_txt)),
        open_paren_closed_below,
        self._match_proj(down),
        bool(re.match(r"[A-Z]", down_txt)),
        bool(re.match(r"[A-Z]", up_txt[-1])),
        bool(re.match(r"[a-z0-9]", up_txt[-1])),
        bool(re.match(r"[0-9.%,-]+$", down_txt)),
        same_tail,
        up["x0"] > down["x1"],
        abs(up_h - down_h) / min(up_h, down_h),
        self._x_dis(up, down) / max(w, 0.000001),
        (len(up_txt) - len(down_txt)) / max(len(up_txt), len(down_txt)),
        len(tks_all) - len(tks_up) - len(tks_down),
        len(tks_down) - len(tks_up),
        tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
        max(down["in_row"], up["in_row"]),
        abs(down["in_row"] - up["in_row"]),
        len(tks_down) == 1 and self.tokenizer.tag(tks_down[0]).find("n") >= 0,
        len(tks_up) == 1 and self.tokenizer.tag(tks_up[0]).find("n") >= 0,
    ]
203
+
204
+ @staticmethod
205
+ def sort_X_by_page(arr, threshold):
206
+ # sort using y1 first and then x1
207
+ arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
208
+ for i in range(len(arr) - 1):
209
+ for j in range(i, -1, -1):
210
+ # restore the order using th
211
+ if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold and arr[j + 1]["top"] < arr[j]["top"] and arr[j + 1]["page_number"] == arr[j]["page_number"]:
212
+ tmp = arr[j]
213
+ arr[j] = arr[j + 1]
214
+ arr[j + 1] = tmp
215
+ return arr
216
+
217
+ def _has_color(self, o):
218
+ if o.get("ncs", "") == "DeviceGray":
219
+ if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and o["non_stroking_color"][0] == 1:
220
+ if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
221
+ return False
222
+ return True
223
+
224
def _table_transformer_job(self, ZM):
    """Run table-structure recognition and tag table boxes with row/column info.

    Every layout of type ``table`` in ``self.page_layout`` is cropped from
    its page image (with a margin), sent to ``self.tbl_det`` and the returned
    components are stored in ``self.tb_cpns`` in page-cumulative coordinates.
    Boxes in ``self.boxes`` whose ``layout_type`` is ``table`` then receive
    ``R``/``H``/``C``/``SP`` indices plus the matching edge coordinates.

    Args:
        ZM: Zoom factor between box coordinates and page-image pixels.
    """
    logging.debug("Table processing...")
    MARGIN = 10
    crops, origins = [], []
    tables_per_page = [0]
    self.tb_cpns = []
    assert len(self.page_layout) == len(self.page_images)
    for page_idx, layouts in enumerate(self.page_layout):  # for page
        page_tables = [f for f in layouts if f["type"] == "table"]
        tables_per_page.append(len(page_tables))
        for tb in page_tables:  # for table
            # Grow the crop by MARGIN on every side, then scale to image pixels.
            left = (tb["x0"] - MARGIN) * ZM
            top = (tb["top"] - MARGIN) * ZM
            right = (tb["x1"] + MARGIN) * ZM
            bott = (tb["bottom"] + MARGIN) * ZM
            origins.append((left, top))
            crops.append(self.page_images[page_idx].crop((left, top, right, bott)))

    assert len(self.page_images) == len(tables_per_page) - 1
    if not crops:
        return
    recos = self.tbl_det(crops)
    # bounds[i]:bounds[i + 1] is the slice of crops that belongs to page i.
    bounds = np.cumsum(tables_per_page)
    for i in range(len(bounds) - 1):  # for page
        page_items = []
        page_origins = origins[bounds[i] : bounds[i + 1]]
        for j, tb_items in enumerate(recos[bounds[i] : bounds[i + 1]]):  # for table
            off_x, off_y = page_origins[j][0], page_origins[j][1]
            for it in tb_items:  # for table components
                # Crop-relative pixels -> page pixels -> box units.
                it["x0"] = it["x0"] + off_x
                it["x1"] = it["x1"] + off_x
                it["top"] = it["top"] + off_y
                it["bottom"] = it["bottom"] + off_y
                for n in ("x0", "x1", "top", "bottom"):
                    it[n] /= ZM
                it["top"] += self.page_cum_height[i]
                it["bottom"] += self.page_cum_height[i]
                it["pn"] = i
                it["layoutno"] = j
                page_items.append(it)
        self.tb_cpns.extend(page_items)

    def gather(kwd, fzy=10, ption=0.6):
        # Components whose label matches ``kwd``, sorted, cleaned up against
        # the text boxes, and sorted again.
        eles = Recognizer.sort_Y_firstly([r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
        eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
        return Recognizer.sort_Y_firstly(eles, 0)

    # add R,H,C,SP tag to boxes within table layout
    headers = gather(r".*header$")
    rows = gather(r".* (row|header)")
    spans = gather(r".*spanning")
    clmns = sorted(
        [r for r in self.tb_cpns if re.match(r"table column$", r["label"])],
        key=lambda x: (x["pn"], x["layoutno"], x["x0"]),
    )
    clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
    for b in self.boxes:
        if b.get("layout_type", "") != "table":
            continue

        row_idx = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
        if row_idx is not None:
            b["R"] = row_idx
            b["R_top"] = rows[row_idx]["top"]
            b["R_bott"] = rows[row_idx]["bottom"]

        head_idx = Recognizer.find_overlapped_with_threshold(b, headers, thr=0.3)
        if head_idx is not None:
            b["H_top"] = headers[head_idx]["top"]
            b["H_bott"] = headers[head_idx]["bottom"]
            b["H_left"] = headers[head_idx]["x0"]
            b["H_right"] = headers[head_idx]["x1"]
            b["H"] = head_idx

        col_idx = Recognizer.find_horizontally_tightest_fit(b, clmns)
        if col_idx is not None:
            b["C"] = col_idx
            b["C_left"] = clmns[col_idx]["x0"]
            b["C_right"] = clmns[col_idx]["x1"]

        span_idx = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
        if span_idx is not None:
            # A spanning cell overwrites the H_* edges set from a header above.
            b["H_top"] = spans[span_idx]["top"]
            b["H_bott"] = spans[span_idx]["bottom"]
            b["H_left"] = spans[span_idx]["x0"]
            b["H_right"] = spans[span_idx]["x1"]
            b["SP"] = span_idx
309
+
310
def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
    """Detect text boxes on one page, fill them from embedded chars, OCR the rest.

    The list of boxes for this page is appended to ``self.boxes`` (an empty
    list when the detector finds nothing). Characters that overlap no box, or
    whose height differs too much from their box, go to ``self.lefted_chars``.

    Args:
        pagenum: Page number; ``pagenum - 1`` indexes ``self.mean_height``.
        img: Page image, converted with ``np.array``.
        chars: Character dicts with ``top``, ``bottom``, ``height`` and ``text``.
        ZM: Zoom factor; detector coordinates are divided by it.
        device_id: Forwarded to ``self.ocr.detect`` and ``recognize_batch``.
    """
    start = timer()
    detected = self.ocr.detect(np.array(img), device_id)
    logging.info(f"__ocr detecting boxes of a image cost ({timer() - start}s)")

    start = timer()
    if not detected:
        self.boxes.append([])
        return

    quads = [(line[0], line[1][0]) for line in detected]
    candidates = []
    for quad, txt in quads:
        # Keep only quads whose first corner is not right of the second and
        # not below the last one.
        if quad[0][0] <= quad[1][0] and quad[0][1] <= quad[-1][1]:
            candidates.append(
                {
                    "x0": quad[0][0] / ZM,
                    "x1": quad[1][0] / ZM,
                    "top": quad[0][1] / ZM,
                    "text": "",
                    "txt": txt,
                    "bottom": quad[-1][1] / ZM,
                    "chars": [],
                    "page_number": pagenum,
                }
            )
    bxs = Recognizer.sort_Y_firstly(candidates, self.mean_height[pagenum - 1] / 3)

    # merge chars in the same rect
    for c in chars:
        hit = Recognizer.find_overlapped(c, bxs)
        if hit is None:
            self.lefted_chars.append(c)
            continue
        ch = c["bottom"] - c["top"]
        bh = bxs[hit]["bottom"] - bxs[hit]["top"]
        # A non-space char much taller or shorter than its box is set aside.
        if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != " ":
            self.lefted_chars.append(c)
            continue
        bxs[hit]["chars"].append(c)

    for b in bxs:
        if not b["chars"]:
            del b["chars"]
            continue
        m_ht = np.mean([c["height"] for c in b["chars"]])
        for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
            if c["text"] == " " and b["text"]:
                # A space is kept only after Latin/Cyrillic letters, digits or punctuation.
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
                    b["text"] += " "
            else:
                b["text"] += c["text"]
        del b["chars"]

    logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
    start = timer()
    boxes_to_reg = []
    img_np = np.array(img)
    for b in bxs:
        if not b["text"]:
            # No embedded text landed in this box: crop it for recognition.
            left, right = b["x0"] * ZM, b["x1"] * ZM
            top, bott = b["top"] * ZM, b["bottom"] * ZM
            corners = np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)
            b["box_image"] = self.ocr.get_rotate_crop_image(img_np, corners)
            boxes_to_reg.append(b)
        del b["txt"]
    texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg], device_id)
    for idx, b in enumerate(boxes_to_reg):
        b["text"] = texts[idx]
        del b["box_image"]
    logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")

    bxs = [b for b in bxs if b["text"]]
    if self.mean_height[pagenum - 1] == 0:
        self.mean_height[pagenum - 1] = np.median([b["bottom"] - b["top"] for b in bxs])
    self.boxes.append(bxs)
374
+
375
+ def _layouts_rec(self, ZM, drop=True):
376
+ assert len(self.page_images) == len(self.boxes)
377
+ self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop)
378
+ # cumlative Y
379
+ for i in range(len(self.boxes)):
380
+ self.boxes[i]["top"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
381
+ self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
382
+
383
+ def _assign_column(self, boxes, zoomin=3):
384
+ if not boxes:
385
+ return boxes
386
+ if all("col_id" in b for b in boxes):
387
+ return boxes
388
+
389
+ by_page = defaultdict(list)
390
+ for b in boxes:
391
+ by_page[b["page_number"]].append(b)
392
+
393
+ page_cols = {}
394
+
395
+ for pg, bxs in by_page.items():
396
+ if not bxs:
397
+ page_cols[pg] = 1
398
+ continue
399
+
400
+ x0s_raw = np.array([b["x0"] for b in bxs], dtype=float)
401
+
402
+ min_x0 = np.min(x0s_raw)
403
+ max_x1 = np.max([b["x1"] for b in bxs])
404
+ width = max_x1 - min_x0
405
+
406
+ INDENT_TOL = width * 0.12
407
+ x0s = []
408
+ for x in x0s_raw:
409
+ if abs(x - min_x0) < INDENT_TOL:
410
+ x0s.append([min_x0])
411
+ else:
412
+ x0s.append([x])
413
+ x0s = np.array(x0s, dtype=float)
414
+
415
+ max_try = min(4, len(bxs))
416
+ if max_try < 2:
417
+ max_try = 1
418
+ best_k = 1
419
+ best_score = -1
420
+
421
+ for k in range(1, max_try + 1):
422
+ km = KMeans(n_clusters=k, n_init="auto")
423
+ labels = km.fit_predict(x0s)
424
+
425
+ centers = np.sort(km.cluster_centers_.flatten())
426
+ if len(centers) > 1:
427
+ try:
428
+ score = silhouette_score(x0s, labels)
429
+ except ValueError:
430
+ continue
431
+ else:
432
+ score = 0
433
+ if score > best_score:
434
+ best_score = score
435
+ best_k = k
436
+
437
+ page_cols[pg] = best_k
438
+ logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}")
439
+
440
+
441
+ global_cols = Counter(page_cols.values()).most_common(1)[0][0]
442
+ logging.info(f"Global column_num decided by majority: {global_cols}")
443
+
444
+
445
+ for pg, bxs in by_page.items():
446
+ if not bxs:
447
+ continue
448
+ k = page_cols[pg]
449
+ if len(bxs) < k:
450
+ k = 1
451
+ x0s = np.array([[b["x0"]] for b in bxs], dtype=float)
452
+ km = KMeans(n_clusters=k, n_init="auto")
453
+ labels = km.fit_predict(x0s)
454
+
455
+ centers = km.cluster_centers_.flatten()
456
+ order = np.argsort(centers)
457
+
458
+ remap = {orig: new for new, orig in enumerate(order)}
459
+
460
+ for b, lb in zip(bxs, labels):
461
+ b["col_id"] = remap[lb]
462
+
463
+ grouped = defaultdict(list)
464
+ for b in bxs:
465
+ grouped[b["col_id"]].append(b)
466
+
467
+ return boxes
468
+
469
+ def _text_merge(self, zoomin=3):
470
+ # merge adjusted boxes
471
+ bxs = self._assign_column(self.boxes, zoomin)
472
+
473
+ def end_with(b, txt):
474
+ txt = txt.strip()
475
+ tt = b.get("text", "").strip()
476
+ return tt and tt.find(txt) == len(tt) - len(txt)
477
+
478
+ def start_with(b, txts):
479
+ tt = b.get("text", "").strip()
480
+ return tt and any([tt.find(t.strip()) == 0 for t in txts])
481
+
482
+ # horizontally merge adjacent box with the same layout
483
+ i = 0
484
+ while i < len(bxs) - 1:
485
+ b = bxs[i]
486
+ b_ = bxs[i + 1]
487
+
488
+ if b["page_number"] != b_["page_number"] or b.get("col_id") != b_.get("col_id"):
489
+ i += 1
490
+ continue
491
+
492
+ if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
493
+ i += 1
494
+ continue
495
+
496
+ if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
497
+ # merge
498
+ bxs[i]["x1"] = b_["x1"]
499
+ bxs[i]["top"] = (b["top"] + b_["top"]) / 2
500
+ bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
501
+ bxs[i]["text"] += b_["text"]
502
+ bxs.pop(i + 1)
503
+ continue
504
+ i += 1
505
+ self.boxes = bxs
506
+
507
+ def _naive_vertical_merge(self, zoomin=3):
508
+ bxs = self._assign_column(self.boxes, zoomin)
509
+
510
+ grouped = defaultdict(list)
511
+ for b in bxs:
512
+ grouped[(b["page_number"], b.get("col_id", 0))].append(b)
513
+
514
+ merged_boxes = []
515
+ for (pg, col), bxs in grouped.items():
516
+ bxs = sorted(bxs, key=lambda x: (x["top"], x["x0"]))
517
+ if not bxs:
518
+ continue
519
+
520
+ mh = self.mean_height[pg - 1] if self.mean_height else np.median([b["bottom"] - b["top"] for b in bxs]) or 10
521
+
522
+ i = 0
523
+ while i + 1 < len(bxs):
524
+ b = bxs[i]
525
+ b_ = bxs[i + 1]
526
+
527
+ if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
528
+ bxs.pop(i)
529
+ continue
530
+
531
+ if not b["text"].strip():
532
+ bxs.pop(i)
533
+ continue
534
+
535
+ if not b["text"].strip() or b.get("layoutno") != b_.get("layoutno"):
536
+ i += 1
537
+ continue
538
+
539
+ if b_["top"] - b["bottom"] > mh * 1.5:
540
+ i += 1
541
+ continue
542
+
543
+ overlap = max(0, min(b["x1"], b_["x1"]) - max(b["x0"], b_["x0"]))
544
+ if overlap / max(1, min(b["x1"] - b["x0"], b_["x1"] - b_["x0"])) < 0.3:
545
+ i += 1
546
+ continue
547
+
548
+ concatting_feats = [
549
+ b["text"].strip()[-1] in ",;:'\",、‘“;:-",
550
+ len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
551
+ b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
552
+ ]
553
+ # features for not concating
554
+ feats = [
555
+ b.get("layoutno", 0) != b_.get("layoutno", 0),
556
+ b["text"].strip()[-1] in "。?!?",
557
+ self.is_english and b["text"].strip()[-1] in ".!?",
558
+ b["page_number"] == b_["page_number"] and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
559
+ b["page_number"] < b_["page_number"] and abs(b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
560
+ ]
561
+ # split features
562
+ detach_feats = [b["x1"] < b_["x0"], b["x0"] > b_["x1"]]
563
+ if (any(feats) and not any(concatting_feats)) or any(detach_feats):
564
+ logging.debug(
565
+ "{} {} {} {}".format(
566
+ b["text"],
567
+ b_["text"],
568
+ any(feats),
569
+ any(concatting_feats),
570
+ )
571
+ )
572
+ i += 1
573
+ continue
574
+
575
+ b["text"] = (b["text"].rstrip() + " " + b_["text"].lstrip()).strip()
576
+ b["bottom"] = b_["bottom"]
577
+ b["x0"] = min(b["x0"], b_["x0"])
578
+ b["x1"] = max(b["x1"], b_["x1"])
579
+ bxs.pop(i + 1)
580
+
581
+ merged_boxes.extend(bxs)
582
+
583
+ self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))
584
+
585
+ def _final_reading_order_merge(self, zoomin=3):
586
+ if not self.boxes:
587
+ return
588
+
589
+ self.boxes = self._assign_column(self.boxes, zoomin=zoomin)
590
+
591
+ pages = defaultdict(lambda: defaultdict(list))
592
+ for b in self.boxes:
593
+ pg = b["page_number"]
594
+ col = b.get("col_id", 0)
595
+ pages[pg][col].append(b)
596
+
597
+ for pg in pages:
598
+ for col in pages[pg]:
599
+ pages[pg][col].sort(key=lambda x: (x["top"], x["x0"]))
600
+
601
+ new_boxes = []
602
+ for pg in sorted(pages.keys()):
603
+ for col in sorted(pages[pg].keys()):
604
+ new_boxes.extend(pages[pg][col])
605
+
606
+ self.boxes = new_boxes
607
+
608
def _concat_downward(self, concat_between_pages=True):
    """Re-sort ``self.boxes`` with ``Recognizer.sort_Y_firstly`` (threshold 0).

    This method previously carried a model-driven downward concatenation
    (row counting, a DFS over candidate boxes scored by
    ``self.updown_cnt_mdl``, and a block merge) placed after an unconditional
    ``return``. That code could never execute and has been removed; the
    observable behaviour is unchanged.

    Args:
        concat_between_pages: Unused; kept so existing callers keep working.
    """
    self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
711
+
712
+ def _filter_forpages(self):
713
+ if not self.boxes:
714
+ return
715
+ findit = False
716
+ i = 0
717
+ while i < len(self.boxes):
718
+ if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
719
+ i += 1
720
+ continue
721
+ findit = True
722
+ eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
723
+ self.boxes.pop(i)
724
+ if i >= len(self.boxes):
725
+ break
726
+ prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split()[:2])
727
+ while not prefix:
728
+ self.boxes.pop(i)
729
+ if i >= len(self.boxes):
730
+ break
731
+ prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split()[:2])
732
+ self.boxes.pop(i)
733
+ if i >= len(self.boxes) or not prefix:
734
+ break
735
+ for j in range(i, min(i + 128, len(self.boxes))):
736
+ if not re.match(prefix, self.boxes[j]["text"]):
737
+ continue
738
+ for k in range(i, j):
739
+ self.boxes.pop(i)
740
+ break
741
+ if findit:
742
+ return
743
+
744
+ page_dirty = [0] * len(self.page_images)
745
+ for b in self.boxes:
746
+ if re.search(r"(··|··|··)", b["text"]):
747
+ page_dirty[b["page_number"] - 1] += 1
748
+ page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
749
+ if not page_dirty:
750
+ return
751
+ i = 0
752
+ while i < len(self.boxes):
753
+ if self.boxes[i]["page_number"] in page_dirty:
754
+ self.boxes.pop(i)
755
+ continue
756
+ i += 1
757
+
758
+ def _merge_with_same_bullet(self):
759
+ i = 0
760
+ while i + 1 < len(self.boxes):
761
+ b = self.boxes[i]
762
+ b_ = self.boxes[i + 1]
763
+ if not b["text"].strip():
764
+ self.boxes.pop(i)
765
+ continue
766
+ if not b_["text"].strip():
767
+ self.boxes.pop(i + 1)
768
+ continue
769
+
770
+ if (
771
+ b["text"].strip()[0] != b_["text"].strip()[0]
772
+ or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm")
773
+ or is_chinese(b["text"].strip()[0])
774
+ or b["top"] > b_["bottom"]
775
+ ):
776
+ i += 1
777
+ continue
778
+ b_["text"] = b["text"] + "\n" + b_["text"]
779
+ b_["x0"] = min(b["x0"], b_["x0"])
780
+ b_["x1"] = max(b["x1"], b_["x1"])
781
+ b_["top"] = b["top"]
782
+ self.boxes.pop(i)
783
+
784
+ def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False):
785
+ tables = {}
786
+ figures = {}
787
+ # extract figure and table boxes
788
+ i = 0
789
+ lst_lout_no = ""
790
+ nomerge_lout_no = []
791
+ while i < len(self.boxes):
792
+ if "layoutno" not in self.boxes[i]:
793
+ i += 1
794
+ continue
795
+ lout_no = str(self.boxes[i]["page_number"]) + "-" + str(self.boxes[i]["layoutno"])
796
+ if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", "figure caption", "reference"]:
797
+ nomerge_lout_no.append(lst_lout_no)
798
+ if self.boxes[i]["layout_type"] == "table":
799
+ if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
800
+ self.boxes.pop(i)
801
+ continue
802
+ if lout_no not in tables:
803
+ tables[lout_no] = []
804
+ tables[lout_no].append(self.boxes[i])
805
+ self.boxes.pop(i)
806
+ lst_lout_no = lout_no
807
+ continue
808
+ if need_image and self.boxes[i]["layout_type"] == "figure":
809
+ if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
810
+ self.boxes.pop(i)
811
+ continue
812
+ if lout_no not in figures:
813
+ figures[lout_no] = []
814
+ figures[lout_no].append(self.boxes[i])
815
+ self.boxes.pop(i)
816
+ lst_lout_no = lout_no
817
+ continue
818
+ i += 1
819
+
820
+ # merge table on different pages
821
+ nomerge_lout_no = set(nomerge_lout_no)
822
+ tbls = sorted([(k, bxs) for k, bxs in tables.items()], key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
823
+
824
+ i = len(tbls) - 1
825
+ while i - 1 >= 0:
826
+ k0, bxs0 = tbls[i - 1]
827
+ k, bxs = tbls[i]
828
+ i -= 1
829
+ if k0 in nomerge_lout_no:
830
+ continue
831
+ if bxs[0]["page_number"] == bxs0[0]["page_number"]:
832
+ continue
833
+ if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
834
+ continue
835
+ mh = self.mean_height[bxs[0]["page_number"] - 1]
836
+ if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
837
+ continue
838
+ tables[k0].extend(tables[k])
839
+ del tables[k]
840
+
841
+ def x_overlapped(a, b):
842
+ return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
843
+
844
+ # find captions and pop out
845
+ i = 0
846
+ while i < len(self.boxes):
847
+ c = self.boxes[i]
848
+ # mh = self.mean_height[c["page_number"]-1]
849
+ if not TableStructureRecognizer.is_caption(c):
850
+ i += 1
851
+ continue
852
+
853
+ # find the nearest layouts
854
+ def nearest(tbls):
855
+ nonlocal c
856
+ mink = ""
857
+ minv = 1000000000
858
+ for k, bxs in tbls.items():
859
+ for b in bxs:
860
+ if b.get("layout_type", "").find("caption") >= 0:
861
+ continue
862
+ y_dis = self._y_dis(c, b)
863
+ x_dis = self._x_dis(c, b) if not x_overlapped(c, b) else 0
864
+ dis = y_dis * y_dis + x_dis * x_dis
865
+ if dis < minv:
866
+ mink = k
867
+ minv = dis
868
+ return mink, minv
869
+
870
+ tk, tv = nearest(tables)
871
+ fk, fv = nearest(figures)
872
+ # if min(tv, fv) > 2000:
873
+ # i += 1
874
+ # continue
875
+ if tv < fv and tk:
876
+ tables[tk].insert(0, c)
877
+ logging.debug("TABLE:" + self.boxes[i]["text"] + "; Cap: " + tk)
878
+ elif fk:
879
+ figures[fk].insert(0, c)
880
+ logging.debug("FIGURE:" + self.boxes[i]["text"] + "; Cap: " + tk)
881
+ self.boxes.pop(i)
882
+
883
+ def cropout(bxs, ltype, poss):
884
+ nonlocal ZM
885
+ pn = set([b["page_number"] - 1 for b in bxs])
886
+ if len(pn) < 2:
887
+ pn = list(pn)[0]
888
+ ht = self.page_cum_height[pn]
889
+ b = {"x0": np.min([b["x0"] for b in bxs]), "top": np.min([b["top"] for b in bxs]) - ht, "x1": np.max([b["x1"] for b in bxs]), "bottom": np.max([b["bottom"] for b in bxs]) - ht}
890
+ louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
891
+ ii = Recognizer.find_overlapped(b, louts, naive=True)
892
+ if ii is not None:
893
+ b = louts[ii]
894
+ else:
895
+ logging.warning(f"Missing layout match: {pn + 1},%s" % (bxs[0].get("layoutno", "")))
896
+
897
+ left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
898
+ if right < left:
899
+ right = left + 1
900
+
901
+ # 确保裁剪坐标在图片边界内
902
+ img_width, img_height = self.page_images[pn].size
903
+ crop_left = max(0, int(left * ZM))
904
+ crop_top = max(0, int(top * ZM))
905
+ crop_right = min(img_width, max(crop_left + 1, int(right * ZM)))
906
+ crop_bottom = min(img_height, max(crop_top + 1, int(bott * ZM)))
907
+
908
+ poss.append((pn + self.page_from, left, right, top, bott))
909
+
910
+ try:
911
+ return self.page_images[pn].crop((crop_left, crop_top, crop_right, crop_bottom))
912
+ except Exception as e:
913
+ logging.warning(f"Failed to crop image: {e}")
914
+ return None
915
+ pn = {}
916
+ for b in bxs:
917
+ p = b["page_number"] - 1
918
+ if p not in pn:
919
+ pn[p] = []
920
+ pn[p].append(b)
921
+ pn = sorted(pn.items(), key=lambda x: x[0])
922
+ imgs = [cropout(arr, ltype, poss) for p, arr in pn]
923
+ pic = Image.new("RGB", (int(np.max([i.size[0] for i in imgs])), int(np.sum([m.size[1] for m in imgs]))), (245, 245, 245))
924
+ height = 0
925
+ for img in imgs:
926
+ pic.paste(img, (0, int(height)))
927
+ height += img.size[1]
928
+ return pic
929
+
930
+ res = []
931
+ positions = []
932
+ figure_results = []
933
+ figure_positions = []
934
+ # counter for figures by page
935
+ figure_counter_by_page = {}
936
+ # crop figure out and add caption
937
+ for k, bxs in figures.items():
938
+ txt = "\n".join([b["text"] for b in bxs])
939
+ # 如果文本为空,使用默认描述,但仍然处理图片
940
+ if not txt:
941
+ # 使用页码和序号生成唯一标识
942
+ page_num = bxs[0]["page_number"]
943
+ if page_num not in figure_counter_by_page:
944
+ figure_counter_by_page[page_num] = 0
945
+ figure_counter_by_page[page_num] += 1
946
+ txt = f"Figure-P{page_num}-{figure_counter_by_page[page_num]}"
947
+
948
+ poss = []
949
+
950
+ cropped_img = cropout(bxs, "figure", poss)
951
+ if cropped_img is not None: # 只添加成功裁剪的图片
952
+ if separate_tables_figures:
953
+ figure_results.append((cropped_img, [txt]))
954
+ figure_positions.append(poss)
955
+ else:
956
+ res.append((cropped_img, [txt]))
957
+ positions.append(poss)
958
+
959
+ for k, bxs in tables.items():
960
+ if not bxs:
961
+ continue
962
+ bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))
963
+
964
+ poss = []
965
+
966
+ res.append((cropout(bxs, "table", poss), self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
967
+ positions.append(poss)
968
+
969
+ if separate_tables_figures:
970
+ assert len(positions) + len(figure_positions) == len(res) + len(figure_results)
971
+ if need_position:
972
+ return list(zip(res, positions)), list(zip(figure_results, figure_positions))
973
+ else:
974
+ return res, figure_results
975
+ else:
976
+ assert len(positions) == len(res)
977
+ if need_position:
978
+ return list(zip(res, positions))
979
+ else:
980
+ return res
981
+
982
def proj_match(self, line):
    """Classify ``line`` as a heading/bullet style and return its level.

    Returns:
        * ``None`` when ``line`` has at most two characters or matches no
          rule;
        * ``False`` when ``line`` consists only of digits, spaces and the
          punctuation ``().,%+/-`` (a bare number, not a heading);
        * otherwise the integer tag (1-12) of the first rule whose pattern
          matches at the start of ``line``.
    """
    if len(line) <= 2:
        return
    if re.match(r"[0-9 ().,%%+/-]+$", line):
        return False
    # NOTE: several character classes in the source listed the same ASCII
    # character twice (e.g. "[::??]"), which suggests the full-width twin was
    # lost in transcription. The twins are restored through \u escapes
    # (U+FF08/U+FF09 parentheses, U+FF1A colon, U+FF1F question mark,
    # U+3000 ideographic space); ASCII input matches exactly as before.
    for p, j in [
        (r"第[零一二三四五六七八九十百]+章", 1),
        (r"第[零一二三四五六七八九十百]+[条节]", 2),
        (r"[零一二三四五六七八九十百]+[、 \u3000]", 3),
        (r"[\(\uff08][零一二三四五六七八九十百]+[\uff09\)]", 4),
        (r"[0-9]+(、|\.[ \u3000]|\.[^0-9])", 5),
        (r"[0-9]+\.[0-9]+(、|[. \u3000]|[^0-9])", 6),
        (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ \u3000]|[^0-9])", 7),
        (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ \u3000]|[^0-9])", 8),
        (r".{,48}[:\uff1a?\uff1f]$", 9),
        # Fixed: the bare ")" made this pattern an invalid regex
        # ("unbalanced parenthesis"), so re.match raised for every line that
        # reached this rule.
        (r"[0-9]+[)\uff09]", 10),
        (r"[\(\uff08][0-9]+[\uff09\)]", 11),
        (r"[零一二三四五六七八九十百]+是", 12),
        (r"[⚫•➢✓]", 12),
    ]:
        if re.match(p, line):
            return j
    return
1005
+
1006
+ def _line_tag(self, bx, ZM):
1007
+ pn = [bx["page_number"]]
1008
+ top = bx["top"] - self.page_cum_height[pn[0] - 1]
1009
+ bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
1010
+ page_images_cnt = len(self.page_images)
1011
+ if pn[-1] - 1 >= page_images_cnt:
1012
+ return ""
1013
+ while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
1014
+ bott -= self.page_images[pn[-1] - 1].size[1] / ZM
1015
+ pn.append(pn[-1] + 1)
1016
+ if pn[-1] - 1 >= page_images_cnt:
1017
+ return ""
1018
+
1019
+ return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), bx["x0"], bx["x1"], top, bott)
1020
+
1021
+ def __filterout_scraps(self, boxes, ZM):
1022
+ def width(b):
1023
+ return b["x1"] - b["x0"]
1024
+
1025
+ def height(b):
1026
+ return b["bottom"] - b["top"]
1027
+
1028
+ def usefull(b):
1029
+ if b.get("layout_type"):
1030
+ return True
1031
+ if width(b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
1032
+ return True
1033
+ if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
1034
+ return True
1035
+ return False
1036
+
1037
+ res = []
1038
+ while boxes:
1039
+ lines = []
1040
+ widths = []
1041
+ pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
1042
+ mh = self.mean_height[boxes[0]["page_number"] - 1]
1043
+ mj = self.proj_match(boxes[0]["text"]) or boxes[0].get("layout_type", "") == "title"
1044
+
1045
+ def dfs(line, st):
1046
+ nonlocal mh, pw, lines, widths
1047
+ lines.append(line)
1048
+ widths.append(width(line))
1049
+ mmj = self.proj_match(line["text"]) or line.get("layout_type", "") == "title"
1050
+ for i in range(st + 1, min(st + 20, len(boxes))):
1051
+ if (boxes[i]["page_number"] - line["page_number"]) > 0:
1052
+ break
1053
+ if not mmj and self._y_dis(line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
1054
+ break
1055
+
1056
+ if not usefull(boxes[i]):
1057
+ continue
1058
+ if mmj or (self._x_dis(boxes[i], line) < pw / 10):
1059
+ # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
1060
+ # concat following
1061
+ dfs(boxes[i], i)
1062
+ boxes.pop(i)
1063
+ break
1064
+
1065
+ try:
1066
+ if usefull(boxes[0]):
1067
+ dfs(boxes[0], 0)
1068
+ else:
1069
+ logging.debug("WASTE: " + boxes[0]["text"])
1070
+ except Exception:
1071
+ pass
1072
+ boxes.pop(0)
1073
+ mw = np.mean(widths)
1074
+ if mj or mw / pw >= 0.35 or mw > 200:
1075
+ res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
1076
+ else:
1077
+ logging.debug("REMOVED: " + "<<".join([c["text"] for c in lines]))
1078
+
1079
+ return "\n\n".join(res)
1080
+
1081
@staticmethod
def total_page_number(fnm, binary=None):
    """Return the number of pages of a PDF, or ``None`` on failure.

    Args:
        fnm: path handed to ``pdfplumber.open`` when ``binary`` is falsy.
        binary: optional raw PDF bytes; when given they are opened instead
            of ``fnm``.

    Any exception is logged via ``logging.exception`` and swallowed, in
    which case ``None`` is returned.
    """
    try:
        # pdfplumber access is serialised through the lock object stored in
        # sys.modules under LOCK_KEY_pdfplumber.
        with sys.modules[LOCK_KEY_pdfplumber]:
            # Context manager guarantees the document is closed even when
            # reading ``pdf.pages`` raises (the old explicit close() was
            # skipped in that case).
            with (pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))) as pdf:
                return len(pdf.pages)
    except Exception:
        logging.exception("total_page_number")
    return None
1091
+
1092
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
    """Render pages to images, collect characters/outlines and run OCR.

    Resets the per-document state on ``self`` (``boxes``, ``mean_height``,
    ``mean_width``, ``page_cum_height``, ``page_layout``, ...), then:

    1. renders pages ``[page_from:page_to]`` with pdfplumber at
       ``72 * zoomin`` dpi into ``self.page_images`` and collects their
       de-duplicated characters into ``self.page_chars``;
    2. reads the document outline into ``self.outlines`` as
       ``(title, depth)`` pairs;
    3. guesses ``self.is_english`` from a random sample of characters;
    4. runs ``self.__ocr`` for every page, concurrently when
       ``self.parallel_limiter`` is set;
    5. turns ``self.page_cum_height`` into a cumulative sum;
    6. if no boxes were produced and ``zoomin < 9``, retries with a three
       times larger zoom.

    Args:
        fnm: file path (``str``) or raw PDF bytes.
        zoomin: rendering scale factor.
        page_from, page_to: slice bounds applied to the document's pages.
        callback: optional progress callable; invoked with a single float
            after every sixth page.
    """
    self.lefted_chars = []
    self.mean_height = []
    self.mean_width = []
    self.boxes = []
    self.garbages = {}
    self.page_cum_height = [0]
    self.page_layout = []
    self.page_from = page_from
    start = timer()
    try:
        with sys.modules[LOCK_KEY_pdfplumber]:
            with (pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))) as pdf:
                self.pdf = pdf
                self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])]

                try:
                    self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
                except Exception as e:
                    logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                    self.page_chars = [[] for _ in range(page_to - page_from)]  # If failed to extract, using empty list instead.

                self.total_page = len(self.pdf.pages)

    except Exception:
        logging.exception("RAGFlowPdfParser __images__")
    logging.info(f"__images__ dedupe_chars cost {timer() - start}s")

    self.outlines = []
    try:
        with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf:
            self.pdf = pdf

            outlines = self.pdf.outline

            def dfs(arr, depth):
                # Outline entries are dicts; nested lists are sub-levels.
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)

    except Exception as e:
        logging.warning(f"Outlines exception: {e}")

    if not self.outlines:
        logging.warning("Miss outlines")

    logging.debug("Images converted.")
    # Per page: does a random sample of up to 100 characters contain a run
    # of 30+ ASCII-ish characters?
    self.is_english = [
        re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
        for i in range(len(self.page_chars))
    ]
    if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
        self.is_english = True
    else:
        self.is_english = False

    async def ocr_page(i, device_id, img, chars, limiter):
        """OCR page ``i``; ``limiter`` (if any) bounds concurrency."""
        # Re-insert a space between alphanumeric characters separated by a
        # gap of at least half a character width.
        j = 0
        while j + 1 < len(chars):
            if (
                chars[j]["text"]
                and chars[j + 1]["text"]
                and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"])
                and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"], chars[j]["width"]) / 2
            ):
                chars[j]["text"] += " "
            j += 1

        if limiter:
            async with limiter:
                await asyncio.to_thread(self.__ocr, i + 1, img, chars, zoomin, device_id)
        else:
            self.__ocr(i + 1, img, chars, zoomin, device_id)

        if callback and i % 6 == 5:
            callback((i + 1) * 0.6 / len(self.page_images))

    async def ocr_all_pages():
        def prepare_page():
            # Reads the enclosing loop's current ``i`` and ``img``.
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(np.median(sorted([c["height"] for c in chars])) if chars else 0)
            self.mean_width.append(np.median(sorted([c["width"] for c in chars])) if chars else 8)
            self.page_cum_height.append(img.size[1] / zoomin)
            return chars

        if self.parallel_limiter:
            tasks = []

            for i, img in enumerate(self.page_images):
                chars = prepare_page()

                semaphore = self.parallel_limiter[i % settings.PARALLEL_DEVICES]

                # Defaults bind the current loop values to this task.
                async def wrapper(i=i, img=img, chars=chars, semaphore=semaphore):
                    await ocr_page(
                        i,
                        i % settings.PARALLEL_DEVICES,
                        img,
                        chars,
                        semaphore,
                    )

                tasks.append(asyncio.create_task(wrapper()))
                await asyncio.sleep(0)

            try:
                await asyncio.gather(*tasks, return_exceptions=False)
            except Exception as e:
                logging.error(f"Error in OCR: {e}")
                for t in tasks:
                    t.cancel()
                await asyncio.gather(*tasks, return_exceptions=True)
                raise

        else:
            for i, img in enumerate(self.page_images):
                chars = prepare_page()
                await ocr_page(i, 0, img, chars, None)

    start = timer()

    # asyncio.run() cannot be used while an event loop is already running in
    # this thread. Only the *detection* is guarded by ``except RuntimeError``:
    # previously the whole branch sat inside the try, so a RuntimeError
    # raised by OCR itself was mistaken for "no running loop" and triggered
    # asyncio.run() inside the running loop.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop, safe to use asyncio.run()
        asyncio.run(ocr_all_pages())
    else:
        # A loop is running: execute the OCR on a fresh loop in a helper
        # thread and relay its outcome through a queue.
        result_queue: queue.Queue = queue.Queue()

        def runner():
            try:
                new_loop = asyncio.new_event_loop()
                asyncio.set_event_loop(new_loop)
                try:
                    result_queue.put((True, new_loop.run_until_complete(ocr_all_pages())))
                finally:
                    new_loop.close()
            except Exception as e:
                result_queue.put((False, e))

        thread = threading.Thread(target=runner, daemon=True)
        thread.start()
        thread.join()

        success, value = result_queue.get_nowait()
        if not success:
            raise value

    logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")

    # No embedded characters at all: decide the language from OCR output.
    if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
        bxes = [b for bxs in self.boxes for b in bxs]
        self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

    logging.debug(f"Is it English: {self.is_english}")

    self.page_cum_height = np.cumsum(self.page_cum_height)
    assert len(self.page_cum_height) == len(self.page_images) + 1
    if len(self.boxes) == 0 and zoomin < 9:
        # Nothing recognised: retry at three times the zoom.
        self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
1259
+
1260
+ def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
1261
+ self.__images__(fnm, zoomin)
1262
+ self._layouts_rec(zoomin)
1263
+ self._table_transformer_job(zoomin)
1264
+ self._text_merge()
1265
+ self._concat_downward()
1266
+ self._filter_forpages()
1267
+ tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
1268
+ return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
1269
+
1270
def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
    """Parse ``fnm`` and return all content as a flat list of box dicts.

    Runs OCR, layout analysis, table analysis and text merging, extracts
    tables and figures, tags every remaining text box with
    ``position_tag``/``image``/``positions`` and finally re-inserts each
    table and figure as a box of its own next to the nearest text box.

    Args:
        fnm: file path or raw PDF bytes, forwarded to ``__images__``.
        callback: optional progress callable, invoked as
            ``callback(progress, message)`` after each stage (it is also
            handed to ``__images__``).
        zoomin: rendering scale factor.

    Returns:
        A deep copy of ``self.boxes``.
    """
    start = timer()
    self.__images__(fnm, zoomin, callback=callback)
    if callback:
        callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))

    start = timer()
    self._layouts_rec(zoomin)
    if callback:
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

    start = timer()
    self._table_transformer_job(zoomin)
    if callback:
        callback(0.83, "Table analysis ({:.2f}s)".format(timer() - start))

    start = timer()
    self._text_merge()
    self._concat_downward()
    if callback:
        callback(0.92, "Text merged ({:.2f}s)".format(timer() - start))

    start = timer()
    tbls, figs = self._extract_table_figure(True, zoomin, True, True, True)

    def min_rectangle_distance(rect1, rect2):
        """Shortest distance between two ``(pn, left, right, top, bottom)`` rects.

        The page component is ignored; overlapping rectangles give 0.
        """
        pn1, left1, right1, top1, bottom1 = rect1
        pn2, left2, right2, top2, bottom2 = rect2
        if right1 >= left2 and right2 >= left1 and bottom1 >= top2 and bottom2 >= top1:
            return 0
        if right1 < left2:
            dx = left2 - right1
        elif right2 < left1:
            dx = left1 - right2
        else:
            dx = 0
        if bottom1 < top2:
            dy = top2 - bottom1
        elif bottom2 < top1:
            dy = top1 - bottom2
        else:
            dy = 0
        return math.sqrt(dx * dx + dy * dy)

    def insert_table_figures(tbls_or_figs, layout_type):
        """Insert every ``((img, txt), positions)`` item into ``self.boxes``."""
        for (img, txt), poss in tbls_or_figs:
            bboxes = [(i, (b["page_number"], b["x0"], b["x1"], b["top"], b["bottom"])) for i, b in enumerate(self.boxes)]
            # Positions are page-relative; boxes use cumulative coordinates.
            dists = [
                (min_rectangle_distance((pn, left, right, top + self.page_cum_height[pn], bott + self.page_cum_height[pn]), rect), i) for i, rect in bboxes for pn, left, right, top, bott in poss
            ]
            if isinstance(txt, list):
                txt = "\n".join(txt)
            pn, left, right, top, bott = poss[0]
            if dists:
                # First entry with the smallest distance wins.
                _, insert_at = min(dists, key=lambda d: d[0])
                # Place the item after the nearest box when that box ends
                # above the item's top edge.
                if self.boxes[insert_at]["bottom"] < top + self.page_cum_height[pn]:
                    insert_at += 1
            else:
                # No text box left (e.g. the document holds only tables):
                # previously argmin on the empty list raised ValueError.
                insert_at = len(self.boxes)
            self.boxes.insert(
                insert_at,
                {
                    "page_number": pn + 1,
                    "x0": left,
                    "x1": right,
                    "top": top + self.page_cum_height[pn],
                    "bottom": bott + self.page_cum_height[pn],
                    "layout_type": layout_type,
                    "text": txt,
                    "image": img,
                    "positions": [[pn + 1, int(left), int(right), int(top), int(bott)]],
                },
            )

    # Tag the text boxes before tables/figures are mixed in, so that only
    # text boxes get a cropped image.
    for b in self.boxes:
        b["position_tag"] = self._line_tag(b, zoomin)
        b["image"] = self.crop(b["position_tag"], zoomin)
        b["positions"] = [[pos[0][-1] + 1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(b["position_tag"])]

    insert_table_figures(tbls, "table")
    insert_table_figures(figs, "figure")
    if callback:
        callback(1, "Structured ({:.2f}s)".format(timer() - start))
    return deepcopy(self.boxes)
1353
+
1354
+ @staticmethod
1355
+ def remove_tag(txt):
1356
+ return re.sub(r"@@[\t0-9.-]+?##", "", txt)
1357
+
1358
+ @staticmethod
1359
+ def extract_positions(txt):
1360
+ poss = []
1361
+ for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
1362
+ pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
1363
+ left, right, top, bottom = float(left), float(right), float(top), float(bottom)
1364
+ poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
1365
+ return poss
1366
+
1367
def crop(self, text, ZM=3, need_position=False):
    """Render the page regions referenced by the position tags in ``text``.

    The tags are parsed with ``self.extract_positions``. Page indices
    outside ``self.page_images`` are discarded. A context strip of up to
    120 units above the first region and below the last one is added; these
    two strips are blended with a half-transparent overlay. All pieces are
    stacked vertically, separated by a 6-pixel gap, on a light grey canvas.

    Args:
        text: string containing ``@@...##`` position tags.
        ZM: zoom factor between tag coordinates and page-image pixels.
        need_position: when truthy, also return the positions of the real
            (non-context) pieces.

    Returns:
        The composed image, or ``(image, positions)`` when ``need_position``
        is truthy. When nothing can be cropped the result is ``None``
        (respectively ``(None, None)``).
    """

    def nothing():
        # Shape of the "no result" value depends on need_position.
        return (None, None) if need_position else None

    poss = self.extract_positions(text)
    if not poss:
        return nothing()

    if not getattr(self, "page_images", None):
        logging.warning("crop called without page images; skipping image generation.")
        return nothing()

    page_count = len(self.page_images)

    # Keep only positions that reference at least one existing page.
    usable = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            logging.warning("Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            logging.warning(f"All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        usable.append((valid_pns, left, right, top, bottom))

    poss = usable
    if not poss:
        logging.warning("No valid positions after filtering; skip cropping.")
        return nothing()

    max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6

    # Context strip above the first region.
    first = poss[0]
    first_page_idx = first[0][0]
    poss.insert(0, ([first_page_idx], first[1], first[2], max(0, first[3] - 120), max(first[3] - GAP, 0)))

    # Context strip below the last region.
    last = poss[-1]
    last_page_idx = last[0][-1]
    if not (0 <= last_page_idx < page_count):
        logging.warning(f"Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        return nothing()
    last_page_height = self.page_images[last_page_idx].size[1] / ZM
    poss.append(
        (
            [last_page_idx],
            last[1],
            last[2],
            min(last_page_height, last[4] + GAP),
            min(last_page_height, last[4] + 120),
        )
    )

    imgs = []
    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        is_real_region = 0 < ii < len(poss) - 1
        if is_real_region:
            right = max(left + 10, right)
        else:
            right = left + max_width

        # ``bottom`` becomes a pixel offset measured from the first page's top.
        bottom *= ZM
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                logging.warning(f"Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            logging.warning(f"Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        base_image = self.page_images[pns[0]]
        imgs.append(base_image.crop((left * ZM, top * ZM, right * ZM, min(bottom, base_image.size[1]))))
        if is_real_region:
            positions.append((pns[0] + self.page_from, left, right, top, min(bottom, base_image.size[1]) / ZM))
        bottom -= base_image.size[1]

        # Continuation pages start at their top edge.
        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                logging.warning(f"Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page_image = self.page_images[pn]
            imgs.append(page_image.crop((left * ZM, 0, right * ZM, min(bottom, page_image.size[1]))))
            if is_real_region:
                positions.append((pn + self.page_from, left, right, 0, min(bottom, page_image.size[1]) / ZM))
            bottom -= page_image.size[1]

    if not imgs:
        return nothing()

    total_height = 0
    for piece in imgs:
        total_height += piece.size[1] + GAP
    total_height = int(total_height)
    total_width = int(np.max([piece.size[0] for piece in imgs]))
    pic = Image.new("RGB", (total_width, total_height), (245, 245, 245))

    offset = 0
    for ii, piece in enumerate(imgs):
        if ii == 0 or ii + 1 == len(imgs):
            # First and last piece: blend with a half-transparent overlay.
            piece = piece.convert("RGBA")
            overlay = Image.new("RGBA", piece.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            piece = Image.alpha_composite(piece, overlay).convert("RGB")
        pic.paste(piece, (0, int(offset)))
        offset += piece.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
1477
+
1478
def get_position(self, bx, ZM):
    """Split box ``bx`` into per-page ``(page, x0, x1, top, bottom)`` tuples.

    ``bx["top"]``/``bx["bottom"]`` are shifted by
    ``self.page_cum_height[page_number - 1]``. While the bottom (scaled by
    ``ZM``) exceeds the pixel height of the current page image, the box
    continues at the top of the next page and another tuple is added.

    Page numbers in the result are taken from ``bx["page_number"]`` and
    incremented per continuation page. If the box overflows past the last
    available page image, the walk stops there instead of raising
    ``IndexError`` (in line with the bounds checks in ``_line_tag``).
    """
    poss = []
    pn = bx["page_number"]
    top = bx["top"] - self.page_cum_height[pn - 1]
    bott = bx["bottom"] - self.page_cum_height[pn - 1]
    page_count = len(self.page_images)

    poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
    while bott * ZM > self.page_images[pn - 1].size[1]:
        bott -= self.page_images[pn - 1].size[1] / ZM
        top = 0
        pn += 1
        if pn - 1 >= page_count:
            # Ran out of pages: keep what has been collected so far.
            break
        poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
    return poss
1490
+
1491
+
1492
class PlainParser:
    """Text-only PDF parser: extracts page text lines and the outline."""

    def __init__(self):
        pass

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        """Extract the text lines of pages ``[from_page:to_page]``.

        Args:
            filename: file path (``str``) or raw PDF bytes.
            from_page, to_page: slice bounds applied to the document's pages.
            **kwargs: accepted and ignored.

        Returns:
            ``(lines, [])`` where ``lines`` is a list of ``(text, "")``
            pairs, one per extracted line. Outline entries are stored in
            ``self.outlines`` as ``(title, depth)`` pairs. Any exception is
            logged and whatever was collected up to that point is returned.
        """
        self.outlines = []
        collected = []

        def walk(entries, depth):
            # Dict entries are outline items; nested lists are sub-levels.
            for entry in entries:
                if isinstance(entry, dict):
                    self.outlines.append((entry["/Title"], depth))
                else:
                    walk(entry, depth + 1)

        try:
            source = filename if isinstance(filename, str) else BytesIO(filename)
            self.pdf = pdf2_read(source)
            for page in self.pdf.pages[from_page:to_page]:
                collected.extend(page.extract_text().split("\n"))

            walk(self.pdf.outline, 0)
        except Exception:
            logging.exception("Outlines exception")

        if not self.outlines:
            logging.warning("Miss outlines")

        return [(line, "") for line in collected], []

    def crop(self, ck, need_position):
        """Not supported by the plain parser."""
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        """Not supported by the plain parser."""
        raise NotImplementedError
1527
+
1528
+
1529
class VisionParser(RAGFlowPdfParser):
    """PDF parser that describes each rendered page with a vision model."""

    def __init__(
        self,
        vision_model,
        model_cfg: PdfModelConfig | None = None,
        tokenizer_cfg: TokenizerConfig | None = None,
    ):
        """Store ``vision_model``; the configs are forwarded to the base class."""
        super().__init__(model_cfg=model_cfg, tokenizer_cfg=tokenizer_cfg)
        self.vision_model = vision_model
        self.outlines = []

    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
        """Render pages ``[page_from:page_to]`` into ``self.page_images``.

        ``fnm`` is a file path (``str``) or raw PDF bytes. On any failure
        ``self.page_images`` is set to ``None`` and ``self.total_page`` to 0;
        the exception is logged, not raised. ``callback`` is unused here.
        """
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
                self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])]
                self.total_page = len(self.pdf.pages)
        except Exception:
            self.page_images = None
            self.total_page = 0
            logging.exception("VisionParser __images__")

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        """Describe pages ``[from_page:to_page]`` of ``filename``.

        Keyword Args:
            callback: optional ``callback(progress, message)``.
            zoomin: rendering scale factor (default 3).

        Returns:
            ``(docs, [])`` where ``docs`` is a list of
            ``(text, position_tag)`` pairs, one per page for which the model
            returned text. The tag covers the whole page.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)
        zoomin = kwargs.get("zoomin", 3)
        self.__images__(fnm=filename, zoomin=zoomin, page_from=from_page, page_to=to_page, callback=callback)

        total_pdf_pages = self.total_page

        start_page = max(0, from_page)
        end_page = min(to_page, total_pdf_pages)

        all_docs = []

        for idx, img_binary in enumerate(self.page_images or []):
            # ``self.page_images`` already holds only pages
            # [from_page:to_page], so ``idx`` is relative to ``start_page``.
            # Comparing the bare index with the absolute bounds (as before)
            # skipped every page whenever from_page > 0.
            pdf_page_num = start_page + idx  # 0-based, absolute
            if pdf_page_num >= end_page:
                continue

            from .llm_adapter.vision import vision_llm_chunk as picture_vision_llm_chunk

            text = picture_vision_llm_chunk(
                binary=img_binary,
                vision_model=self.vision_model,
                # NOTE(review): assumes the prompt wants the document page
                # number — confirm against vision_llm_describe_prompt.
                prompt=vision_llm_describe_prompt(page=pdf_page_num + 1),
                callback=callback,
            )

            if kwargs.get("callback"):
                kwargs["callback"](idx * 1.0 / len(self.page_images), f"Processed: {idx + 1}/{len(self.page_images)}")

            if text:
                width, height = self.page_images[idx].size
                # The tag's page number stays relative to ``page_images``
                # (1-based), which is how ``crop`` resolves tags back to
                # images.
                all_docs.append((
                    text,
                    f"@@{idx + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##"
                ))
        return all_docs, []
1588
+
1589
+
1590
# Script entry point: running this module directly is intentionally a no-op.
+ if __name__ == "__main__":
1590
+ pass