deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,1591 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import logging
|
|
19
|
+
import math
|
|
20
|
+
import os
|
|
21
|
+
import queue
|
|
22
|
+
import random
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
import threading
|
|
26
|
+
from collections import Counter, defaultdict
|
|
27
|
+
from copy import deepcopy
|
|
28
|
+
from io import BytesIO
|
|
29
|
+
from timeit import default_timer as timer
|
|
30
|
+
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pdfplumber
|
|
33
|
+
import xgboost as xgb
|
|
34
|
+
from PIL import Image
|
|
35
|
+
from pypdf import PdfReader as pdf2_read
|
|
36
|
+
from sklearn.cluster import KMeans
|
|
37
|
+
from sklearn.metrics import silhouette_score
|
|
38
|
+
|
|
39
|
+
from ..common.misc_utils import pip_install_torch
|
|
40
|
+
from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
|
|
41
|
+
from ..config import PdfModelConfig, TokenizerConfig
|
|
42
|
+
from ..depend.rag_tokenizer import RagTokenizer, is_chinese
|
|
43
|
+
from ..depend.prompts import vision_llm_describe_prompt
|
|
44
|
+
from ..common import settings
|
|
45
|
+
|
|
46
|
+
# Key under which one process-wide lock object is parked in ``sys.modules`` so
# that every module looking the key up gets the very same lock instance.
# NOTE(review): presumably used to serialise pdfplumber access - confirm at the
# call sites further down the file.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class RAGFlowPdfParser:
|
|
52
|
+
def __init__(
    self,
    model_cfg: PdfModelConfig | None = None,
    tokenizer_cfg: TokenizerConfig | None = None,
):
    """Create the tokenizer, OCR, layout, table and up/down-concat models.

    Args:
        model_cfg: Model configuration; when ``None`` it is built with
            ``PdfModelConfig.from_env()``.
        tokenizer_cfg: Tokenizer configuration; when ``None`` it is built
            with ``TokenizerConfig.from_env()``.

    Raises:
        RuntimeError: ``LAYOUT_RECOGNIZER_TYPE`` is neither ``onnx`` nor
            ``ascend``.
        ValueError: the ascend recognizer is requested but no ascend model
            directory could be resolved.
    """
    # Env-based factories keep backwards compatibility for users that
    # already configure the parser through DEEPDOC_* env vars.
    model_cfg = PdfModelConfig.from_env() if model_cfg is None else model_cfg
    tokenizer_cfg = TokenizerConfig.from_env() if tokenizer_cfg is None else tokenizer_cfg
    self.model_cfg, self.tokenizer_cfg = model_cfg, tokenizer_cfg

    provider = model_cfg.normalized_provider()
    model_offline = provider == "local"
    self.tokenizer = RagTokenizer(
        dict_prefix=tokenizer_cfg.resolve_dict_prefix(),
        offline=tokenizer_cfg.offline,
        nltk_data_dir=tokenizer_cfg.nltk_data_dir,
    )

    vision_model_dir = model_cfg.resolve_vision_model_dir()
    xgb_model_dir = model_cfg.resolve_xgb_model_dir()
    ascend_model_dir = model_cfg.resolve_ascend_model_dir()

    # Keyword arguments shared by every ONNX-backed vision model below.
    shared_model_args = {
        "model_dir": vision_model_dir,
        "model_home": model_cfg.model_home,
        "model_provider": provider,
        "offline": model_offline,
    }

    self.ocr = OCR(**shared_model_args)

    # One semaphore per device, only when more than one device is configured.
    device_count = settings.PARALLEL_DEVICES
    self.parallel_limiter = (
        [asyncio.Semaphore(1) for _ in range(device_count)] if device_count > 1 else None
    )

    layout_recognizer_type = os.getenv("LAYOUT_RECOGNIZER_TYPE", "onnx").lower()
    if layout_recognizer_type not in ("onnx", "ascend"):
        raise RuntimeError("Unsupported layout recognizer type.")

    # An instance/class attribute ``model_speciess`` (if present) selects a
    # specialised layout domain.
    recognizer_domain = ("layout." + self.model_speciess) if hasattr(self, "model_speciess") else "layout"

    if layout_recognizer_type == "onnx":
        logging.debug("Using Onnx LayoutRecognizer")
        self.layouter = LayoutRecognizer(recognizer_domain, **shared_model_args)
    else:  # ascend
        logging.debug("Using Ascend LayoutRecognizer")
        if not ascend_model_dir:
            raise ValueError("ascend_model_dir is required when LAYOUT_RECOGNIZER_TYPE=ascend")
        self.layouter = AscendLayoutRecognizer(recognizer_domain, model_dir=ascend_model_dir)
    self.tbl_det = TableStructureRecognizer(**shared_model_args)

    booster = xgb.Booster()
    self.updown_cnt_mdl = booster
    try:
        # Best effort: switch the booster to CUDA when torch reports a GPU.
        pip_install_torch()
        import torch.cuda

        if torch.cuda.is_available():
            booster.set_param({"device": "cuda"})
    except Exception:
        logging.info("No torch found.")

    booster.load_model(os.path.join(xgb_model_dir, "updown_concat_xgb.model"))

    self.page_from = 0
    self.column_num = 1
|
|
133
|
+
|
|
134
|
+
def __char_width(self, c):
|
|
135
|
+
return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)
|
|
136
|
+
|
|
137
|
+
def __height(self, c):
|
|
138
|
+
return c["bottom"] - c["top"]
|
|
139
|
+
|
|
140
|
+
def _x_dis(self, a, b):
|
|
141
|
+
return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]), abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
|
|
142
|
+
|
|
143
|
+
def _y_dis(self, a, b):
|
|
144
|
+
return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
|
|
145
|
+
|
|
146
|
+
def _match_proj(self, b):
|
|
147
|
+
proj_patt = [
|
|
148
|
+
r"第[零一二三四五六七八九十百]+章",
|
|
149
|
+
r"第[零一二三四五六七八九十百]+[条节]",
|
|
150
|
+
r"[零一二三四五六七八九十百]+[、是 ]",
|
|
151
|
+
r"[\((][零一二三四五六七八九十百]+[)\)]",
|
|
152
|
+
r"[\((][0-9]+[)\)]",
|
|
153
|
+
r"[0-9]+(、|\.[ ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
|
|
154
|
+
r"[0-9]+\.[0-9.]+(、|\.[ ])",
|
|
155
|
+
r"[⚫•➢①② ]",
|
|
156
|
+
]
|
|
157
|
+
return any([re.match(p, b["text"]) for p in proj_patt])
|
|
158
|
+
|
|
159
|
+
def _updown_concat_features(self, up, down):
    """Build the feature list describing whether ``down`` continues ``up``.

    The returned list is what ``_concat_downward`` wraps in an
    ``xgb.DMatrix`` for ``self.updown_cnt_mdl``; the position of every entry
    is therefore significant and must not be reordered.

    Args:
        up: Upper box dict. Reads ``text``, ``x0``, ``x1``, ``top``,
            ``bottom``, ``page_number``, ``layout_type``, ``in_row`` and
            optionally ``R``.
        down: Lower box dict, same keys as ``up``.

    Returns:
        A flat list mixing booleans and numbers.
    """
    # NOTE(review): several entries index text[-1] / text[0] and divide by
    # box heights, so callers presumably guarantee non-empty text and
    # non-zero heights - confirm.
    w = max(self.__char_width(up), self.__char_width(down))
    h = max(self.__height(up), self.__height(down))
    y_dis = self._y_dis(up, down)
    # Only this many characters around the seam are tokenized.
    LEN = 6
    tks_down = self.tokenizer.tokenize(down["text"][:LEN]).split()
    tks_up = self.tokenizer.tokenize(up["text"][-LEN:]).split()
    # Tail of ``up`` + head of ``down``; a space is inserted when the two
    # characters at the seam start with an ASCII letter or digit.
    tks_all = up["text"][-LEN:].strip() + (" " if re.match(r"[a-zA-Z0-9]+", up["text"][-1] + down["text"][0]) else "") + down["text"][:LEN].strip()
    tks_all = self.tokenizer.tokenize(tks_all).split()
    fea = [
        # same table row tag (or both untagged)
        up.get("R", -1) == down.get("R", -1),
        # vertical distance relative to the taller box
        y_dis / h,
        down["page_number"] - up["page_number"],
        up["layout_type"] == down["layout_type"],
        up["layout_type"] == "text",
        down["layout_type"] == "text",
        up["layout_type"] == "table",
        down["layout_type"] == "table",
        # punctuation at the end of ``up`` / start of ``down``
        True if re.search(r"([。?!;!?;+))]|[a-z]\.)$", up["text"]) else False,
        True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
        True if re.search(r"(^.?[/,?;:\],。;:’”?!》】)-])", down["text"]) else False,
        True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
        # NOTE(review): the next two entries are identical expressions; both
        # are kept because entry positions feed the trained model.
        True if re.search(r"[,,][^。.]+$", up["text"]) else False,
        True if re.search(r"[,,][^。.]+$", up["text"]) else False,
        # ``up`` has an opening bracket without a close, ``down`` has a close
        True if re.search(r"[\((][^\))]+$", up["text"]) and re.search(r"[\))]", down["text"]) else False,
        self._match_proj(down),
        True if re.match(r"[A-Z]", down["text"]) else False,
        True if re.match(r"[A-Z]", up["text"][-1]) else False,
        True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
        True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
        # both boxes end with the same two characters
        up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False,
        up["x0"] > down["x1"],
        # relative height difference
        abs(self.__height(up) - self.__height(down)) / min(self.__height(up), self.__height(down)),
        # horizontal distance measured in character widths
        self._x_dis(up, down) / max(w, 0.000001),
        (len(up["text"]) - len(down["text"])) / max(len(up["text"]), len(down["text"])),
        # token-count change caused by joining the two fragments
        len(tks_all) - len(tks_up) - len(tks_down),
        len(tks_down) - len(tks_up),
        tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
        max(down["in_row"], up["in_row"]),
        abs(down["in_row"] - up["in_row"]),
        # single-token fragment whose tokenizer tag contains "n"
        len(tks_down) == 1 and self.tokenizer.tag(tks_down[0]).find("n") >= 0,
        len(tks_up) == 1 and self.tokenizer.tag(tks_up[0]).find("n") >= 0,
    ]
    return fea
|
|
203
|
+
|
|
204
|
+
@staticmethod
|
|
205
|
+
def sort_X_by_page(arr, threshold):
|
|
206
|
+
# sort using y1 first and then x1
|
|
207
|
+
arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
|
|
208
|
+
for i in range(len(arr) - 1):
|
|
209
|
+
for j in range(i, -1, -1):
|
|
210
|
+
# restore the order using th
|
|
211
|
+
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold and arr[j + 1]["top"] < arr[j]["top"] and arr[j + 1]["page_number"] == arr[j]["page_number"]:
|
|
212
|
+
tmp = arr[j]
|
|
213
|
+
arr[j] = arr[j + 1]
|
|
214
|
+
arr[j + 1] = tmp
|
|
215
|
+
return arr
|
|
216
|
+
|
|
217
|
+
def _has_color(self, o):
|
|
218
|
+
if o.get("ncs", "") == "DeviceGray":
|
|
219
|
+
if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and o["non_stroking_color"][0] == 1:
|
|
220
|
+
if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
|
|
221
|
+
return False
|
|
222
|
+
return True
|
|
223
|
+
|
|
224
|
+
def _table_transformer_job(self, ZM):
    """Detect table structure and tag text boxes lying inside tables.

    Every layout region of type ``table`` is cropped (with a margin) from
    its page image and passed to ``self.tbl_det``. The recognised components
    are mapped back to page coordinates and stored in ``self.tb_cpns``.
    Boxes in ``self.boxes`` whose ``layout_type`` is ``table`` then receive
    row (``R``), header (``H``), column (``C``) and spanning-cell (``SP``)
    tags plus the matching edge coordinates.

    Args:
        ZM: Zoom factor between box coordinates and page-image pixels.
    """
    logging.debug("Table processing...")
    imgs, pos = [], []
    # tbcnt[k + 1] holds the number of tables on page k; the leading 0 makes
    # the later cumulative sum directly usable as slice bounds.
    tbcnt = [0]
    # Extra border (in box units) added around each table before cropping.
    MARGIN = 10
    self.tb_cpns = []
    assert len(self.page_layout) == len(self.page_images)
    for p, tbls in enumerate(self.page_layout):  # for page
        tbls = [f for f in tbls if f["type"] == "table"]
        tbcnt.append(len(tbls))
        if not tbls:
            continue
        for tb in tbls:  # for table
            left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, tb["x1"] + MARGIN, tb["bottom"] + MARGIN
            left *= ZM
            top *= ZM
            right *= ZM
            bott *= ZM
            # Remember the crop origin to translate results back later.
            pos.append((left, top))
            imgs.append(self.page_images[p].crop((left, top, right, bott)))

    assert len(self.page_images) == len(tbcnt) - 1
    if not imgs:
        # No table anywhere: self.tb_cpns stays empty and no box is tagged.
        return
    recos = self.tbl_det(imgs)
    tbcnt = np.cumsum(tbcnt)
    for i in range(len(tbcnt) - 1):  # for page
        pg = []
        for j, tb_items in enumerate(recos[tbcnt[i] : tbcnt[i + 1]]):  # for table
            poss = pos[tbcnt[i] : tbcnt[i + 1]]
            for it in tb_items:  # for table components
                # crop-local pixels -> page pixels
                it["x0"] = it["x0"] + poss[j][0]
                it["x1"] = it["x1"] + poss[j][0]
                it["top"] = it["top"] + poss[j][1]
                it["bottom"] = it["bottom"] + poss[j][1]
                # page pixels -> box units
                for n in ["x0", "x1", "top", "bottom"]:
                    it[n] /= ZM
                # page-local Y -> cumulative document Y
                it["top"] += self.page_cum_height[i]
                it["bottom"] += self.page_cum_height[i]
                it["pn"] = i
                it["layoutno"] = j
                pg.append(it)
        self.tb_cpns.extend(pg)

    def gather(kwd, fzy=10, ption=0.6):
        # Select components whose label matches ``kwd``, clean them up
        # against the text boxes and return them sorted by Y.
        eles = Recognizer.sort_Y_firstly([r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
        eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
        return Recognizer.sort_Y_firstly(eles, 0)

    # add R,H,C,SP tag to boxes within table layout
    headers = gather(r".*header$")
    rows = gather(r".* (row|header)")
    spans = gather(r".*spanning")
    clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
    clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
    for b in self.boxes:
        if b.get("layout_type", "") != "table":
            continue
        ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
        if ii is not None:
            b["R"] = ii
            b["R_top"] = rows[ii]["top"]
            b["R_bott"] = rows[ii]["bottom"]

        ii = Recognizer.find_overlapped_with_threshold(b, headers, thr=0.3)
        if ii is not None:
            b["H_top"] = headers[ii]["top"]
            b["H_bott"] = headers[ii]["bottom"]
            b["H_left"] = headers[ii]["x0"]
            b["H_right"] = headers[ii]["x1"]
            b["H"] = ii

        ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
        if ii is not None:
            b["C"] = ii
            b["C_left"] = clmns[ii]["x0"]
            b["C_right"] = clmns[ii]["x1"]

        ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
        if ii is not None:
            # A spanning cell overwrites the H_* edges set by a header match.
            b["H_top"] = spans[ii]["top"]
            b["H_bott"] = spans[ii]["bottom"]
            b["H_left"] = spans[ii]["x0"]
            b["H_right"] = spans[ii]["x1"]
            b["SP"] = ii
|
|
309
|
+
|
|
310
|
+
def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
    """Detect text boxes on one page image and fill in their text.

    Detected boxes first receive the embedded PDF characters that overlap
    them; boxes that end up without text are cropped and sent to the OCR
    recognizer. The resulting list (boxes with non-empty text) is appended
    to ``self.boxes``; an empty list is appended when nothing is detected.

    Args:
        pagenum: 1-based page number (used as ``pagenum - 1`` index into
            ``self.mean_height``).
        img: Page image; converted with ``np.array`` for the OCR calls.
        chars: Character dicts of the page with ``top``, ``bottom``,
            ``height`` and ``text``.
        ZM: Zoom factor between image pixels and box coordinates.
        device_id: Forwarded to the OCR detect/recognize calls.
    """
    start = timer()
    bxs = self.ocr.detect(np.array(img), device_id)
    logging.info(f"__ocr detecting boxes of a image cost ({timer() - start}s)")

    start = timer()
    if not bxs:
        self.boxes.append([])
        return
    # Keep the polygon and the first element of the second entry of each
    # detection. NOTE(review): presumably (points, (text, score)) - confirm.
    bxs = [(line[0], line[1][0]) for line in bxs]
    bxs = Recognizer.sort_Y_firstly(
        [
            {"x0": b[0][0] / ZM, "x1": b[1][0] / ZM, "top": b[0][1] / ZM, "text": "", "txt": t, "bottom": b[-1][1] / ZM, "chars": [], "page_number": pagenum}
            for b, t in bxs
            # drop polygons whose corners are inverted
            if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]
        ],
        self.mean_height[pagenum - 1] / 3,
    )

    # merge chars in the same rect
    for c in chars:
        ii = Recognizer.find_overlapped(c, bxs)
        if ii is None:
            self.lefted_chars.append(c)
            continue
        ch = c["bottom"] - c["top"]
        bh = bxs[ii]["bottom"] - bxs[ii]["top"]
        # A non-space char whose height differs too much from the box is set
        # aside. NOTE(review): divides by max(ch, bh); zero-height pairs
        # would raise ZeroDivisionError - confirm they cannot occur.
        if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != " ":
            self.lefted_chars.append(c)
            continue
        bxs[ii]["chars"].append(c)

    for b in bxs:
        if not b["chars"]:
            del b["chars"]
            continue
        m_ht = np.mean([c["height"] for c in b["chars"]])
        for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
            if c["text"] == " " and b["text"]:
                # Keep a space only after Latin/Cyrillic letters, digits or
                # the listed punctuation.
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
                    b["text"] += " "
            else:
                b["text"] += c["text"]
        del b["chars"]

    logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
    start = timer()
    boxes_to_reg = []
    img_np = np.array(img)
    for b in bxs:
        if not b["text"]:
            # No embedded characters: crop the region for OCR recognition.
            left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM
            b["box_image"] = self.ocr.get_rotate_crop_image(img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
            boxes_to_reg.append(b)
        del b["txt"]
    texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg], device_id)
    for i in range(len(boxes_to_reg)):
        boxes_to_reg[i]["text"] = texts[i]
        del boxes_to_reg[i]["box_image"]
    logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")
    bxs = [b for b in bxs if b["text"]]
    if self.mean_height[pagenum - 1] == 0:
        # NOTE(review): if ``bxs`` is empty here np.median([]) yields nan.
        self.mean_height[pagenum - 1] = np.median([b["bottom"] - b["top"] for b in bxs])
    self.boxes.append(bxs)
|
|
374
|
+
|
|
375
|
+
def _layouts_rec(self, ZM, drop=True):
|
|
376
|
+
assert len(self.page_images) == len(self.boxes)
|
|
377
|
+
self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop)
|
|
378
|
+
# cumlative Y
|
|
379
|
+
for i in range(len(self.boxes)):
|
|
380
|
+
self.boxes[i]["top"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
|
|
381
|
+
self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
|
|
382
|
+
|
|
383
|
+
def _assign_column(self, boxes, zoomin=3):
|
|
384
|
+
if not boxes:
|
|
385
|
+
return boxes
|
|
386
|
+
if all("col_id" in b for b in boxes):
|
|
387
|
+
return boxes
|
|
388
|
+
|
|
389
|
+
by_page = defaultdict(list)
|
|
390
|
+
for b in boxes:
|
|
391
|
+
by_page[b["page_number"]].append(b)
|
|
392
|
+
|
|
393
|
+
page_cols = {}
|
|
394
|
+
|
|
395
|
+
for pg, bxs in by_page.items():
|
|
396
|
+
if not bxs:
|
|
397
|
+
page_cols[pg] = 1
|
|
398
|
+
continue
|
|
399
|
+
|
|
400
|
+
x0s_raw = np.array([b["x0"] for b in bxs], dtype=float)
|
|
401
|
+
|
|
402
|
+
min_x0 = np.min(x0s_raw)
|
|
403
|
+
max_x1 = np.max([b["x1"] for b in bxs])
|
|
404
|
+
width = max_x1 - min_x0
|
|
405
|
+
|
|
406
|
+
INDENT_TOL = width * 0.12
|
|
407
|
+
x0s = []
|
|
408
|
+
for x in x0s_raw:
|
|
409
|
+
if abs(x - min_x0) < INDENT_TOL:
|
|
410
|
+
x0s.append([min_x0])
|
|
411
|
+
else:
|
|
412
|
+
x0s.append([x])
|
|
413
|
+
x0s = np.array(x0s, dtype=float)
|
|
414
|
+
|
|
415
|
+
max_try = min(4, len(bxs))
|
|
416
|
+
if max_try < 2:
|
|
417
|
+
max_try = 1
|
|
418
|
+
best_k = 1
|
|
419
|
+
best_score = -1
|
|
420
|
+
|
|
421
|
+
for k in range(1, max_try + 1):
|
|
422
|
+
km = KMeans(n_clusters=k, n_init="auto")
|
|
423
|
+
labels = km.fit_predict(x0s)
|
|
424
|
+
|
|
425
|
+
centers = np.sort(km.cluster_centers_.flatten())
|
|
426
|
+
if len(centers) > 1:
|
|
427
|
+
try:
|
|
428
|
+
score = silhouette_score(x0s, labels)
|
|
429
|
+
except ValueError:
|
|
430
|
+
continue
|
|
431
|
+
else:
|
|
432
|
+
score = 0
|
|
433
|
+
if score > best_score:
|
|
434
|
+
best_score = score
|
|
435
|
+
best_k = k
|
|
436
|
+
|
|
437
|
+
page_cols[pg] = best_k
|
|
438
|
+
logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}")
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
global_cols = Counter(page_cols.values()).most_common(1)[0][0]
|
|
442
|
+
logging.info(f"Global column_num decided by majority: {global_cols}")
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
for pg, bxs in by_page.items():
|
|
446
|
+
if not bxs:
|
|
447
|
+
continue
|
|
448
|
+
k = page_cols[pg]
|
|
449
|
+
if len(bxs) < k:
|
|
450
|
+
k = 1
|
|
451
|
+
x0s = np.array([[b["x0"]] for b in bxs], dtype=float)
|
|
452
|
+
km = KMeans(n_clusters=k, n_init="auto")
|
|
453
|
+
labels = km.fit_predict(x0s)
|
|
454
|
+
|
|
455
|
+
centers = km.cluster_centers_.flatten()
|
|
456
|
+
order = np.argsort(centers)
|
|
457
|
+
|
|
458
|
+
remap = {orig: new for new, orig in enumerate(order)}
|
|
459
|
+
|
|
460
|
+
for b, lb in zip(bxs, labels):
|
|
461
|
+
b["col_id"] = remap[lb]
|
|
462
|
+
|
|
463
|
+
grouped = defaultdict(list)
|
|
464
|
+
for b in bxs:
|
|
465
|
+
grouped[b["col_id"]].append(b)
|
|
466
|
+
|
|
467
|
+
return boxes
|
|
468
|
+
|
|
469
|
+
def _text_merge(self, zoomin=3):
|
|
470
|
+
# merge adjusted boxes
|
|
471
|
+
bxs = self._assign_column(self.boxes, zoomin)
|
|
472
|
+
|
|
473
|
+
def end_with(b, txt):
|
|
474
|
+
txt = txt.strip()
|
|
475
|
+
tt = b.get("text", "").strip()
|
|
476
|
+
return tt and tt.find(txt) == len(tt) - len(txt)
|
|
477
|
+
|
|
478
|
+
def start_with(b, txts):
|
|
479
|
+
tt = b.get("text", "").strip()
|
|
480
|
+
return tt and any([tt.find(t.strip()) == 0 for t in txts])
|
|
481
|
+
|
|
482
|
+
# horizontally merge adjacent box with the same layout
|
|
483
|
+
i = 0
|
|
484
|
+
while i < len(bxs) - 1:
|
|
485
|
+
b = bxs[i]
|
|
486
|
+
b_ = bxs[i + 1]
|
|
487
|
+
|
|
488
|
+
if b["page_number"] != b_["page_number"] or b.get("col_id") != b_.get("col_id"):
|
|
489
|
+
i += 1
|
|
490
|
+
continue
|
|
491
|
+
|
|
492
|
+
if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
|
|
493
|
+
i += 1
|
|
494
|
+
continue
|
|
495
|
+
|
|
496
|
+
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
|
|
497
|
+
# merge
|
|
498
|
+
bxs[i]["x1"] = b_["x1"]
|
|
499
|
+
bxs[i]["top"] = (b["top"] + b_["top"]) / 2
|
|
500
|
+
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
|
|
501
|
+
bxs[i]["text"] += b_["text"]
|
|
502
|
+
bxs.pop(i + 1)
|
|
503
|
+
continue
|
|
504
|
+
i += 1
|
|
505
|
+
self.boxes = bxs
|
|
506
|
+
|
|
507
|
+
def _naive_vertical_merge(self, zoomin=3):
    """Merge vertically adjacent boxes of the same column into paragraphs.

    Boxes are grouped by ``(page_number, col_id)`` and sorted top to bottom.
    Within a group, a box is joined with the next one (text joined by a
    single space, bounding box extended) unless one of the skip rules
    below applies. ``self.boxes`` is replaced by the merged boxes sorted by
    page, column and ``top``.

    Args:
        zoomin: Forwarded to ``_assign_column``.
    """
    bxs = self._assign_column(self.boxes, zoomin)

    grouped = defaultdict(list)
    for b in bxs:
        grouped[(b["page_number"], b.get("col_id", 0))].append(b)

    merged_boxes = []
    for (pg, col), bxs in grouped.items():
        bxs = sorted(bxs, key=lambda x: (x["top"], x["x0"]))
        if not bxs:
            continue

        # Parsed as: mean_height[pg - 1] if mean_height else (median or 10).
        mh = self.mean_height[pg - 1] if self.mean_height else np.median([b["bottom"] - b["top"] for b in bxs]) or 10

        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]

            # NOTE(review): every box in a group shares one page_number, so
            # this cross-page check looks unreachable here - confirm.
            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue

            # drop boxes with blank text
            if not b["text"].strip():
                bxs.pop(i)
                continue

            # (the blank-text half of this test was already handled above)
            if not b["text"].strip() or b.get("layoutno") != b_.get("layoutno"):
                i += 1
                continue

            # too far apart vertically
            if b_["top"] - b["bottom"] > mh * 1.5:
                i += 1
                continue

            # require at least 30 % horizontal overlap relative to the
            # narrower box
            overlap = max(0, min(b["x1"], b_["x1"]) - max(b["x0"], b_["x0"]))
            if overlap / max(1, min(b["x1"] - b["x0"], b_["x1"] - b_["x0"])) < 0.3:
                i += 1
                continue

            # features in favour of concatenating: ``b`` ends with (or has
            # as second-to-last char) an "open" punctuation mark, or ``b_``
            # starts with a closing one
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features: the boxes do not overlap horizontally at all
            detach_feats = [b["x1"] < b_["x0"], b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug(
                    "{} {} {} {}".format(
                        b["text"],
                        b_["text"],
                        any(feats),
                        any(concatting_feats),
                    )
                )
                i += 1
                continue

            # merge ``b_`` into ``b`` and retry ``b`` against the next box
            b["text"] = (b["text"].rstrip() + " " + b_["text"].lstrip()).strip()
            b["bottom"] = b_["bottom"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)

        merged_boxes.extend(bxs)

    self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))
|
|
584
|
+
|
|
585
|
+
def _final_reading_order_merge(self, zoomin=3):
|
|
586
|
+
if not self.boxes:
|
|
587
|
+
return
|
|
588
|
+
|
|
589
|
+
self.boxes = self._assign_column(self.boxes, zoomin=zoomin)
|
|
590
|
+
|
|
591
|
+
pages = defaultdict(lambda: defaultdict(list))
|
|
592
|
+
for b in self.boxes:
|
|
593
|
+
pg = b["page_number"]
|
|
594
|
+
col = b.get("col_id", 0)
|
|
595
|
+
pages[pg][col].append(b)
|
|
596
|
+
|
|
597
|
+
for pg in pages:
|
|
598
|
+
for col in pages[pg]:
|
|
599
|
+
pages[pg][col].sort(key=lambda x: (x["top"], x["x0"]))
|
|
600
|
+
|
|
601
|
+
new_boxes = []
|
|
602
|
+
for pg in sorted(pages.keys()):
|
|
603
|
+
for col in sorted(pages[pg].keys()):
|
|
604
|
+
new_boxes.extend(pages[pg][col])
|
|
605
|
+
|
|
606
|
+
self.boxes = new_boxes
|
|
607
|
+
|
|
608
|
+
def _concat_downward(self, concat_between_pages=True):
    """Sort ``self.boxes`` by Y and return.

    NOTE(review): the unconditional ``return`` right after the sort makes
    everything below it unreachable. The remaining code (model-driven
    merging of vertically adjacent boxes) is kept as found; decide whether
    to delete it or to re-enable it.

    Args:
        concat_between_pages: Only read by the unreachable part, where it
            allows or forbids chaining a box with one on a later page.
    """
    self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
    return

    # ---- unreachable from here on ----

    # count boxes in the same row as a feature
    for i in range(len(self.boxes)):
        mh = self.mean_height[self.boxes[i]["page_number"] - 1]
        self.boxes[i]["in_row"] = 0
        # look at up to 12 neighbours on either side
        j = max(0, i - 12)
        while j < min(i + 12, len(self.boxes)):
            if j == i:
                j += 1
                continue
            ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
            if abs(ydis) < 1:
                self.boxes[i]["in_row"] += 1
            elif ydis > 0:
                break
            j += 1

    # concat between rows
    boxes = deepcopy(self.boxes)
    blocks = []
    while boxes:
        chunks = []

        def dfs(up, dp):
            # Append ``up`` to the current chunk, then look among the next
            # boxes (starting at index ``dp``) for one that continues it.
            chunks.append(up)
            i = dp
            while i < min(dp + 12, len(boxes)):
                ydis = self._y_dis(up, boxes[i])
                smpg = up["page_number"] == boxes[i]["page_number"]
                mh = self.mean_height[up["page_number"] - 1]
                mw = self.mean_width[up["page_number"] - 1]
                # stop once candidates are too far below
                if smpg and ydis > mh * 4:
                    break
                if not smpg and ydis > mh * 16:
                    break
                down = boxes[i]
                if not concat_between_pages and down["page_number"] > up["page_number"]:
                    break

                if up.get("R", "") != down.get("R", "") and up["text"][-1] != ",":
                    i += 1
                    continue

                if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) or not down["text"].strip():
                    i += 1
                    continue

                if not down["text"].strip() or not up["text"].strip():
                    i += 1
                    continue

                # horizontally too far apart
                if up["x1"] < down["x0"] - 10 * mw or up["x0"] > down["x1"] + 10 * mw:
                    i += 1
                    continue

                if i - dp < 5 and up.get("layout_type") == "text":
                    # differing defaults: boxes without layoutno never match
                    if up.get("layoutno", "1") == down.get("layoutno", "2"):
                        dfs(down, i + 1)
                        boxes.pop(i)
                        return
                    i += 1
                    continue

                # let the model decide whether ``down`` continues ``up``
                fea = self._updown_concat_features(up, down)
                if self.updown_cnt_mdl.predict(xgb.DMatrix([fea]))[0] <= 0.5:
                    i += 1
                    continue
                dfs(down, i + 1)
                boxes.pop(i)
                return

        dfs(boxes[0], 1)
        boxes.pop(0)
        if chunks:
            blocks.append(chunks)

    # concat within each block
    boxes = []
    for b in blocks:
        if len(b) == 1:
            boxes.append(b[0])
            continue
        t = b[0]
        for c in b[1:]:
            t["text"] = t["text"].strip()
            c["text"] = c["text"].strip()
            if not c["text"]:
                continue
            if t["text"] and re.match(r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                t["text"] += " "
            t["text"] += c["text"]
            t["x0"] = min(t["x0"], c["x0"])
            t["x1"] = max(t["x1"], c["x1"])
            t["page_number"] = min(t["page_number"], c["page_number"])
            t["bottom"] = c["bottom"]
            if not t["layout_type"] and c["layout_type"]:
                t["layout_type"] = c["layout_type"]
        boxes.append(t)

    self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
|
|
711
|
+
|
|
712
|
+
def _filter_forpages(self):
    """Remove table-of-contents / acknowledgement material from ``self.boxes``.

    Two strategies are applied, both mutating ``self.boxes`` in place:

    1. If a box whose whole text is a heading such as "contents", "目录" or
       "acknowledge" is found, that heading and the first non-blank entry
       after it are dropped. The first entry's leading text is remembered,
       and every box up to the next box starting with that same text
       (searched within 128 boxes) is dropped as well.
    2. Only when no such heading exists: every page holding more than three
       boxes that contain dot leaders is removed entirely.
    """
    if not self.boxes:
        return

    def entry_prefix(idx, english):
        # English entries are identified by their first two words, anything
        # else by its first three characters.
        stripped = self.boxes[idx]["text"].strip()
        return " ".join(stripped.split()[:2]) if english else stripped[:3]

    findit = False
    i = 0
    while i < len(self.boxes):
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
            i += 1
            continue
        findit = True
        eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
        self.boxes.pop(i)  # the heading itself
        if i >= len(self.boxes):
            break
        prefix = entry_prefix(i, eng)
        while not prefix:
            # Skip blank boxes between the heading and its first entry.
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = entry_prefix(i, eng)
        if i >= len(self.boxes):
            # Ran out of boxes while skipping blanks: nothing left to pop.
            break
        self.boxes.pop(i)  # the first listed entry
        if i >= len(self.boxes) or not prefix:
            break
        for j in range(i, min(i + 128, len(self.boxes))):
            # The prefix is document text, not a pattern: compare literally
            # so characters such as "(", "[" or ")" cannot break the match.
            if not self.boxes[j]["text"].startswith(prefix):
                continue
            del self.boxes[i:j]
            break
    if findit:
        return

    page_dirty = [0] * len(self.page_images)
    for b in self.boxes:
        if re.search(r"(··|··|··)", b["text"]):
            page_dirty[b["page_number"] - 1] += 1
    page_dirty = {page_no + 1 for page_no, hits in enumerate(page_dirty) if hits > 3}
    if not page_dirty:
        return
    # Slice assignment keeps the same list object, as the original pops did.
    self.boxes[:] = [b for b in self.boxes if b["page_number"] not in page_dirty]
|
|
757
|
+
|
|
758
|
+
def _merge_with_same_bullet(self):
    """Merge consecutive boxes that begin with the same bullet character.

    Walks ``self.boxes`` pairwise and mutates it in place. Boxes with blank
    text are dropped. Two neighbours are merged (upper text, newline, lower
    text) when their first characters are equal, that character is neither
    one of the listed Latin letters nor Chinese (per ``is_chinese``), and
    the upper box does not start below the lower box's bottom edge.
    """
    latin_letters = set("qwertyuopasdfghjklzxcvbnm")
    idx = 0
    while idx + 1 < len(self.boxes):
        upper = self.boxes[idx]
        lower = self.boxes[idx + 1]

        upper_text = upper["text"].strip()
        if not upper_text:
            self.boxes.pop(idx)
            continue
        lower_text = lower["text"].strip()
        if not lower_text:
            self.boxes.pop(idx + 1)
            continue

        bullet = upper_text[0]
        mergeable = (
            bullet == lower_text[0]
            and bullet.lower() not in latin_letters
            and not is_chinese(bullet)
            and not upper["top"] > lower["bottom"]
        )
        if not mergeable:
            idx += 1
            continue

        # Fold the upper box into the lower one, then drop the upper box;
        # the index stays put so the merged box is compared with its next
        # neighbour.
        lower["text"] = upper["text"] + "\n" + lower["text"]
        lower["x0"] = min(upper["x0"], lower["x0"])
        lower["x1"] = max(upper["x1"], lower["x1"])
        lower["top"] = upper["top"]
        self.boxes.pop(idx)
|
|
783
|
+
|
|
784
|
+
def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False):
    """Move table and figure boxes out of ``self.boxes`` and render them.

    Steps, in order:
      1. Pop every box whose ``layout_type`` is "table" (and "figure" when
         ``need_image`` is true) from ``self.boxes``, grouping them by
         "<page_number>-<layoutno>". "Source: ..." footnotes inside such
         layouts are discarded.
      2. Merge a table group into the preceding group when the two sit on
         consecutive pages, are vertically close, and the preceding group
         was not followed by a caption/title/reference box.
      3. Pop caption boxes and attach each to the nearest table or figure.
      4. Crop an image per group and build the table content through
         ``self.tbl_det.construct_table``.

    Args:
        need_image: when false, figure boxes are left in ``self.boxes``.
        ZM: zoom factor between box coordinates and page-image pixels.
        return_html: forwarded to ``construct_table`` as ``html``.
        need_position: when true, each result is zipped with its positions.
        separate_tables_figures: when true, figures are returned in a
            second list instead of being mixed into the first one.

    Returns:
        ``res`` (or ``zip(res, positions)``), or a ``(tables, figures)``
        pair of such lists when ``separate_tables_figures`` is true. Each
        result item is ``(image_or_None, content)``.
    """
    tables = {}
    figures = {}
    # "Data source:" style footnotes. The class accepts the ASCII colon, the
    # full-width colon (U+FF1A) and a space; the original listed ':' twice,
    # which suggests the full-width form was lost in transit.
    source_note = r"(数据|资料|图表)*来源[:\uff1a ]"

    # extract figure and table boxes
    i = 0
    lst_lout_no = ""
    nomerge_lout_no = []
    while i < len(self.boxes):
        if "layoutno" not in self.boxes[i]:
            i += 1
            continue
        lout_no = str(self.boxes[i]["page_number"]) + "-" + str(self.boxes[i]["layoutno"])
        if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", "figure caption", "reference"]:
            # Something separates the previous table/figure from what follows,
            # so that layout must not be merged with a later one.
            nomerge_lout_no.append(lst_lout_no)
        if self.boxes[i]["layout_type"] == "table":
            if re.match(source_note, self.boxes[i]["text"]):
                self.boxes.pop(i)
                continue
            tables.setdefault(lout_no, []).append(self.boxes[i])
            self.boxes.pop(i)
            lst_lout_no = lout_no
            continue
        if need_image and self.boxes[i]["layout_type"] == "figure":
            if re.match(source_note, self.boxes[i]["text"]):
                self.boxes.pop(i)
                continue
            figures.setdefault(lout_no, []).append(self.boxes[i])
            self.boxes.pop(i)
            lst_lout_no = lout_no
            continue
        i += 1

    # merge table on different pages
    nomerge_lout_no = set(nomerge_lout_no)
    tbls = sorted(tables.items(), key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

    i = len(tbls) - 1
    while i - 1 >= 0:
        k0, bxs0 = tbls[i - 1]
        k, bxs = tbls[i]
        i -= 1
        if k0 in nomerge_lout_no:
            continue
        if bxs[0]["page_number"] == bxs0[0]["page_number"]:
            continue
        if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
            continue
        mh = self.mean_height[bxs[0]["page_number"] - 1]
        if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
            continue
        tables[k0].extend(tables[k])
        del tables[k]

    def x_overlapped(a, b):
        return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

    # find captions and pop out
    i = 0
    while i < len(self.boxes):
        c = self.boxes[i]
        if not TableStructureRecognizer.is_caption(c):
            i += 1
            continue

        # find the nearest layouts
        def nearest(tbls):
            nonlocal c
            mink = ""
            minv = 1000000000
            for k, bxs in tbls.items():
                for b in bxs:
                    if b.get("layout_type", "").find("caption") >= 0:
                        continue
                    y_dis = self._y_dis(c, b)
                    x_dis = self._x_dis(c, b) if not x_overlapped(c, b) else 0
                    dis = y_dis * y_dis + x_dis * x_dis
                    if dis < minv:
                        mink = k
                        minv = dis
            return mink, minv

        tk, tv = nearest(tables)
        fk, fv = nearest(figures)
        if tv < fv and tk:
            tables[tk].insert(0, c)
            logging.debug("TABLE:" + self.boxes[i]["text"] + "; Cap: " + tk)
        elif fk:
            figures[fk].insert(0, c)
            # Report the figure key the caption was attached to.
            logging.debug("FIGURE:" + self.boxes[i]["text"] + "; Cap: " + fk)
        # NOTE(review): the caption is removed from self.boxes even when no
        # table or figure exists to receive it - confirm this is intended.
        self.boxes.pop(i)

    def cropout(bxs, ltype, poss):
        """Crop the region covering ``bxs``; appends positions to ``poss``."""
        nonlocal ZM
        pn = set([b["page_number"] - 1 for b in bxs])
        if len(pn) < 2:
            pn = list(pn)[0]
            ht = self.page_cum_height[pn]
            b = {"x0": np.min([b["x0"] for b in bxs]), "top": np.min([b["top"] for b in bxs]) - ht, "x1": np.max([b["x1"] for b in bxs]), "bottom": np.max([b["bottom"] for b in bxs]) - ht}
            louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
            ii = Recognizer.find_overlapped(b, louts, naive=True)
            if ii is not None:
                # Prefer the detected layout rectangle over the box hull.
                b = louts[ii]
            else:
                logging.warning("Missing layout match: %s,%s", pn + 1, bxs[0].get("layoutno", ""))

            left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
            if right < left:
                right = left + 1

            # Make sure the crop coordinates stay within the image bounds.
            img_width, img_height = self.page_images[pn].size
            crop_left = max(0, int(left * ZM))
            crop_top = max(0, int(top * ZM))
            crop_right = min(img_width, max(crop_left + 1, int(right * ZM)))
            crop_bottom = min(img_height, max(crop_top + 1, int(bott * ZM)))

            poss.append((pn + self.page_from, left, right, top, bott))

            try:
                return self.page_images[pn].crop((crop_left, crop_top, crop_right, crop_bottom))
            except Exception as e:
                logging.warning(f"Failed to crop image: {e}")
                return None

        # Boxes span several pages: crop page by page and stack vertically.
        by_page = {}
        for b in bxs:
            by_page.setdefault(b["page_number"] - 1, []).append(b)
        by_page = sorted(by_page.items(), key=lambda x: x[0])
        # A failed per-page crop yields None; skip it instead of crashing on
        # `.size` below.
        imgs = [im for im in (cropout(arr, ltype, poss) for _, arr in by_page) if im is not None]
        if not imgs:
            return None
        pic = Image.new("RGB", (int(np.max([im.size[0] for im in imgs])), int(np.sum([im.size[1] for im in imgs]))), (245, 245, 245))
        height = 0
        for img in imgs:
            pic.paste(img, (0, int(height)))
            height += img.size[1]
        return pic

    res = []
    positions = []
    figure_results = []
    figure_positions = []
    # counter for figures by page
    figure_counter_by_page = {}
    # crop figure out and add caption
    for k, bxs in figures.items():
        txt = "\n".join([b["text"] for b in bxs])
        # If the text is empty, use a default description but still process
        # the image.
        if not txt:
            # Build a unique identifier from the page number and a counter.
            page_num = bxs[0]["page_number"]
            figure_counter_by_page[page_num] = figure_counter_by_page.get(page_num, 0) + 1
            txt = f"Figure-P{page_num}-{figure_counter_by_page[page_num]}"

        poss = []

        cropped_img = cropout(bxs, "figure", poss)
        if cropped_img is not None:  # only keep images that were cropped successfully
            if separate_tables_figures:
                figure_results.append((cropped_img, [txt]))
                figure_positions.append(poss)
            else:
                res.append((cropped_img, [txt]))
                positions.append(poss)

    for k, bxs in tables.items():
        if not bxs:
            continue
        bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))

        poss = []

        res.append((cropout(bxs, "table", poss), self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
        positions.append(poss)

    if separate_tables_figures:
        assert len(positions) + len(figure_positions) == len(res) + len(figure_results)
        if need_position:
            return list(zip(res, positions)), list(zip(figure_results, figure_positions))
        else:
            return res, figure_results
    else:
        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        else:
            return res
|
|
981
|
+
|
|
982
|
+
def proj_match(self, line):
    """Classify ``line`` as a heading / list-item marker.

    Returns:
        ``None`` when the line has at most two characters or matches no
        rule, ``False`` when it consists only of digits and numeric
        punctuation, otherwise the integer id of the first rule whose
        pattern matches at the start of the line.
    """
    if len(line) <= 2:
        return
    if re.match(r"[0-9 ().,%%+/-]+$", line):
        return False
    # Rules are tried in order and the first hit wins. Full-width
    # punctuation is written as \uXXXX escapes next to the ASCII form so the
    # patterns stay valid even if the file is Unicode-normalised.
    # NOTE(review): rule 6 accepts "." after "N.N", so "1.2.3 ..." is
    # reported as 6 and rules 7/8 look unreachable - left as is, confirm.
    rules = [
        (r"第[零一二三四五六七八九十百]+章", 1),
        (r"第[零一二三四五六七八九十百]+[条节]", 2),
        (r"[零一二三四五六七八九十百]+[、 ]", 3),
        (r"[\(\uff08][零一二三四五六七八九十百]+[\)\uff09]", 4),
        (r"[0-9]+(、|\.[ ]|\.[^0-9])", 5),
        (r"[0-9]+\.[0-9]+(、|[. ]|[^0-9])", 6),
        (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 7),
        (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 8),
        (r".{,48}[:\uff1a?\uff1f]$", 9),
        # Was r"[0-9]+)", an unbalanced parenthesis that made re.match raise.
        (r"[0-9]+[)\uff09]", 10),
        (r"[\(\uff08][0-9]+[\)\uff09]", 11),
        (r"[零一二三四五六七八九十百]+是", 12),
        (r"[⚫•➢✓]", 12),
    ]
    for p, j in rules:
        if re.match(p, line):
            return j
    return
|
|
1005
|
+
|
|
1006
|
+
def _line_tag(self, bx, ZM):
    """Build the ``@@pages\\tx0\\tx1\\ttop\\tbottom##`` position tag for a box.

    ``top``/``bottom`` are made relative to the box's first page by
    subtracting ``self.page_cum_height``. While the bottom edge lies beyond
    the current page image (image height divided by ``ZM``), the next page
    is appended to the page list and the bottom is reduced by that page's
    height. Returns ``""`` when a page involved has no image.
    """
    first_page = bx["page_number"]
    page_offset = self.page_cum_height[first_page - 1]
    top = bx["top"] - page_offset
    bott = bx["bottom"] - page_offset

    pages = [first_page]
    image_count = len(self.page_images)
    if pages[-1] - 1 >= image_count:
        return ""

    while True:
        page_px_height = self.page_images[pages[-1] - 1].size[1]
        if not bott * ZM > page_px_height:
            break
        # The box spills onto the following page.
        bott -= page_px_height / ZM
        pages.append(pages[-1] + 1)
        if pages[-1] - 1 >= image_count:
            return ""

    page_label = "-".join([str(p) for p in pages])
    return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(page_label, bx["x0"], bx["x1"], top, bott)
|
|
1020
|
+
|
|
1021
|
+
def __filterout_scraps(self, boxes, ZM):
    """Group boxes into paragraphs and drop narrow, unlabelled scraps.

    ``boxes`` is consumed (emptied) by this call. Starting from the first
    remaining box, following boxes on the same page are chained onto it when
    they are horizontally close (or the current line is a heading per
    ``proj_match`` / a "title" layout). A chain is kept when it started with
    a heading or its mean width is at least 35% of the page width or above
    200; otherwise it is only logged.

    Returns:
        The kept chains joined by blank lines; inside a chain every line is
        its text followed by ``self._line_tag(line, ZM)``.
    """

    def width(b):
        return b["x1"] - b["x0"]

    def height(b):
        return b["bottom"] - b["top"]

    def usefull(b):
        # A box is worth keeping if it has a layout label, is wider than a
        # third of the page, or is taller than the page's mean char height.
        if b.get("layout_type"):
            return True
        if width(b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
            return True
        if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
            return True
        return False

    res = []
    while boxes:
        lines = []
        widths = []
        pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
        mh = self.mean_height[boxes[0]["page_number"] - 1]
        mj = self.proj_match(boxes[0]["text"]) or boxes[0].get("layout_type", "") == "title"

        def dfs(line, st):
            nonlocal mh, pw, lines, widths
            lines.append(line)
            widths.append(width(line))
            mmj = self.proj_match(line["text"]) or line.get("layout_type", "") == "title"
            for i in range(st + 1, min(st + 20, len(boxes))):
                if (boxes[i]["page_number"] - line["page_number"]) > 0:
                    break
                if not mmj and self._y_dis(line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                    break

                if not usefull(boxes[i]):
                    continue
                if mmj or (self._x_dis(boxes[i], line) < pw / 10):
                    # concat following
                    dfs(boxes[i], i)
                    boxes.pop(i)
                    break

        try:
            if usefull(boxes[0]):
                dfs(boxes[0], 0)
            else:
                logging.debug("WASTE: " + boxes[0]["text"])
        except Exception:
            # Still best-effort, but leave a trace instead of hiding the error.
            logging.warning("__filterout_scraps: failed to chain boxes", exc_info=True)
        boxes.pop(0)
        # np.mean([]) would warn and yield NaN; 0.0 takes the same branches.
        mw = np.mean(widths) if widths else 0.0
        if mj or mw / pw >= 0.35 or mw > 200:
            res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
        else:
            logging.debug("REMOVED: " + "<<".join([c["text"] for c in lines]))

    return "\n\n".join(res)
|
|
1080
|
+
|
|
1081
|
+
@staticmethod
def total_page_number(fnm, binary=None):
    """Return the number of pages in a PDF, or ``None`` if it cannot be read.

    Args:
        fnm: path handed to ``pdfplumber.open`` when ``binary`` is falsy.
        binary: optional raw PDF bytes; when given they are used instead of
            ``fnm``.

    Any exception is logged and swallowed, in which case ``None`` is
    returned implicitly.
    """
    try:
        # The object stored under LOCK_KEY_pdfplumber in sys.modules is used
        # as a context manager around every pdfplumber call in this file.
        with sys.modules[LOCK_KEY_pdfplumber]:
            source = fnm if not binary else BytesIO(binary)
            # The context manager closes the PDF even if counting pages fails.
            with pdfplumber.open(source) as pdf:
                return len(pdf.pages)
    except Exception:
        logging.exception("total_page_number")
|
|
1091
|
+
|
|
1092
|
+
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
    """Render pages, collect embedded characters, read outlines and run OCR.

    Resets the per-document state on ``self`` (boxes, mean sizes, page
    layout, cumulative heights...), renders pages ``page_from:page_to`` at
    ``72 * zoomin`` dpi, guesses whether the document is English, then calls
    ``self.__ocr`` for each page (in parallel when ``self.parallel_limiter``
    is set). If no boxes result and ``zoomin < 9``, the whole step is
    retried once with a three times larger zoom.

    Args:
        fnm: file path (``str``) or raw PDF bytes.
        zoomin: zoom factor applied to the 72 dpi base resolution.
        page_from, page_to: slice bounds into the PDF's pages.
        callback: optional progress callback, called with a fraction.
    """
    self.lefted_chars = []
    self.mean_height = []
    self.mean_width = []
    self.boxes = []
    self.garbages = {}
    self.page_cum_height = [0]
    self.page_layout = []
    self.page_from = page_from
    # Defined up front so that a PDF which cannot be opened gives an empty
    # result instead of an AttributeError (or stale pages from a previous
    # document) further down.
    self.page_images = []
    self.page_chars = []
    self.total_page = 0
    start = timer()
    try:
        with sys.modules[LOCK_KEY_pdfplumber]:
            with pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) as pdf:
                self.pdf = pdf
                self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])]

                try:
                    self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
                except Exception as e:
                    logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                    self.page_chars = [[] for _ in range(page_to - page_from)]  # If failed to extract, using empty list instead.

                self.total_page = len(self.pdf.pages)

    except Exception:
        logging.exception("RAGFlowPdfParser __images__")
    logging.info(f"__images__ dedupe_chars cost {timer() - start}s")

    self.outlines = []
    try:
        with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf:
            self.pdf = pdf

            outlines = self.pdf.outline

            def dfs(arr, depth):
                # Dict entries are outline items; nested lists are children.
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)

    except Exception as e:
        logging.warning(f"Outlines exception: {e}")

    if not self.outlines:
        logging.warning("Miss outlines")

    logging.debug("Images converted.")
    # Per page: does a random sample of its characters contain a run of 30+
    # ASCII letters / digits / punctuation?
    self.is_english = [
        re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
        for i in range(len(self.page_chars))
    ]
    if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
        self.is_english = True
    else:
        self.is_english = False

    async def __img_ocr(i, id, img, chars, limiter):
        # Append a space to a character when the gap to the next one is at
        # least half of the narrower character's width.
        j = 0
        while j + 1 < len(chars):
            if (
                chars[j]["text"]
                and chars[j + 1]["text"]
                and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"])
                and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"], chars[j]["width"]) / 2
            ):
                chars[j]["text"] += " "
            j += 1

        if limiter:
            async with limiter:
                await asyncio.to_thread(self.__ocr, i + 1, img, chars, zoomin, id)
        else:
            self.__ocr(i + 1, img, chars, zoomin, id)

        if callback and i % 6 == 5:
            callback((i + 1) * 0.6 / len(self.page_images))

    async def __img_ocr_launcher():
        def __ocr_preprocess():
            # Reads `i` and `img` from the enclosing loop. Embedded chars are
            # ignored when the document was judged to be English.
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(np.median(sorted([c["height"] for c in chars])) if chars else 0)
            self.mean_width.append(np.median(sorted([c["width"] for c in chars])) if chars else 8)
            self.page_cum_height.append(img.size[1] / zoomin)
            return chars

        if self.parallel_limiter:
            tasks = []

            for i, img in enumerate(self.page_images):
                chars = __ocr_preprocess()

                semaphore = self.parallel_limiter[i % settings.PARALLEL_DEVICES]

                # Defaults bind the current loop values to this task.
                async def wrapper(i=i, img=img, chars=chars, semaphore=semaphore):
                    await __img_ocr(
                        i,
                        i % settings.PARALLEL_DEVICES,
                        img,
                        chars,
                        semaphore,
                    )

                tasks.append(asyncio.create_task(wrapper()))
                await asyncio.sleep(0)

            try:
                await asyncio.gather(*tasks, return_exceptions=False)
            except Exception as e:
                logging.error(f"Error in OCR: {e}")
                for t in tasks:
                    t.cancel()
                await asyncio.gather(*tasks, return_exceptions=True)
                raise

        else:
            for i, img in enumerate(self.page_images):
                chars = __ocr_preprocess()
                await __img_ocr(i, 0, img, chars, None)

    start = timer()

    # Handle asyncio.run() in case there's already a running event loop.
    # Only the probe sits inside the try: a RuntimeError raised by the OCR
    # itself must propagate, not be mistaken for "no running loop".
    try:
        asyncio.get_running_loop()
        loop_running = True
    except RuntimeError:
        loop_running = False

    if loop_running:
        # There's a running loop, so run the launcher in a new thread that
        # owns its own event loop.
        result_queue: queue.Queue = queue.Queue()

        def runner():
            try:
                # Create a new event loop in this thread
                new_loop = asyncio.new_event_loop()
                asyncio.set_event_loop(new_loop)
                try:
                    result_queue.put((True, new_loop.run_until_complete(__img_ocr_launcher())))
                finally:
                    new_loop.close()
            except Exception as e:
                result_queue.put((False, e))

        thread = threading.Thread(target=runner, daemon=True)
        thread.start()
        thread.join()

        success, value = result_queue.get_nowait()
        if not success:
            raise value
    else:
        # No running event loop, safe to use asyncio.run()
        asyncio.run(__img_ocr_launcher())

    logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")

    if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
        # No embedded characters at all: re-judge the language from a sample
        # of the boxes now present in self.boxes.
        bxes = [b for bxs in self.boxes for b in bxs]
        self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

    logging.debug(f"Is it English: {self.is_english}")

    self.page_cum_height = np.cumsum(self.page_cum_height)
    assert len(self.page_cum_height) == len(self.page_images) + 1
    if len(self.boxes) == 0 and zoomin < 9:
        # Nothing recognised: retry once at three times the zoom.
        self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
|
|
1259
|
+
|
|
1260
|
+
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
    """Run the full parsing pipeline on one PDF.

    Stages, in order: page rendering / OCR, layout recognition, table
    structure recognition, text merging, downward concatenation,
    contents-page filtering, then table/figure extraction.

    Returns:
        ``(text, tables)``: the text produced by ``__filterout_scraps`` from
        a deep copy of the remaining boxes, and the result of
        ``_extract_table_figure`` (called without positions).
    """
    self.__images__(fnm, zoomin)

    # Recognition stages that need the zoom factor.
    self._layouts_rec(zoomin)
    self._table_transformer_job(zoomin)

    # Text clean-up stages that work on self.boxes directly.
    self._text_merge()
    self._concat_downward()
    self._filter_forpages()

    tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
    remaining = deepcopy(self.boxes)
    text = self.__filterout_scraps(remaining, zoomin)
    return text, tbls
|
|
1269
|
+
|
|
1270
|
+
def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
    """Parse a PDF into an ordered list of box dicts, tables/figures included.

    Runs OCR, layout analysis, table analysis and text merging, then extracts
    tables and figures and re-inserts each one into ``self.boxes`` next to
    the closest text box.

    Args:
        fnm: file path or raw PDF bytes (passed to ``__images__``).
        callback: optional ``callback(progress, message)`` progress hook.
        zoomin: zoom factor used for rendering and coordinates.

    Returns:
        A deep copy of ``self.boxes``. Text boxes gain "position_tag",
        "image" and "positions"; inserted table/figure boxes carry
        "layout_type", "text", "image" and "positions".
    """
    start = timer()
    self.__images__(fnm, zoomin, callback=callback)
    if callback:
        callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))

    start = timer()
    self._layouts_rec(zoomin)
    if callback:
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

    start = timer()
    self._table_transformer_job(zoomin)
    if callback:
        callback(0.83, "Table analysis ({:.2f}s)".format(timer() - start))

    start = timer()
    self._text_merge()
    self._concat_downward()
    if callback:
        callback(0.92, "Text merged ({:.2f}s)".format(timer() - start))

    start = timer()
    tbls, figs = self._extract_table_figure(True, zoomin, True, True, True)

    def insert_table_figures(tbls_or_figs, layout_type):
        def min_rectangle_distance(rect1, rect2):
            # Rects are (page, left, right, top, bottom); only the geometry
            # is compared, the page component is ignored.
            pn1, left1, right1, top1, bottom1 = rect1
            pn2, left2, right2, top2, bottom2 = rect2
            if right1 >= left2 and right2 >= left1 and bottom1 >= top2 and bottom2 >= top1:
                return 0
            if right1 < left2:
                dx = left2 - right1
            elif right2 < left1:
                dx = left1 - right2
            else:
                dx = 0
            if bottom1 < top2:
                dy = top2 - bottom1
            elif bottom2 < top1:
                dy = top1 - bottom2
            else:
                dy = 0
            return math.sqrt(dx * dx + dy * dy)

        for (img, txt), poss in tbls_or_figs:
            if isinstance(txt, list):
                txt = "\n".join(txt)
            pn, left, right, top, bott = poss[0]

            if self.boxes:
                bboxes = [(i, (b["page_number"], b["x0"], b["x1"], b["top"], b["bottom"])) for i, b in enumerate(self.boxes)]
                dists = [
                    (min_rectangle_distance((p, lft, rgt, tp + self.page_cum_height[p], bt + self.page_cum_height[p]), rect), i)
                    for i, rect in bboxes
                    for p, lft, rgt, tp, bt in poss
                ]
                # First entry with the smallest distance, as np.argmin did.
                _, min_i = min(dists, key=lambda d: d[0])
                if self.boxes[min_i]["bottom"] < top + self.page_cum_height[pn]:
                    # The nearest box ends above this item: insert after it.
                    min_i += 1
            else:
                # Every box was a table/figure, so no text box is left to
                # anchor to (the original crashed in argmin here).
                min_i = 0

            self.boxes.insert(
                min_i,
                {
                    "page_number": pn + 1,
                    "x0": left,
                    "x1": right,
                    "top": top + self.page_cum_height[pn],
                    "bottom": bott + self.page_cum_height[pn],
                    "layout_type": layout_type,
                    "text": txt,
                    "image": img,
                    "positions": [[pn + 1, int(left), int(right), int(top), int(bott)]],
                },
            )

    for b in self.boxes:
        b["position_tag"] = self._line_tag(b, zoomin)
        b["image"] = self.crop(b["position_tag"], zoomin)
        b["positions"] = [[pos[0][-1] + 1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(b["position_tag"])]

    insert_table_figures(tbls, "table")
    insert_table_figures(figs, "figure")
    if callback:
        callback(1, "Structured ({:.2f}s)".format(timer() - start))
    return deepcopy(self.boxes)
|
|
1353
|
+
|
|
1354
|
+
@staticmethod
def remove_tag(txt):
    """Return ``txt`` with every ``@@...##`` position tag removed.

    A tag is ``@@`` followed by tabs, digits, dots and dashes up to the
    nearest ``##``.
    """
    tag_pattern = re.compile(r"@@[\t0-9.-]+?##")
    return tag_pattern.sub("", txt)
|
|
1357
|
+
|
|
1358
|
+
@staticmethod
def extract_positions(txt):
    """Parse every position tag in ``txt``.

    Returns:
        A list of ``(page_indices, left, right, top, bottom)`` tuples, one
        per ``@@pages\\tleft\\tright\\ttop\\tbottom##`` tag found.
        ``page_indices`` is the dash-separated page list from the tag,
        converted to 0-based ints; the coordinates are floats.
    """

    def parse(tag):
        pages, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
        page_indices = [int(p) - 1 for p in pages.split("-")]
        return page_indices, float(left), float(right), float(top), float(bottom)

    return [parse(tag) for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt)]
|
|
1366
|
+
|
|
1367
|
+
def crop(self, text, ZM=3, need_position=False):
    """Cut the regions named by the position tags in ``text`` out of the pages.

    The crops are stacked vertically into a single image, preceded and
    followed by a dimmed context strip (up to 120 units above the first and
    below the last region).

    Args:
        text: string containing ``@@...##`` position tags.
        ZM: zoom factor between tag coordinates and page-image pixels.
        need_position: when true, also return the list of cropped regions
            as ``(page, left, right, top, bottom)`` tuples (context strips
            excluded).

    Returns:
        The stacked image, or ``(image, positions)`` when ``need_position``
        is true. ``None`` (or ``(None, None)``) when there is nothing to crop.
    """

    def nothing():
        # Shape of the "no result" value depends on need_position.
        return (None, None) if need_position else None

    regions = self.extract_positions(text)
    if not regions:
        return nothing()

    if not getattr(self, "page_images", None):
        logging.warning("crop called without page images; skipping image generation.")
        return nothing()

    page_count = len(self.page_images)

    # Keep only the page indices that actually have an image.
    usable = []
    for pns, left, right, top, bottom in regions:
        if not pns:
            logging.warning("Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            logging.warning(f"All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        usable.append((valid_pns, left, right, top, bottom))

    regions = usable
    if not regions:
        logging.warning("No valid positions after filtering; skip cropping.")
        return nothing()

    max_width = max(np.max([right - left for (_, left, right, _, _) in regions]), 6)
    GAP = 6

    # Context strip above the first region.
    head = regions[0]
    first_page_idx = head[0][0]
    regions.insert(0, ([first_page_idx], head[1], head[2], max(0, head[3] - 120), max(head[3] - GAP, 0)))

    # Context strip below the last region.
    tail = regions[-1]
    last_page_idx = tail[0][-1]
    if not (0 <= last_page_idx < page_count):
        logging.warning(f"Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        return nothing()
    last_page_height = self.page_images[last_page_idx].size[1] / ZM
    regions.append(
        (
            [last_page_idx],
            tail[1],
            tail[2],
            min(last_page_height, tail[4] + GAP),
            min(last_page_height, tail[4] + 120),
        )
    )

    imgs = []
    positions = []
    last_index = len(regions) - 1
    for ii, (pns, left, right, top, bottom) in enumerate(regions):
        is_content = 0 < ii < last_index
        if is_content:
            right = max(left + 10, right)
        else:
            right = left + max_width

        # Convert bottom to pixels measured from the top of the first page
        # by adding the heights of the pages before each continuation page.
        bottom *= ZM
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                logging.warning(f"Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            logging.warning(f"Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        first_image = self.page_images[pns[0]]
        first_height = first_image.size[1]
        imgs.append(first_image.crop((left * ZM, top * ZM, right * ZM, min(bottom, first_height))))
        if is_content:
            positions.append((pns[0] + self.page_from, left, right, top, min(bottom, first_height) / ZM))
        bottom -= first_height

        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                logging.warning(f"Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            next_image = self.page_images[pn]
            next_height = next_image.size[1]
            imgs.append(next_image.crop((left * ZM, 0, right * ZM, min(bottom, next_height))))
            if is_content:
                positions.append((pn + self.page_from, left, right, 0, min(bottom, next_height) / ZM))
            bottom -= next_height

    if not imgs:
        return nothing()

    total_height = int(sum(im.size[1] + GAP for im in imgs))
    total_width = int(np.max([im.size[0] for im in imgs]))
    pic = Image.new("RGB", (total_width, total_height), (245, 245, 245))

    offset = 0
    for ii, img in enumerate(imgs):
        if ii == 0 or ii + 1 == len(imgs):
            # Dim the first and last strip with a half-transparent overlay.
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(offset)))
        offset += img.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
|
|
1477
|
+
|
|
1478
|
+
def get_position(self, bx, ZM):
    """Split a box's vertical extent into one position tuple per page it touches.

    Args:
        bx: Mapping with ``page_number`` (used as a 1-based index into
            ``self.page_images``), ``x0``, ``x1``, ``top`` and ``bottom``.
            ``top``/``bottom`` have ``self.page_cum_height[page_number - 1]``
            subtracted to make them relative to the starting page.
        ZM: Divisor that converts page-image pixel heights into the units
            used by ``bx`` (presumably the render zoom factor -- confirm
            against callers).

    Returns:
        A list of ``(page_number, x0, x1, top, bottom)`` tuples. The first
        entry is for the box's starting page; one more entry is appended for
        every following page the box spills onto, with ``top`` reset to 0 and
        ``bottom`` clipped to that page's height.
    """
    poss = []
    pn = bx["page_number"]
    top = bx["top"] - self.page_cum_height[pn - 1]
    bott = bx["bottom"] - self.page_cum_height[pn - 1]
    page_count = len(self.page_images)
    poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
    # Stop at the last rendered page: a box whose bottom overflows it used to
    # step ``pn`` past the end of ``self.page_images`` and raise IndexError.
    while pn < page_count and bott * ZM > self.page_images[pn - 1].size[1]:
        bott -= self.page_images[pn - 1].size[1] / ZM
        top = 0
        pn += 1
        poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
    return poss
|
|
1490
|
+
|
|
1491
|
+
|
|
1492
|
+
class PlainParser:
    """Text-only PDF reader: extracts raw text lines and the outline titles."""

    def __init__(self):
        pass

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        """Read the text of pages ``[from_page:to_page]`` and collect outlines.

        ``filename`` is passed to ``pdf2_read`` as-is when it is a ``str``;
        anything else is wrapped in ``BytesIO`` first. Any exception raised
        while reading is logged and swallowed, so whatever lines were gathered
        before the failure are still returned.

        Returns:
            ``([(line, ""), ...], [])`` -- one pair per extracted text line,
            followed by an empty list. Outline entries are stored on
            ``self.outlines`` as ``(title, depth)`` tuples.
        """
        self.outlines = []
        text_lines = []
        try:
            source = filename if isinstance(filename, str) else BytesIO(filename)
            self.pdf = pdf2_read(source)
            for page in self.pdf.pages[from_page:to_page]:
                text_lines.extend(page.extract_text().split("\n"))

            def walk(nodes, depth):
                # Dict entries carry a title; anything else is treated as a
                # nested list of entries one level deeper.
                for node in nodes:
                    if isinstance(node, dict):
                        self.outlines.append((node["/Title"], depth))
                    else:
                        walk(node, depth + 1)

            walk(self.pdf.outline, 0)
        except Exception:
            logging.exception("Outlines exception")

        if not self.outlines:
            logging.warning("Miss outlines")

        return [(text, "") for text in text_lines], []

    def crop(self, ck, need_position):
        """Not supported by the plain-text parser."""
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        """Not supported by the plain-text parser."""
        raise NotImplementedError
|
|
1527
|
+
|
|
1528
|
+
|
|
1529
|
+
class VisionParser(RAGFlowPdfParser):
    """PDF parser that renders pages to images and asks a vision model to describe them."""

    def __init__(
        self,
        vision_model,
        model_cfg: PdfModelConfig | None = None,
        tokenizer_cfg: TokenizerConfig | None = None,
    ):
        """Store the vision model and forward both configs to the base parser."""
        super().__init__(model_cfg=model_cfg, tokenizer_cfg=tokenizer_cfg)
        self.vision_model = vision_model
        self.outlines = []

    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
        """Render pages ``[page_from:page_to]`` of the PDF into ``self.page_images``.

        ``fnm`` is opened directly when it is a ``str`` and wrapped in
        ``BytesIO`` otherwise. ``self.total_page`` is the page count of the
        whole document, not of the rendered slice. On any failure the error is
        logged, ``self.page_images`` is set to ``None`` and ``self.total_page``
        to 0. ``callback`` is accepted but not used here.
        """
        try:
            # Rendering happens while holding the lock object registered in
            # sys.modules under LOCK_KEY_pdfplumber.
            with sys.modules[LOCK_KEY_pdfplumber]:
                self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
                self.page_images = [p.to_image(resolution=72 * zoomin).annotated for p in self.pdf.pages[page_from:page_to]]
                self.total_page = len(self.pdf.pages)
        except Exception:
            self.page_images = None
            self.total_page = 0
            logging.exception("VisionParser __images__")

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        """Describe each page in ``[from_page, to_page)`` with the vision model.

        Keyword Args:
            callback: Optional ``callback(progress, message)``; forwarded to the
                vision helper and invoked after every processed page.
            zoomin: Render scale (default 3); page images are rendered at
                ``72 * zoomin`` resolution.

        Returns:
            ``(docs, [])`` where each doc is ``(text, tag)`` and ``tag`` is
            ``"@@<page>\\t0.0\\t<width>\\t0.0\\t<height>##"`` covering the whole
            page, with width/height divided by ``zoomin``.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)
        progress_cb = kwargs.get("callback")
        zoomin = kwargs.get("zoomin", 3)

        start_page = max(0, from_page)
        # Render from the clamped start so that image index 0 always
        # corresponds to absolute page ``start_page``.
        self.__images__(fnm=filename, zoomin=zoomin, page_from=start_page, page_to=to_page, callback=callback)
        # crop() adds self.page_from to page indices when reporting positions;
        # record the offset of the rendered slice so those come out absolute.
        self.page_from = start_page

        end_page = min(to_page, self.total_page)
        if not self.page_images or start_page >= end_page:
            return [], []

        try:
            from .llm_adapter.vision import vision_llm_chunk as picture_vision_llm_chunk
        except ImportError:
            # NOTE(review): the package file listing places ``llm_adapter``
            # next to ``parser`` rather than inside it, so fall back to the
            # parent package when the original path does not resolve.
            from ..llm_adapter.vision import vision_llm_chunk as picture_vision_llm_chunk

        rendered = len(self.page_images)
        all_docs = []
        for idx, img_binary in enumerate(self.page_images):
            # self.page_images only holds the requested slice, so idx is
            # relative to start_page. Comparing idx itself against start_page
            # (as before) skipped the leading pages of any from_page > 0 call.
            pdf_page_num = start_page + idx  # 0-based, absolute
            if pdf_page_num >= end_page:
                break

            text = picture_vision_llm_chunk(
                binary=img_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_describe_prompt(page=pdf_page_num + 1),
                callback=callback,
            )

            if progress_cb:
                progress_cb(idx * 1.0 / rendered, f"Processed: {idx + 1}/{rendered}")

            if text:
                width, height = img_binary.size
                # The tag keeps the slice-relative page (idx + 1), the same
                # value the original emitted; presumably crop() maps it back
                # to an index into self.page_images -- confirm against
                # the tag-parsing code.
                all_docs.append((
                    text,
                    f"@@{idx + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##"
                ))
        return all_docs, []
|
|
1588
|
+
|
|
1589
|
+
|
|
1590
|
+
# Script entry point: running this module directly is a deliberate no-op.
if __name__ == "__main__":
    pass
|