bisheng-langchain 0.0.1__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- bisheng_langchain/__init__.py +0 -0
- bisheng_langchain/chains/__init__.py +5 -0
- bisheng_langchain/chains/combine_documents/__init__.py +0 -0
- bisheng_langchain/chains/combine_documents/stuff.py +56 -0
- bisheng_langchain/chains/question_answering/__init__.py +240 -0
- bisheng_langchain/chains/retrieval_qa/__init__.py +0 -0
- bisheng_langchain/chains/retrieval_qa/base.py +89 -0
- bisheng_langchain/chat_models/__init__.py +11 -0
- bisheng_langchain/chat_models/host_llm.py +409 -0
- bisheng_langchain/chat_models/interface/__init__.py +10 -0
- bisheng_langchain/chat_models/interface/minimax.py +123 -0
- bisheng_langchain/chat_models/interface/openai.py +68 -0
- bisheng_langchain/chat_models/interface/types.py +61 -0
- bisheng_langchain/chat_models/interface/utils.py +5 -0
- bisheng_langchain/chat_models/interface/wenxin.py +114 -0
- bisheng_langchain/chat_models/interface/xunfei.py +233 -0
- bisheng_langchain/chat_models/interface/zhipuai.py +81 -0
- bisheng_langchain/chat_models/minimax.py +354 -0
- bisheng_langchain/chat_models/proxy_llm.py +354 -0
- bisheng_langchain/chat_models/wenxin.py +349 -0
- bisheng_langchain/chat_models/xunfeiai.py +355 -0
- bisheng_langchain/chat_models/zhipuai.py +379 -0
- bisheng_langchain/document_loaders/__init__.py +3 -0
- bisheng_langchain/document_loaders/elem_html.py +0 -0
- bisheng_langchain/document_loaders/elem_image.py +0 -0
- bisheng_langchain/document_loaders/elem_pdf.py +655 -0
- bisheng_langchain/document_loaders/parsers/__init__.py +5 -0
- bisheng_langchain/document_loaders/parsers/image.py +28 -0
- bisheng_langchain/document_loaders/parsers/test_image.py +286 -0
- bisheng_langchain/embeddings/__init__.py +7 -0
- bisheng_langchain/embeddings/host_embedding.py +133 -0
- bisheng_langchain/embeddings/interface/__init__.py +3 -0
- bisheng_langchain/embeddings/interface/types.py +23 -0
- bisheng_langchain/embeddings/interface/wenxin.py +86 -0
- bisheng_langchain/embeddings/wenxin.py +139 -0
- bisheng_langchain/vectorstores/__init__.py +3 -0
- bisheng_langchain/vectorstores/elastic_keywords_search.py +284 -0
- bisheng_langchain-0.0.1.dist-info/METADATA +64 -0
- bisheng_langchain-0.0.1.dist-info/RECORD +41 -0
- bisheng_langchain-0.0.1.dist-info/WHEEL +5 -0
- bisheng_langchain-0.0.1.dist-info/top_level.txt +1 -0
bisheng_langchain/document_loaders/elem_pdf.py
@@ -0,0 +1,655 @@
# flake8: noqa
"""Loads PDF with semantic splitter."""
import io
import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from collections import Counter
from copy import deepcopy
from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Union
from urllib.parse import urlparse

import fitz
import numpy as np
import pypdfium2
import requests
from bisheng_langchain.document_loaders.parsers import LayoutParser
from langchain.docstore.document import Document
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.pdf import BasePDFLoader
from shapely import Polygon
from shapely import box as Rect

RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r'\s+', flags=re.DOTALL)


def merge_rects(bboxes):
    """Merge an (N, 4) array of boxes into a single bounding box."""
    x0 = np.min(bboxes[:, 0])
    y0 = np.min(bboxes[:, 1])
    x1 = np.max(bboxes[:, 2])
    y1 = np.max(bboxes[:, 3])
    return [x0, y0, x1, y1]


def norm_rect(bbox):
    """Normalize a box so (x0, y0) is the top-left and (x1, y1) the bottom-right corner."""
    x0 = np.min([bbox[0], bbox[2]])
    x1 = np.max([bbox[0], bbox[2]])
    y0 = np.min([bbox[1], bbox[3]])
    y1 = np.max([bbox[1], bbox[3]])
    return np.asarray([x0, y0, x1, y1])


def find_max_continuous_seq(arr):
    """Find the longest run of consecutive integers in a sorted index array.

    Returns (start, length), with start expressed relative to arr[0].
    """
    n = len(arr)
    max_info = (0, 1)
    for i in range(n):
        m = 1
        for j in range(i + 1, n):
            if arr[j] - arr[j - 1] == 1:
                m += 1
            else:
                break

        if m > max_info[1]:
            max_info = (i, m)

    max_info = (max_info[0] + arr[0], max_info[1])
    return max_info
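As a quick illustration of what `find_max_continuous_seq` computes, on hypothetical index values (not part of the package source):

print(find_max_continuous_seq([2, 3, 4, 7, 8]))
# -> (2, 3): the longest consecutive run starts at 2 and has length 3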
def order_by_tbyx(block_info, th=10):
    """Sort blocks top-to-bottom, then left-to-right.

    block_info: [(b0, b1, b2, b3, text, x, y)+]
    th: vertical threshold; blocks whose y coordinates differ by less
        than th are treated as lying on the same line
    """
    # sort using y1 first and then x1
    res = sorted(block_info, key=lambda b: (b[1], b[0]))
    for i in range(len(res) - 1):
        for j in range(i, 0, -1):
            # restore the left-to-right order for blocks on (nearly) the same line
            if (abs(res[j + 1][1] - res[j][1]) < th
                    and (res[j + 1][0] < res[j][0])):
                res[j], res[j + 1] = deepcopy(res[j + 1]), deepcopy(res[j])
            else:
                break
    return res
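A minimal sketch of how `order_by_tbyx` reorders blocks; the tuples are hand-made and only the leading coordinates and the text field matter here:

blocks = [
    (50, 50, 400, 70, 'title', 0, 0),
    (300, 102, 400, 120, 'right', 1, 0),
    (50, 105, 200, 123, 'left', 2, 0),
]
# 'right' sorts before 'left' on y alone; the bubble pass restores the
# left-to-right order because their y values differ by less than th=10
print([b[4] for b in order_by_tbyx(blocks)])  # ['title', 'left', 'right']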
def join_lines(texts, is_table=False):
    """Join extracted line fragments into a single string.

    Table rows keep their line breaks; space-delimited (western) text is
    joined with spaces and de-hyphenated; everything else is concatenated
    directly.
    """
    if is_table:
        return '\n'.join(texts)

    flags = []
    PUNC_SET = set(['.', ',', ';', '?', '!'])
    for text in texts:
        flags.append(np.all([t.isalnum() for t in text.rsplit(' ', 5)]))

    if np.all(flags):
        t0 = texts[0]
        for t in texts[1:]:
            if t0[-1] == '-':
                # merge a word split by end-of-line hyphenation
                t0 = t0[:-1] + t
            elif t0[-1].isalnum() and t[0].isalnum():
                t0 += ' ' + t
            elif t0[-1] in PUNC_SET or t[0] in PUNC_SET:
                t0 += ' ' + t
            else:
                t0 += t
        return t0
    else:
        return ''.join(texts)
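A small sketch of `join_lines` on hand-made fragments (illustrative only):

# Space-delimited text: fragments are joined with spaces
print(join_lines(['The first line', 'continues on the second']))
# -> 'The first line continues on the second'

# Lines containing punctuation (e.g. CJK text) are concatenated directly
print(join_lines(['段落第一行,', '段落第二行。']))
# -> '段落第一行,段落第二行。'

# Table rows keep their line breaks
print(join_lines(['row 1', 'row 2'], is_table=True))
# -> 'row 1\nrow 2'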
class Segment:
    """Track covered x-extents within a whole span and expose the uncovered gaps."""

    def __init__(self, seg):
        self.whole = seg
        self.segs = []

    @staticmethod
    def is_align(seg0, seg1, delta=5, mode=0):
        # mode=0: edge align
        # mode=1: edge align or center align
        res = Segment.contain(seg0, seg1)
        if not res:
            return False
        else:
            if mode == 1:
                r1 = seg1[0] - seg0[0] <= delta or seg0[1] - seg1[1] <= delta
                c0 = (seg0[0] + seg0[1]) / 2
                c1 = (seg1[0] + seg1[1]) / 2
                r2 = abs(c1 - c0) <= delta
                return r1 or r2
            else:
                return seg1[0] - seg0[0] <= delta or seg0[1] - seg1[1] <= delta

    @staticmethod
    def contain(seg0, seg1):
        # seg1 lies fully inside seg0
        return seg0[0] <= seg1[0] and seg0[1] >= seg1[1]

    @staticmethod
    def overlap(seg0, seg1):
        max_x0 = max(seg0[0], seg1[0])
        min_x1 = min(seg0[1], seg1[1])
        return max_x0 < min_x1

    def _merge(self, segs):
        x0s = [s[0] for s in segs]
        x1s = [s[1] for s in segs]
        return (np.min(x0s), np.max(x1s))

    def add(self, seg):
        if not self.segs:
            self.segs.append(seg)
        else:
            overlaps = []
            non_overlaps = []
            for seg0 in self.segs:
                if Segment.overlap(seg0, seg):
                    overlaps.append(seg0)
                else:
                    non_overlaps.append(seg0)

            if not overlaps:
                self.segs.append(seg)
            else:
                # fuse the new segment with every segment it overlaps
                overlaps.append(seg)
                new_seg = self._merge(overlaps)
                non_overlaps.append(new_seg)
                self.segs = non_overlaps

    def get_free_segment(self, incr_margin=True, margin_threshold=10):
        """Return the gaps that no stored segment covers."""
        sorted_segs = sorted(self.segs, key=lambda x: x[0])
        n = len(sorted_segs)
        free_segs = []
        if incr_margin:
            if n > 0:
                seg_1st = sorted_segs[0]
                if (seg_1st[0] - self.whole[0]) > margin_threshold:
                    free_segs.append((self.whole[0], seg_1st[0]))

                seg_last = sorted_segs[-1]
                if (self.whole[1] - seg_last[1]) > margin_threshold:
                    free_segs.append((seg_last[1], self.whole[1]))

        for i in range(n - 1):
            x0 = sorted_segs[i][1]
            x1 = sorted_segs[i + 1][0]
            free_segs.append((x0, x1))

        return free_segs
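A minimal sketch of how `Segment` detects the gap between two columns, with hypothetical x-extents on a 600-unit-wide page:

segment = Segment((0, 600))
for x0, x1 in [(0, 280), (10, 270), (320, 600), (330, 560)]:
    segment.add((x0, x1))

print(segment.segs)                # two merged clusters: (0, 280) and (320, 600)
print(segment.get_free_segment())  # one uncovered gap between them: (280, 320)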
class PDFWithSemanticLoader(BasePDFLoader):
    """Load a PDF and split it into semantically coherent blocks.

    Page images are sent to a layout-analysis service; the detected titles,
    paragraphs and tables are then used to merge and order the raw text
    blocks extracted with PyMuPDF.
    """

    def __init__(self,
                 file_path: str,
                 password: Optional[Union[str, bytes]] = None,
                 layout_api_key: str = None,
                 layout_api_url: str = None,
                 is_join_table: bool = True,
                 with_columns: bool = False,
                 support_rotate: bool = False,
                 text_elem_sep: str = '\n',
                 start: int = 0,
                 n: int = None,
                 html_output_file: str = None,
                 verbose: bool = False) -> None:
        """Initialize with a file path."""
        self.layout_parser = LayoutParser(api_key=layout_api_key,
                                          api_base_url=layout_api_url)
        self.with_columns = with_columns
        self.is_join_table = is_join_table
        self.support_rotate = support_rotate
        self.start = start
        self.n = n
        self.html_output_file = html_output_file
        self.verbose = verbose
        self.text_elem_sep = text_elem_sep
        super().__init__(file_path)

    def _get_image_blobs(self, fitz_doc, pdf_reader, n=None, start=0):
        blobs = []
        pages = []
        if not n:
            n = fitz_doc.page_count
        for pg in range(start, start + n):
            bytes_img = None
            page = fitz_doc.load_page(pg)
            pages.append(page)
            mat = fitz.Matrix(1, 1)
            try:
                pm = page.get_pixmap(matrix=mat, alpha=False)
                bytes_img = pm.getPNGData()
            except Exception:
                # some PDF pages cannot be rendered by fitz; fall back to pdfium
                page = pdf_reader.get_page(pg)
                pil_image = page.render().to_pil()
                img_byte_arr = io.BytesIO()
                pil_image.save(img_byte_arr, format='PNG')
                bytes_img = img_byte_arr.getvalue()

            blobs.append(Blob(data=bytes_img))
        return blobs, pages

    def _allocate_semantic(self, page, layout):
        # layout-model classes (1-based): seal, image, title, paragraph,
        # table, page header, page number, page footer
        class_name = ['印章', '图片', '标题', '段落', '表格', '页眉', '页码', '页脚']
        effective_class_inds = [3, 4, 5, 999]
        non_conti_class_ids = [6, 7, 8]
        TEXT_ID = 4
        TABLE_ID = 5

        textpage = page.get_textpage()
        blocks = textpage.extractBLOCKS()

        if self.support_rotate:
            rotation_matrix = np.asarray(page.rotation_matrix).reshape((3, 2))
            c1 = (rotation_matrix[0, 0] - 1) <= 1e-6
            c2 = (rotation_matrix[1, 1] - 1) <= 1e-6
            is_rotated = c1 and c2
            if is_rotated:
                # map the block boxes back through the page rotation matrix
                new_blocks = []
                for b in blocks:
                    bbox = np.asarray([b[0], b[1], b[2], b[3]])
                    aug_bbox = bbox.reshape((-1, 2))
                    padding = np.ones((len(aug_bbox), 1))
                    aug_bbox = np.hstack([aug_bbox, padding])
                    bb = np.dot(aug_bbox, rotation_matrix).reshape(-1)
                    bb = norm_rect(bb)
                    info = (bb[0], bb[1], bb[2], bb[3], b[4], b[5], b[6])
                    new_blocks.append(info)

                blocks = new_blocks

        if not self.with_columns:
            blocks = order_by_tbyx(blocks)

        IMG_BLOCK_TYPE = 1
        text_polys = []
        text_rects = []
        texts = []
        for b in blocks:
            if b[-1] != IMG_BLOCK_TYPE:
                text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ',
                              b[4] or '').strip()
                if text:
                    texts.append(text)
                    text_polys.append(Rect(b[0], b[1], b[2], b[3]))
                    text_rects.append([b[0], b[1], b[2], b[3]])
        text_rects = np.asarray(text_rects)
        texts = np.asarray(texts)

        semantic_polys = []
        semantic_labels = []

        layout_info = json.loads(layout.page_content)
        for info in layout_info:
            bbs = info['bbox']
            coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]),
                      (bbs[6], bbs[7]))
            semantic_polys.append(Polygon(coords))
            semantic_labels.append(info['category_id'])

        # calculate the containing overlap: how much of each text block
        # lies inside each semantic region
        sem_cnt = len(semantic_polys)
        texts_cnt = len(text_polys)
        contain_matrix = np.zeros((sem_cnt, texts_cnt))
        for i in range(sem_cnt):
            for j in range(texts_cnt):
                inter = semantic_polys[i].intersection(text_polys[j]).area
                contain_matrix[i, j] = inter * 1.0 / text_polys[j].area

        # merge continuous text blocks by the containing matrix
        CONTAIN_THRESHOLD = 0.70
        contain_info = []
        for i in range(sem_cnt):
            ind = np.argwhere(contain_matrix[i, :] > CONTAIN_THRESHOLD)[:, 0]
            if len(ind) == 0:
                continue
            label = semantic_labels[i]
            if label in non_conti_class_ids:
                n = len(ind)
                contain_info.append((None, None, n, label, ind))
            else:
                start, n = find_max_continuous_seq(ind)
                if n >= 1:
                    contain_info.append((start, start + n, n, label, None))

        contain_info = sorted(contain_info, key=lambda x: x[2], reverse=True)
        mask = np.zeros(texts_cnt)
        new_block_info = []
        for info in contain_info:
            start, end, n, label, ind = info
            if label in non_conti_class_ids and np.all(mask[ind] == 0):
                rect = merge_rects(text_rects[ind])
                ori_orders = [blocks[i][-2] for i in ind]
                ts = texts[ind]
                rs = text_rects[ind]
                ord_ind = np.min(ori_orders)
                mask[ind] = 1
                new_block_info.append(
                    (rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))

            elif np.all(mask[start:end] == 0):
                rect = merge_rects(text_rects[start:end])
                ori_orders = [blocks[i][-2] for i in range(start, end)]
                arg_ind = np.argsort(ori_orders)
                ord_ind = np.min(ori_orders)

                ts = texts[start:end]
                rs = text_rects[start:end]
                if label == TABLE_ID:
                    # restore the original extraction order for table rows
                    ts = ts[arg_ind]
                    rs = rs[arg_ind]

                mask[start:end] = 1
                new_block_info.append(
                    (rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))

        # keep any text block that no semantic region claimed
        for i in range(texts_cnt):
            if mask[i] == 0:
                b = blocks[i]
                r = np.asarray([b[0], b[1], b[2], b[3]])
                ord_ind = b[-2]
                new_block_info.append(
                    (b[0], b[1], b[2], b[3], [texts[i]], [r], ord_ind))

        if self.with_columns:
            new_blocks = sorted(new_block_info, key=lambda x: x[-1])
        else:
            new_blocks = order_by_tbyx(new_block_info)

        text_polys = []
        texts = []
        for b in new_blocks:
            texts.append(b[4])
            text_polys.append(Rect(b[0], b[1], b[2], b[3]))

        # calculate the IoU between semantic regions and merged blocks
        sem_cnt = len(semantic_polys)
        texts_cnt = len(text_polys)
        overlap_matrix = np.zeros((sem_cnt, texts_cnt))
        for i in range(sem_cnt):
            for j in range(texts_cnt):
                inter = semantic_polys[i].intersection(text_polys[j]).area
                union = semantic_polys[i].union(text_polys[j]).area
                overlap_matrix[i, j] = (inter * 1.0) / union

        # allocate a label to each block by majority vote, preferring
        # plain text when it is among the candidates
        OVERLAP_THRESHOLD = 0.2
        texts_labels = []
        DEF_SEM_LABEL = 999
        for j in range(texts_cnt):
            ind = np.argwhere(overlap_matrix[:, j] > OVERLAP_THRESHOLD)[:, 0]
            if len(ind) == 0:
                sem_label = DEF_SEM_LABEL
            else:
                c = Counter([semantic_labels[i] for i in ind])
                items = c.most_common()
                sem_label = items[0][0]
                if len(items) > 1 and TEXT_ID in dict(items):
                    sem_label = TEXT_ID

            texts_labels.append(sem_label)

        # drop headers, footers, page numbers and other unused elements
        filtered_blocks = []
        for label, b in zip(texts_labels, new_blocks):
            if label in effective_class_inds:
                text = join_lines(b[4], label == TABLE_ID)
                filtered_blocks.append(
                    (b[0], b[1], b[2], b[3], text, b[5], label))

        return filtered_blocks

    def _divide_blocks_into_groups(self, blocks):
        # supports only a pure two-column layout in which both columns
        # have the same width
        rects = np.asarray([[b[0], b[1], b[2], b[3]] for b in blocks])
        min_x0 = np.min(rects[:, 0])
        max_x1 = np.max(rects[:, 2])
        root_seg = (min_x0, max_x1)
        root_pc = (min_x0 + max_x1) / 2
        root_offset = 20
        center_seg = (root_pc - root_offset, root_pc + root_offset)

        segment = Segment(root_seg)
        for r in rects:
            segment.add((r[0], r[2]))

        COLUMN_THRESHOLD = 0.90
        CENTER_GAP_THRESHOLD = 0.90
        free_segs = segment.get_free_segment()
        columns = []
        if len(free_segs) == 1 and len(segment.segs) == 2:
            free_seg = free_segs[0]
            seg0 = segment.segs[0]
            seg1 = segment.segs[1]
            cover = seg0[1] - seg0[0] + seg1[1] - seg1[0]
            c0 = cover / (root_seg[1] - root_seg[0])
            c1 = Segment.contain(center_seg, free_seg)
            if c0 > COLUMN_THRESHOLD and c1:
                # two columns
                columns.extend([seg0, seg1])

        groups = [blocks]
        if columns:
            groups = [[] for _ in columns]
            for b, r in zip(blocks, rects):
                column_ind = 0
                cand_seg = (r[0], r[2])
                for i, seg in enumerate(columns):
                    if Segment.contain(seg, cand_seg):
                        column_ind = i
                        break
                groups[column_ind].append(b)

        return groups

    def _allocate_continuous(self, groups):
        g_bound = []
        groups = [g for g in groups if g]
        for blocks in groups:
            arr = [[b[0], b[1], b[2], b[3]] for b in blocks]
            bboxes = np.asarray(arr)
            g_bound.append(np.asarray(merge_rects(bboxes)))

        LINE_FULL_THRESHOLD = 0.80
        START_THRESHOLD = 0.8
        SIMI_HEIGHT_THRESHOLD = 0.3
        SIMI_WIDTH_THRESHOLD = 0.3

        TEXT_ID = 4
        TABLE_ID = 5

        def _get_elem(blocks, is_first=True):
            if not blocks:
                return (None, None, None, None, None)
            if is_first:
                b1 = blocks[0]
                b1_label = b1[-1]
                r1 = b1[5][0]
                r1_w = r1[2] - r1[0]
                r1_h = r1[3] - r1[1]
                return (b1, b1_label, r1, r1_w, r1_h)
            else:
                b0 = blocks[-1]
                b0_label = b0[-1]
                r0 = b0[5][-1]
                r0_w = r0[2] - r0[0]
                r0_h = r0[3] - r0[1]
                return (b0, b0_label, r0, r0_w, r0_h)

        b0, b0_label, r0, r0_w, r0_h = _get_elem(groups[0], False)
        g0 = g_bound[0]

        for i in range(1, len(groups)):
            b1, b1_label, r1, r1_w, r1_h = _get_elem(groups[i], True)
            g1 = g_bound[i]

            if b0_label and b0_label == b1_label and b0_label == TEXT_ID:
                # join a paragraph that continues across the group boundary:
                # the previous line is nearly full width, the next line does
                # not start with an indent, and the line heights are similar
                c0 = r0_w / (g0[2] - g0[0])
                c1 = (r1[0] - g1[0]) / r1_h
                c2 = np.abs(r0_h - r1_h) / r1_h
                if (c0 > LINE_FULL_THRESHOLD and c1 < START_THRESHOLD
                        and c2 < SIMI_HEIGHT_THRESHOLD):
                    new_text = join_lines([b0[4], b1[4]])
                    new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
                                 b0[6])
                    groups[i - 1][-1] = new_block
                    groups[i].pop(0)

            elif (self.is_join_table and b0_label and b0_label == b1_label
                  and b0_label == TABLE_ID):
                # join a table that continues across the group boundary
                # when the widths are similar
                c0 = (r1_w - r0_w) / r1_h
                if c0 < SIMI_WIDTH_THRESHOLD:
                    new_text = join_lines([b0[4], b1[4]], True)
                    new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
                                 b0[6])
                    groups[i - 1][-1] = new_block
                    groups[i].pop(0)

            b0, b0_label, r0, r0_w, r0_h = _get_elem(groups[i], False)

        return groups

    def save_to_html(self, groups, output_file):
        styles = [
            'style="background-color: #EBEBEB;"',
            'style="background-color: #ABBAEA;"'
        ]
        idx = 0
        table_style = 'style="border:1px solid black;"'

        with open(output_file, 'w') as fout:
            for blocks in groups:
                for b in blocks:
                    if b[-1] == 3:
                        # title
                        text = f'<h1>{b[4]}</h1>'
                    elif b[-1] == 4:
                        # paragraph, with alternating background colors
                        text = f'<p {styles[idx % 2]}>{b[4]}</p>'
                        idx += 1
                    elif b[-1] == 5:
                        # table: one row per extracted line
                        rows = b[4].split('\n')
                        content = []
                        for r in rows:
                            content.append(
                                f'<tr><td {table_style}>{r}</td></tr>')
                        elem_text = '\n'.join(content)
                        text = f'<table {table_style}>{elem_text}</table>'
                    else:
                        text = f'<p {styles[idx % 2]}>{b[4]}</p>'
                        idx += 1

                    fout.write(text + '\n')

    def _save_to_document(self, groups):
        TITLE_ID = 3
        TEXT_ID = 4
        TABLE_ID = 5
        content_page = []
        is_first_elem = True
        for blocks in groups:
            for b in blocks:
                if is_first_elem:
                    content_page.append(b[4])
                    is_first_elem = False
                else:
                    label, text = b[-1], b[4]
                    if label == TITLE_ID:
                        content_page.append('\n\n' + text)
                    else:
                        content_page.append(self.text_elem_sep + text)

        return ''.join(content_page)

    def load(self) -> List[Document]:
        """Load the PDF and return it as a single semantically merged Document."""
        blob = Blob.from_path(self.file_path)
        start = self.start
        groups = []
        with blob.as_bytes_io() as file_path:
            fitz_doc = fitz.open(file_path)
            pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
            max_page = fitz_doc.page_count - start
            n = self.n if self.n else max_page
            n = min(n, max_page)

            tic = time.time()
            if self.verbose:
                print(f'{n} pages need to be processed...')

            for idx in range(start, start + n):
                blobs, pages = self._get_image_blobs(fitz_doc, pdf_doc, 1, idx)
                layout = self.layout_parser.parse(blobs[0])[0]
                blocks = self._allocate_semantic(pages[0], layout)
                if not blocks:
                    continue

                if self.with_columns:
                    sub_groups = self._divide_blocks_into_groups(blocks)
                    groups.extend(sub_groups)
                else:
                    groups.append(blocks)

                if self.verbose:
                    count = idx - start + 1
                    if count % 50 == 0:
                        elapse = round(time.time() - tic, 2)
                        tic = time.time()
                        print(f'processed {count} pages in {elapse}s...')

        groups = self._allocate_continuous(groups)

        if self.html_output_file:
            self.save_to_html(groups, self.html_output_file)
            return []

        page_content = self._save_to_document(groups)
        meta = {'source': os.path.basename(self.file_path)}
        doc = Document(page_content=page_content, metadata=meta)
        return [doc]
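Putting the loader together, a hypothetical usage sketch (the file path and endpoint below are placeholders; the layout service from parsers/image.py must be reachable):

from bisheng_langchain.document_loaders.elem_pdf import PDFWithSemanticLoader

loader = PDFWithSemanticLoader(
    file_path='./example.pdf',                        # placeholder path
    layout_api_url='http://127.0.0.1:14569/predict',  # placeholder endpoint
    verbose=True)
docs = loader.load()
print(docs[0].metadata)  # {'source': 'example.pdf'}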
bisheng_langchain/document_loaders/parsers/image.py
@@ -0,0 +1,28 @@
import base64
import json
from typing import List, Optional

import requests
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document


class LayoutParser(object):
    """Parse the layout structure of a document image."""

    def __init__(self,
                 api_key: Optional[str] = None,
                 api_base_url: Optional[str] = None):
        self.api_key = api_key
        # honor the configured endpoint, falling back to the built-in default
        self.api_base_url = api_base_url or 'http://192.168.106.20:14569/predict'
        # layout classes: seal, image, title, paragraph, table,
        # page header, page number, page footer
        self.class_name = ['印章', '图片', '标题', '段落', '表格', '页眉', '页码', '页脚']

    def parse(self, blob: Blob) -> List[Document]:
        b64_data = base64.b64encode(blob.as_bytes()).decode()
        data = {'img': b64_data}
        resp = requests.post(self.api_base_url, data=data)
        content = resp.json()
        doc = Document(page_content=json.dumps(content), metadata={})
        return [doc]
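A hypothetical usage sketch of the parser on its own (placeholder endpoint and image path; the service response is assumed to be a list of elements with 'bbox' and 'category_id' fields, which is what elem_pdf.py reads from it):

import json

from bisheng_langchain.document_loaders.parsers import LayoutParser
from langchain.document_loaders.blob_loaders import Blob

parser = LayoutParser(api_base_url='http://127.0.0.1:14569/predict')
docs = parser.parse(Blob.from_path('./page.png'))
elements = json.loads(docs[0].page_content)
for elem in elements:
    print(elem['category_id'], elem['bbox'])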