bisheng_langchain-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. bisheng_langchain/__init__.py +0 -0
  2. bisheng_langchain/chains/__init__.py +5 -0
  3. bisheng_langchain/chains/combine_documents/__init__.py +0 -0
  4. bisheng_langchain/chains/combine_documents/stuff.py +56 -0
  5. bisheng_langchain/chains/question_answering/__init__.py +240 -0
  6. bisheng_langchain/chains/retrieval_qa/__init__.py +0 -0
  7. bisheng_langchain/chains/retrieval_qa/base.py +89 -0
  8. bisheng_langchain/chat_models/__init__.py +11 -0
  9. bisheng_langchain/chat_models/host_llm.py +409 -0
  10. bisheng_langchain/chat_models/interface/__init__.py +10 -0
  11. bisheng_langchain/chat_models/interface/minimax.py +123 -0
  12. bisheng_langchain/chat_models/interface/openai.py +68 -0
  13. bisheng_langchain/chat_models/interface/types.py +61 -0
  14. bisheng_langchain/chat_models/interface/utils.py +5 -0
  15. bisheng_langchain/chat_models/interface/wenxin.py +114 -0
  16. bisheng_langchain/chat_models/interface/xunfei.py +233 -0
  17. bisheng_langchain/chat_models/interface/zhipuai.py +81 -0
  18. bisheng_langchain/chat_models/minimax.py +354 -0
  19. bisheng_langchain/chat_models/proxy_llm.py +354 -0
  20. bisheng_langchain/chat_models/wenxin.py +349 -0
  21. bisheng_langchain/chat_models/xunfeiai.py +355 -0
  22. bisheng_langchain/chat_models/zhipuai.py +379 -0
  23. bisheng_langchain/document_loaders/__init__.py +3 -0
  24. bisheng_langchain/document_loaders/elem_html.py +0 -0
  25. bisheng_langchain/document_loaders/elem_image.py +0 -0
  26. bisheng_langchain/document_loaders/elem_pdf.py +655 -0
  27. bisheng_langchain/document_loaders/parsers/__init__.py +5 -0
  28. bisheng_langchain/document_loaders/parsers/image.py +28 -0
  29. bisheng_langchain/document_loaders/parsers/test_image.py +286 -0
  30. bisheng_langchain/embeddings/__init__.py +7 -0
  31. bisheng_langchain/embeddings/host_embedding.py +133 -0
  32. bisheng_langchain/embeddings/interface/__init__.py +3 -0
  33. bisheng_langchain/embeddings/interface/types.py +23 -0
  34. bisheng_langchain/embeddings/interface/wenxin.py +86 -0
  35. bisheng_langchain/embeddings/wenxin.py +139 -0
  36. bisheng_langchain/vectorstores/__init__.py +3 -0
  37. bisheng_langchain/vectorstores/elastic_keywords_search.py +284 -0
  38. bisheng_langchain-0.0.1.dist-info/METADATA +64 -0
  39. bisheng_langchain-0.0.1.dist-info/RECORD +41 -0
  40. bisheng_langchain-0.0.1.dist-info/WHEEL +5 -0
  41. bisheng_langchain-0.0.1.dist-info/top_level.txt +1 -0
bisheng_langchain/document_loaders/elem_pdf.py
@@ -0,0 +1,655 @@
+ # flake8: noqa
+ """Loads PDF with semantic splitter."""
+ import io
+ import json
+ import logging
+ import os
+ import re
+ import tempfile
+ import time
+ from abc import ABC
+ from collections import Counter
+ from copy import deepcopy
+ from pathlib import Path
+ from typing import Any, Iterator, List, Mapping, Optional, Union
+ from urllib.parse import urlparse
+
+ import fitz
+ import numpy as np
+ import pypdfium2
+ import requests
+ from bisheng_langchain.document_loaders.parsers import LayoutParser
+ from langchain.docstore.document import Document
+ from langchain.document_loaders.blob_loaders import Blob
+ from langchain.document_loaders.pdf import BasePDFLoader
+ from shapely import Polygon
+ from shapely import box as Rect
+
+ RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r'\s+', flags=re.DOTALL)
+
+
+ def merge_rects(bboxes):
+     x0 = np.min(bboxes[:, 0])
+     y0 = np.min(bboxes[:, 1])
+     x1 = np.max(bboxes[:, 2])
+     y1 = np.max(bboxes[:, 3])
+     return [x0, y0, x1, y1]
+
+
+ def norm_rect(bbox):
+     x0 = np.min([bbox[0], bbox[2]])
+     x1 = np.max([bbox[0], bbox[2]])
+     y0 = np.min([bbox[1], bbox[3]])
+     y1 = np.max([bbox[1], bbox[3]])
+     return np.asarray([x0, y0, x1, y1])
+
+
+ def find_max_continuous_seq(arr):
+     n = len(arr)
+     max_info = (0, 1)
+     for i in range(n):
+         m = 1
+         for j in range(i + 1, n):
+             if arr[j] - arr[j - 1] == 1:
+                 m += 1
+             else:
+                 break
+
+         if m > max_info[1]:
+             max_info = (i, m)
+
+     # map the run's start index back to the corresponding value in arr
+     max_info = (arr[max_info[0]], max_info[1])
+     return max_info
+
+
+ def order_by_tbyx(block_info, th=10):
+     """
+     block_info: [(b0, b1, b2, b3, text, x, y)+]
+     th: y-coordinate threshold; blocks whose y1 values differ by less
+         than th are treated as being on the same text line
+     """
+     # sort using y1 first and then x1
+     res = sorted(block_info, key=lambda b: (b[1], b[0]))
+     for i in range(len(res) - 1):
+         for j in range(i, 0, -1):
+             # restore the reading order by x1 when two blocks share a line
+             if (abs(res[j + 1][1] - res[j][1]) < th
+                     and (res[j + 1][0] < res[j][0])):
+                 tmp = deepcopy(res[j])
+                 res[j] = deepcopy(res[j + 1])
+                 res[j + 1] = deepcopy(tmp)
+             else:
+                 break
+     return res
+
+
+ def join_lines(texts, is_table=False):
+     # Join wrapped lines: keep tables row-per-line; de-hyphenate and
+     # re-space alphanumeric text; otherwise concatenate without separators.
+     if is_table:
+         return '\n'.join(texts)
+
+     flags = []
+     PUNC_SET = set(['.', ',', ';', '?', '!'])
+     for text in texts:
+         flags.append(np.all([t.isalnum() for t in text.rsplit(' ', 5)]))
+
+     if np.all(flags):
+         t0 = texts[0]
+         for t in texts[1:]:
+             if t0[-1] == '-':
+                 t0 = t0[:-1] + t
+             elif t0[-1].isalnum() and t[0].isalnum():
+                 t0 += ' ' + t
+             elif t0[-1] in PUNC_SET or t[0] in PUNC_SET:
+                 t0 += ' ' + t
+             else:
+                 t0 += t
+         return t0
+     else:
+         return ''.join(texts)
+
+
+ class Segment:
+
+     def __init__(self, seg):
+         self.whole = seg
+         self.segs = []
+
+     @staticmethod
+     def is_align(seg0, seg1, delta=5, mode=0):
+         # mode=0: edge align
+         # mode=1: edge align or center align
+         res = Segment.contain(seg0, seg1)
+         if not res:
+             return False
+         else:
+             if mode == 1:
+                 r1 = seg1[0] - seg0[0] <= delta or seg0[1] - seg1[1] <= delta
+                 c0 = (seg0[0] + seg0[1]) / 2
+                 c1 = (seg1[0] + seg1[1]) / 2
+                 r2 = abs(c1 - c0) <= delta
+                 return r1 or r2
+             else:
+                 return seg1[0] - seg0[0] <= delta or seg0[1] - seg1[1] <= delta
+
+     @staticmethod
+     def contain(seg0, seg1):
+         # seg0 fully contains seg1
+         return seg0[0] <= seg1[0] and seg0[1] >= seg1[1]
+
+     @staticmethod
+     def overlap(seg0, seg1):
+         max_x0 = max(seg0[0], seg1[0])
+         min_x1 = min(seg0[1], seg1[1])
+         return max_x0 < min_x1
+
+     def _merge(self, segs):
+         x0s = [s[0] for s in segs]
+         x1s = [s[1] for s in segs]
+         return (np.min(x0s), np.max(x1s))
+
+     def add(self, seg):
+         if not self.segs:
+             self.segs.append(seg)
+         else:
+             overlaps = []
+             non_overlaps = []
+             for seg0 in self.segs:
+                 if Segment.overlap(seg0, seg):
+                     overlaps.append(seg0)
+                 else:
+                     non_overlaps.append(seg0)
+
+             if not overlaps:
+                 self.segs.append(seg)
+             else:
+                 overlaps.append(seg)
+                 new_seg = self._merge(overlaps)
+                 non_overlaps.append(new_seg)
+                 self.segs = non_overlaps
+
+     def get_free_segment(self, incr_margin=True, margin_threshold=10):
+         sorted_segs = sorted(self.segs, key=lambda x: x[0])
+         n = len(sorted_segs)
+         free_segs = []
+         if incr_margin:
+             if n > 0:
+                 seg_1st = sorted_segs[0]
+                 if (seg_1st[0] - self.whole[0]) > margin_threshold:
+                     free_segs.append((self.whole[0], seg_1st[0]))
+
+                 seg_last = sorted_segs[-1]
+                 if (self.whole[1] - seg_last[1]) > margin_threshold:
+                     free_segs.append((seg_last[1], self.whole[1]))
+
+         for i in range(n - 1):
+             x0 = sorted_segs[i][1]
+             x1 = sorted_segs[i + 1][0]
+             free_segs.append((x0, x1))
+
+         return free_segs
+
+
190
+ class PDFWithSemanticLoader(BasePDFLoader):
191
+ """Loads a PDF with pypdf and chunks at character level.
192
+
193
+ Loader also stores page numbers in metadata.
194
+ """
195
+
196
+ def __init__(self,
197
+ file_path: str,
198
+ password: Optional[Union[str, bytes]] = None,
199
+ layout_api_key: str = None,
200
+ layout_api_url: str = None,
201
+ is_join_table: bool = True,
202
+ with_columns: bool = False,
203
+ support_rotate: bool = False,
204
+ text_elem_sep: str = '\n',
205
+ start: int = 0,
206
+ n: int = None,
207
+ html_output_file: str = None,
208
+ verbose: bool = False) -> None:
209
+ """Initialize with a file path."""
210
+ self.layout_parser = LayoutParser(api_key=layout_api_key,
211
+ api_base_url=layout_api_url)
212
+ self.with_columns = with_columns
213
+ self.is_join_table = is_join_table
214
+ self.support_rotate = support_rotate
215
+ self.start = start
216
+ self.n = n
217
+ self.html_output_file = html_output_file
218
+ self.verbose = verbose
219
+ self.text_elem_sep = text_elem_sep
220
+ super().__init__(file_path)
221
+
222
+ def _get_image_blobs(self, fitz_doc, pdf_reader, n=None, start=0):
223
+ blobs = []
224
+ pages = []
225
+ if not n:
226
+ n = fitz_doc.page_count
227
+ for pg in range(start, start + n):
228
+ bytes_img = None
229
+ page = fitz_doc.load_page(pg)
230
+ pages.append(page)
231
+ mat = fitz.Matrix(1, 1)
232
+ try:
233
+ pm = page.get_pixmap(matrix=mat, alpha=False)
234
+ bytes_img = pm.getPNGData()
235
+ except Exception:
236
+ # some pdf input cannot get render image from fitz
237
+ page = pdf_reader.get_page(pg)
238
+ pil_image = page.render().to_pil()
239
+ img_byte_arr = io.BytesIO()
240
+ pil_image.save(img_byte_arr, format='PNG')
241
+ bytes_img = img_byte_arr.getvalue()
242
+
243
+ blobs.append(Blob(data=bytes_img))
244
+ return blobs, pages
245
+
+     def _allocate_semantic(self, page, layout):
+         # layout classes (by index): seal, image, title, paragraph, table,
+         # page header, page number, page footer
+         class_name = ['印章', '图片', '标题', '段落', '表格', '页眉', '页码', '页脚']
+         effective_class_inds = [3, 4, 5, 999]
+         non_conti_class_ids = [6, 7, 8]
+         TEXT_ID = 4
+         TABLE_ID = 5
+
+         textpage = page.get_textpage()
+         blocks = textpage.extractBLOCKS()
+
+         if self.support_rotate:
+             rotation_matrix = np.asarray(page.rotation_matrix).reshape((3, 2))
+             c1 = (rotation_matrix[0, 0] - 1) <= 1e-6
+             c2 = (rotation_matrix[1, 1] - 1) <= 1e-6
+             is_rotated = c1 and c2
+             # print('c1/c2', c1, c2)
+             if is_rotated:
+                 new_blocks = []
+                 for b in blocks:
+                     bbox = np.asarray([b[0], b[1], b[2], b[3]])
+                     aug_bbox = bbox.reshape((-1, 2))
+                     padding = np.ones((len(aug_bbox), 1))
+                     aug_bbox = np.hstack([aug_bbox, padding])
+                     bb = np.dot(aug_bbox, rotation_matrix).reshape(-1)
+                     bb = norm_rect(bb)
+                     info = (bb[0], bb[1], bb[2], bb[3], b[4], b[5], b[6])
+                     new_blocks.append(info)
+
+                 blocks = new_blocks
+
+         if not self.with_columns:
+             blocks = order_by_tbyx(blocks)
+
+         # print('---ori blocks---')
+         # for b in blocks:
+         #     print(b)
+
+         IMG_BLOCK_TYPE = 1
+         text_polys = []
+         text_rects = []
+         texts = []
+         for b in blocks:
+             if b[-1] != IMG_BLOCK_TYPE:
+                 text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ', b[4]
+                               or '').strip()
+                 if text:
+                     texts.append(text)
+                     text_polys.append(Rect(b[0], b[1], b[2], b[3]))
+                     text_rects.append([b[0], b[1], b[2], b[3]])
+         text_rects = np.asarray(text_rects)
+         texts = np.asarray(texts)
+
+         semantic_polys = []
+         semantic_labels = []
+
+         layout_info = json.loads(layout.page_content)
+         for info in layout_info:
+             bbs = info['bbox']
+             coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]),
+                       (bbs[6], bbs[7]))
+             semantic_polys.append(Polygon(coords))
+             semantic_labels.append(info['category_id'])
+
+         # calculate the containment overlap
+         sem_cnt = len(semantic_polys)
+         texts_cnt = len(text_polys)
+         contain_matrix = np.zeros((sem_cnt, texts_cnt))
+         for i in range(sem_cnt):
+             for j in range(texts_cnt):
+                 inter = semantic_polys[i].intersection(text_polys[j]).area
+                 contain_matrix[i, j] = inter * 1.0 / text_polys[j].area
+
+         # print('----------------containing matrix--------')
+         # for r in contain_matrix.tolist():
+         #     print([round(r_, 2) for r_ in r])
+
+         # print('---text---')
+         # for t in texts:
+         #     print(t)
+
+         # merge continuous text blocks by the containment matrix
+         CONTAIN_THRESHOLD = 0.70
+         contain_info = []
+         for i in range(sem_cnt):
+             ind = np.argwhere(contain_matrix[i, :] > CONTAIN_THRESHOLD)[:, 0]
+             if len(ind) == 0:
+                 continue
+             label = semantic_labels[i]
+             if label in non_conti_class_ids:
+                 n = len(ind)
+                 contain_info.append((None, None, n, label, ind))
+             else:
+                 start, n = find_max_continuous_seq(ind)
+                 if n >= 1:
+                     contain_info.append((start, start + n, n, label, None))
+
+         contain_info = sorted(contain_info, key=lambda x: x[2], reverse=True)
+         mask = np.zeros(texts_cnt)
+         new_block_info = []
+         for info in contain_info:
+             start, end, n, label, ind = info
+             if label in non_conti_class_ids and np.all(mask[ind] == 0):
+                 rect = merge_rects(text_rects[ind])
+                 ori_orders = [blocks[i][-2] for i in ind]
+                 ts = texts[ind]
+                 rs = text_rects[ind]
+                 ord_ind = np.min(ori_orders)
+                 mask[ind] = 1
+                 new_block_info.append(
+                     (rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
+
+             elif np.all(mask[start:end] == 0):
+                 rect = merge_rects(text_rects[start:end])
+                 ori_orders = [blocks[i][-2] for i in range(start, end)]
+                 arg_ind = np.argsort(ori_orders)
+                 # print('ori_orders', ori_orders, arg_ind)
+                 ord_ind = np.min(ori_orders)
+
+                 ts = texts[start:end]
+                 rs = text_rects[start:end]
+                 if label == TABLE_ID:
+                     ts = ts[arg_ind]
+                     rs = rs[arg_ind]
+
+                 mask[start:end] = 1
+                 new_block_info.append(
+                     (rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
+
+         for i in range(texts_cnt):
+             if mask[i] == 0:
+                 b = blocks[i]
+                 r = np.asarray([b[0], b[1], b[2], b[3]])
+                 ord_ind = b[-2]
+                 new_block_info.append(
+                     (b[0], b[1], b[2], b[3], [texts[i]], [r], ord_ind))
+
+         if self.with_columns:
+             new_blocks = sorted(new_block_info, key=lambda x: x[-1])
+         else:
+             new_blocks = order_by_tbyx(new_block_info)
+
+         # print('\n\n---new blocks---')
+         # for idx, b in enumerate(new_blocks):
+         #     print(idx, b)
+
+         text_polys = []
+         texts = []
+         for b in new_blocks:
+             texts.append(b[4])
+             text_polys.append(Rect(b[0], b[1], b[2], b[3]))
+
+         # calculate the IoU overlap
+         sem_cnt = len(semantic_polys)
+         texts_cnt = len(text_polys)
+         overlap_matrix = np.zeros((sem_cnt, texts_cnt))
+         for i in range(sem_cnt):
+             for j in range(texts_cnt):
+                 inter = semantic_polys[i].intersection(text_polys[j]).area
+                 union = semantic_polys[i].union(text_polys[j]).area
+                 overlap_matrix[i, j] = (inter * 1.0) / union
+
+         # print('---overlap_matrix---')
+         # for r in overlap_matrix:
+         #     print([round(r_, 3) for r_ in r])
+
+         # allocate label
+         OVERLAP_THRESHOLD = 0.2
+         texts_labels = []
+         DEF_SEM_LABEL = 999
+         for j in range(texts_cnt):
+             ind = np.argwhere(overlap_matrix[:, j] > OVERLAP_THRESHOLD)[:, 0]
+             if len(ind) == 0:
+                 sem_label = DEF_SEM_LABEL
+             else:
+                 c = Counter([semantic_labels[i] for i in ind])
+                 items = c.most_common()
+                 sem_label = items[0][0]
+                 if len(items) > 1 and TEXT_ID in dict(items):
+                     sem_label = TEXT_ID
+
+             texts_labels.append(sem_label)
+
+         # print(texts_labels)
+         # filter the unused elements
+         filtered_blocks = []
+         for label, b in zip(texts_labels, new_blocks):
+             if label in effective_class_inds:
+                 text = join_lines(b[4], label == TABLE_ID)
+                 filtered_blocks.append(
+                     (b[0], b[1], b[2], b[3], text, b[5], label))
+
+         # print('---filtered_blocks---')
+         # for b in filtered_blocks:
+         #     print(b)
+
+         return filtered_blocks
+
+     def _divide_blocks_into_groups(self, blocks):
+         # supports only a pure two-column layout where both columns have the same width
+         rects = np.asarray([[b[0], b[1], b[2], b[3]] for b in blocks])
+         min_x0 = np.min(rects[:, 0])
+         max_x1 = np.max(rects[:, 2])
+         root_seg = (min_x0, max_x1)
+         root_pc = (min_x0 + max_x1) / 2
+         root_offset = 20
+         center_seg = (root_pc - root_offset, root_pc + root_offset)
+
+         segment = Segment(root_seg)
+         for r in rects:
+             segment.add((r[0], r[2]))
+
+         COLUMN_THRESHOLD = 0.90
+         CENTER_GAP_THRESHOLD = 0.90
+         free_segs = segment.get_free_segment()
+         columns = []
+         if len(free_segs) == 1 and len(segment.segs) == 2:
+             free_seg = free_segs[0]
+             seg0 = segment.segs[0]
+             seg1 = segment.segs[1]
+             cover = seg0[1] - seg0[0] + seg1[1] - seg1[0]
+             c0 = cover / (root_seg[1] - root_seg[0])
+             c1 = Segment.contain(center_seg, free_seg)
+             if c0 > COLUMN_THRESHOLD and c1:
+                 # two columns
+                 columns.extend([seg0, seg1])
+
+         groups = [blocks]
+         if columns:
+             groups = [[] for _ in columns]
+             for b, r in zip(blocks, rects):
+                 column_ind = 0
+                 cand_seg = (r[0], r[2])
+                 for i, seg in enumerate(columns):
+                     if Segment.contain(seg, cand_seg):
+                         column_ind = i
+                         break
+                 groups[column_ind].append(b)
+
+         return groups
+
+     def _allocate_continuous(self, groups):
+         g_bound = []
+         groups = [g for g in groups if g]
+         for blocks in groups:
+             arr = [[b[0], b[1], b[2], b[3]] for b in blocks]
+             bboxes = np.asarray(arr)
+             g_bound.append(np.asarray(merge_rects(bboxes)))
+
+         LINE_FULL_THRESHOLD = 0.80
+         START_THRESHOLD = 0.8
+         SIMI_HEIGHT_THRESHOLD = 0.3
+         SIMI_WIDTH_THRESHOLD = 0.3
+
+         TEXT_ID = 4
+         TABLE_ID = 5
+
+         def _get_elem(blocks, is_first=True):
+             if not blocks:
+                 return (None, None, None, None, None)
+             if is_first:
+                 b1 = blocks[0]
+                 b1_label = b1[-1]
+                 r1 = b1[5][0]
+                 r1_w = r1[2] - r1[0]
+                 r1_h = r1[3] - r1[1]
+                 return (b1, b1_label, r1, r1_w, r1_h)
+             else:
+                 b0 = blocks[-1]
+                 b0_label = b0[-1]
+                 r0 = b0[5][-1]
+                 r0_w = r0[2] - r0[0]
+                 r0_h = r0[3] - r0[1]
+                 return (b0, b0_label, r0, r0_w, r0_h)
+
+         b0, b0_label, r0, r0_w, r0_h = _get_elem(groups[0], False)
+         g0 = g_bound[0]
+
+         for i in range(1, len(groups)):
+             b1, b1_label, r1, r1_w, r1_h = _get_elem(groups[i], True)
+             g1 = g_bound[i]
+
+             # print('\n_allocate_continuous:')
+             # print(b0, b0_label, b1, b1_label)
+
+             if b0_label and b0_label == b1_label and b0_label == TEXT_ID:
+                 c0 = r0_w / (g0[2] - g0[0])
+                 c1 = (r1[0] - g1[0]) / r1_h
+                 c2 = np.abs(r0_h - r1_h) / r1_h
+
+                 # print('\n\n---conti texts---')
+                 # print(b0_label, c0, c1, c2,
+                 #       b0, b0_label, r0, r0_w, r0_h,
+                 #       b1, b1_label, r1, r1_w, r1_h)
+
+                 if (c0 > LINE_FULL_THRESHOLD and c1 < START_THRESHOLD
+                         and c2 < SIMI_HEIGHT_THRESHOLD):
+                     new_text = join_lines([b0[4], b1[4]])
+                     new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
+                                  b0[6])
+                     groups[i - 1][-1] = new_block
+                     groups[i].pop(0)
+
+             elif (self.is_join_table and b0_label and b0_label == b1_label
+                   and b0_label == TABLE_ID):
+                 c0 = (r1_w - r0_w) / r1_h
+                 if c0 < SIMI_WIDTH_THRESHOLD:
+                     new_text = join_lines([b0[4], b1[4]], True)
+                     new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
+                                  b0[6])
+                     groups[i - 1][-1] = new_block
+                     groups[i].pop(0)
+
+             b0, b0_label, r0, r0_w, r0_h = _get_elem(groups[i], False)
+
+         return groups
+
+     def save_to_html(self, groups, output_file):
+         styles = [
+             'style="background-color: #EBEBEB;"',
+             'style="background-color: #ABBAEA;"'
+         ]
+         idx = 0
+         table_style = 'style="border:1px solid black;"'
+
+         with open(output_file, 'w') as fout:
+             for blocks in groups:
+                 for b in blocks:
+                     if b[-1] == 3:
+                         text = f'<h1>{b[4]}</h1>'
+                     elif b[-1] == 4:
+                         text = f'<p {styles[idx % 2]}>{b[4]}</p>'
+                         idx += 1
+                     elif b[-1] == 5:
+                         rows = b[4].split('\n')
+                         content = []
+                         for r in rows:
+                             content.append(
+                                 f'<tr><td {table_style}>{r}</td></tr>')
+                         elem_text = '\n'.join(content)
+                         text = f'<table {table_style}>{elem_text}</table>'
+                     else:
+                         text = f'<p {styles[idx % 2]}>{b[4]}</p>'
+                         idx += 1
+
+                     fout.write(text + '\n')
+
+     def _save_to_document(self, groups):
+         TITLE_ID = 3
+         TEXT_ID = 4
+         TABLE_ID = 5
+         content_page = []
+         is_first_elem = True
+         for blocks in groups:
+             for b in blocks:
+                 if is_first_elem:
+                     content_page.append(b[4])
+                     is_first_elem = False
+                 else:
+                     label, text = b[-1], b[4]
+                     if label == TITLE_ID:
+                         content_page.append('\n\n' + text)
+                     else:
+                         content_page.append(self.text_elem_sep + text)
+
+         return ''.join(content_page)
+
+     def load(self) -> List[Document]:
+         """Load the given path as pages."""
+         blob = Blob.from_path(self.file_path)
+         start = self.start
+         groups = []
+         with blob.as_bytes_io() as file_path:
+             fitz_doc = fitz.open(file_path)
+             pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
+             max_page = fitz_doc.page_count - start
+             n = self.n if self.n else max_page
+             n = min(n, max_page)
+
+             tic = time.time()
+             if self.verbose:
+                 print(f'{n} pages need to be processed...')
+
+             for idx in range(start, start + n):
+                 blobs, pages = self._get_image_blobs(fitz_doc, pdf_doc, 1, idx)
+                 layout = self.layout_parser.parse(blobs[0])[0]
+                 blocks = self._allocate_semantic(pages[0], layout)
+                 if not blocks:
+                     continue
+
+                 if self.with_columns:
+                     sub_groups = self._divide_blocks_into_groups(blocks)
+                     groups.extend(sub_groups)
+                 else:
+                     groups.append(blocks)
+
+                 if self.verbose:
+                     count = idx - start + 1
+                     if count % 50 == 0:
+                         elapse = round(time.time() - tic, 2)
+                         tic = time.time()
+                         print(f'processed {count} pages in {elapse}s...')
+
+         groups = self._allocate_continuous(groups)
+
+         if self.html_output_file:
+             self.save_to_html(groups, self.html_output_file)
+             return []
+
+         page_content = self._save_to_document(groups)
+         meta = {'source': os.path.basename(self.file_path)}
+         doc = Document(page_content=page_content, metadata=meta)
+         return [doc]
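
Usage note: the loader above is self-contained once a layout-analysis service is available. A minimal sketch of driving it (the input path and service endpoint below are placeholders, not values shipped with the package):

    from bisheng_langchain.document_loaders.elem_pdf import PDFWithSemanticLoader

    # Parse the first 10 pages and merge paragraphs/tables that continue
    # across pages or columns.
    loader = PDFWithSemanticLoader(
        file_path='./example.pdf',                        # placeholder input file
        layout_api_url='http://localhost:14569/predict',  # placeholder layout service
        is_join_table=True,
        with_columns=False,   # set True for two-column documents
        start=0,
        n=10,
        verbose=True)
    docs = loader.load()
    # One Document: page_content holds the reconstructed text,
    # metadata is {'source': 'example.pdf'}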
bisheng_langchain/document_loaders/parsers/__init__.py
@@ -0,0 +1,5 @@
+ from .image import LayoutParser
+
+ __all__ = [
+     'LayoutParser',
+ ]
bisheng_langchain/document_loaders/parsers/image.py
@@ -0,0 +1,28 @@
+ import base64
+ import json
+ # import time
+ from typing import List, Optional
+
+ import requests
+ from langchain.document_loaders.blob_loaders import Blob
+ from langchain.schema import Document
+
+
+ class LayoutParser(object):
+     """Parse image layout structure."""
+
+     def __init__(self,
+                  api_key: Optional[str] = None,
+                  api_base_url: Optional[str] = None):
+         self.api_key = api_key
+         # fall back to the default service address when no URL is given
+         self.api_base_url = api_base_url or 'http://192.168.106.20:14569/predict'
+         # classes: seal, image, title, paragraph, table, page header,
+         # page number, page footer
+         self.class_name = ['印章', '图片', '标题', '段落', '表格', '页眉', '页码', '页脚']
+
+     def parse(self, blob: Blob) -> List[Document]:
+         b64_data = base64.b64encode(blob.as_bytes()).decode()
+         data = {'img': b64_data}
+         resp = requests.post(self.api_base_url, data=data)
+         content = resp.json()
+         doc = Document(page_content=json.dumps(content), metadata={})
+         return [doc]
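
For completeness, a sketch of calling the layout parser directly (assuming the service behind api_base_url is reachable; the image path is a placeholder). As consumed by elem_pdf.py, the response is a JSON list of regions, each carrying an 8-coordinate 'bbox' polygon and a 'category_id':

    from bisheng_langchain.document_loaders.parsers import LayoutParser
    from langchain.document_loaders.blob_loaders import Blob

    parser = LayoutParser(api_base_url='http://localhost:14569/predict')
    docs = parser.parse(Blob.from_path('./page.png'))  # placeholder page image
    print(docs[0].page_content)  # JSON string with the detected layout regions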