bisheng-langchain 0.2.2.2__py3-none-any.whl → 0.2.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bisheng_langchain/chat_models/host_llm.py +2 -2
- bisheng_langchain/document_loaders/elem_pdf.py +14 -34
- {bisheng_langchain-0.2.2.2.dist-info → bisheng_langchain-0.2.2.3.dist-info}/METADATA +1 -2
- {bisheng_langchain-0.2.2.2.dist-info → bisheng_langchain-0.2.2.3.dist-info}/RECORD +6 -6
- {bisheng_langchain-0.2.2.2.dist-info → bisheng_langchain-0.2.2.3.dist-info}/WHEEL +0 -0
- {bisheng_langchain-0.2.2.2.dist-info → bisheng_langchain-0.2.2.3.dist-info}/top_level.txt +0 -0
@@ -273,7 +273,7 @@ class BaseHostChatLLM(BaseChatModel):
|
|
273
273
|
try:
|
274
274
|
async with self.client.apost(url=self.host_base_url, json=kwargs) as response:
|
275
275
|
if response.status != 200:
|
276
|
-
raise ValueError(f'Error: {response.status}')
|
276
|
+
raise ValueError(f'Error: {response.status} contet: {response.text}')
|
277
277
|
async for txt in response.content.iter_any():
|
278
278
|
if b'\n' in txt:
|
279
279
|
for txt_ in txt.split(b'\n'):
|
@@ -313,7 +313,7 @@ class BaseHostChatLLM(BaseChatModel):
|
|
313
313
|
"""Generate chat completion with retry."""
|
314
314
|
message_dicts, params = self._create_message_dicts(messages, stop)
|
315
315
|
params = {**params, **kwargs}
|
316
|
-
if self.streaming:
|
316
|
+
if self.streaming and 'infer' not in self.host_base_url:
|
317
317
|
inner_completion = ''
|
318
318
|
role = 'assistant'
|
319
319
|
params['stream'] = True
|
@@ -2,22 +2,15 @@
|
|
2
2
|
"""Loads PDF with semantic splilter."""
|
3
3
|
import io
|
4
4
|
import json
|
5
|
-
import logging
|
6
5
|
import os
|
7
6
|
import re
|
8
|
-
import tempfile
|
9
7
|
import time
|
10
|
-
from abc import ABC
|
11
8
|
from collections import Counter
|
12
9
|
from copy import deepcopy
|
13
|
-
from
|
14
|
-
from typing import Any, Iterator, List, Mapping, Optional, Union
|
15
|
-
from urllib.parse import urlparse
|
10
|
+
from typing import List, Optional, Union
|
16
11
|
|
17
12
|
import fitz
|
18
13
|
import numpy as np
|
19
|
-
import pypdfium2
|
20
|
-
import requests
|
21
14
|
from bisheng_langchain.document_loaders.parsers import LayoutParser
|
22
15
|
from langchain.docstore.document import Document
|
23
16
|
from langchain.document_loaders.blob_loaders import Blob
|
@@ -72,8 +65,7 @@ def order_by_tbyx(block_info, th=10):
|
|
72
65
|
for i in range(len(res) - 1):
|
73
66
|
for j in range(i, 0, -1):
|
74
67
|
# restore the order using the
|
75
|
-
if (abs(res[j + 1][1] - res[j][1]) < th
|
76
|
-
and (res[j + 1][0] < res[j][0])):
|
68
|
+
if (abs(res[j + 1][1] - res[j][1]) < th and (res[j + 1][0] < res[j][0])):
|
77
69
|
tmp = deepcopy(res[j])
|
78
70
|
res[j] = deepcopy(res[j + 1])
|
79
71
|
res[j + 1] = deepcopy(tmp)
|
@@ -207,8 +199,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
207
199
|
html_output_file: str = None,
|
208
200
|
verbose: bool = False) -> None:
|
209
201
|
"""Initialize with a file path."""
|
210
|
-
self.layout_parser = LayoutParser(api_key=layout_api_key,
|
211
|
-
api_base_url=layout_api_url)
|
202
|
+
self.layout_parser = LayoutParser(api_key=layout_api_key, api_base_url=layout_api_url)
|
212
203
|
self.with_columns = with_columns
|
213
204
|
self.is_join_table = is_join_table
|
214
205
|
self.support_rotate = support_rotate
|
@@ -286,8 +277,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
286
277
|
texts = []
|
287
278
|
for b in blocks:
|
288
279
|
if b[-1] != IMG_BLOCK_TYPE:
|
289
|
-
text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ', b[4]
|
290
|
-
or '').strip()
|
280
|
+
text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ', b[4] or '').strip()
|
291
281
|
if text:
|
292
282
|
texts.append(text)
|
293
283
|
text_ploys.append(Rect(b[0], b[1], b[2], b[3]))
|
@@ -301,8 +291,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
301
291
|
layout_info = json.loads(layout.page_content)
|
302
292
|
for info in layout_info:
|
303
293
|
bbs = info['bbox']
|
304
|
-
coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]),
|
305
|
-
(bbs[6], bbs[7]))
|
294
|
+
coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]), (bbs[6], bbs[7]))
|
306
295
|
semantic_polys.append(Polygon(coords))
|
307
296
|
semantic_labels.append(info['category_id'])
|
308
297
|
|
@@ -350,8 +339,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
350
339
|
rs = text_rects[ind]
|
351
340
|
ord_ind = np.min(ori_orders)
|
352
341
|
mask[ind] = 1
|
353
|
-
new_block_info.append(
|
354
|
-
(rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
|
342
|
+
new_block_info.append((rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
|
355
343
|
|
356
344
|
elif np.all(mask[start:end] == 0):
|
357
345
|
rect = merge_rects(text_rects[start:end])
|
@@ -367,16 +355,14 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
367
355
|
rs = rs[arg_ind]
|
368
356
|
|
369
357
|
mask[start:end] = 1
|
370
|
-
new_block_info.append(
|
371
|
-
(rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
|
358
|
+
new_block_info.append((rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
|
372
359
|
|
373
360
|
for i in range(texts_cnt):
|
374
361
|
if mask[i] == 0:
|
375
362
|
b = blocks[i]
|
376
363
|
r = np.asarray([b[0], b[1], b[2], b[3]])
|
377
364
|
ord_ind = b[-2]
|
378
|
-
new_block_info.append(
|
379
|
-
(b[0], b[1], b[2], b[3], [texts[i]], [r], ord_ind))
|
365
|
+
new_block_info.append((b[0], b[1], b[2], b[3], [texts[i]], [r], ord_ind))
|
380
366
|
|
381
367
|
if self.with_columns:
|
382
368
|
new_blocks = sorted(new_block_info, key=lambda x: x[-1])
|
@@ -430,8 +416,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
430
416
|
for label, b in zip(texts_labels, new_blocks):
|
431
417
|
if label in effective_class_inds:
|
432
418
|
text = join_lines(b[4], label == TABLE_ID)
|
433
|
-
filtered_blocks.append(
|
434
|
-
(b[0], b[1], b[2], b[3], text, b[5], label))
|
419
|
+
filtered_blocks.append((b[0], b[1], b[2], b[3], text, b[5], label))
|
435
420
|
|
436
421
|
# print('---filtered_blocks---')
|
437
422
|
# for b in filtered_blocks:
|
@@ -539,8 +524,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
539
524
|
if (c0 > LINE_FULL_THRESHOLD and c1 < START_THRESHOLD
|
540
525
|
and c2 < SIMI_HEIGHT_THRESHOLD):
|
541
526
|
new_text = join_lines([b0[4], b1[4]])
|
542
|
-
new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
|
543
|
-
b0[6])
|
527
|
+
new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5], b0[6])
|
544
528
|
groups[i - 1][-1] = new_block
|
545
529
|
groups[i].pop(0)
|
546
530
|
|
@@ -549,8 +533,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
549
533
|
c0 = (r1_w - r0_w) / r1_h
|
550
534
|
if c0 < SIMI_WIDTH_THRESHOLD:
|
551
535
|
new_text = join_lines([b0[4], b1[4]], True)
|
552
|
-
new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
|
553
|
-
b0[6])
|
536
|
+
new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5], b0[6])
|
554
537
|
groups[i - 1][-1] = new_block
|
555
538
|
groups[i].pop(0)
|
556
539
|
|
@@ -559,10 +542,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
559
542
|
return groups
|
560
543
|
|
561
544
|
def save_to_html(self, groups, output_file):
|
562
|
-
styles = [
|
563
|
-
'style="background-color: #EBEBEB;"',
|
564
|
-
'style="background-color: #ABBAEA;"'
|
565
|
-
]
|
545
|
+
styles = ['style="background-color: #EBEBEB;"', 'style="background-color: #ABBAEA;"']
|
566
546
|
idx = 0
|
567
547
|
table_style = 'style="border:1px solid black;"'
|
568
548
|
|
@@ -578,8 +558,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
578
558
|
rows = b[4].split('\n')
|
579
559
|
content = []
|
580
560
|
for r in rows:
|
581
|
-
content.append(
|
582
|
-
f'<tr><td {table_style}>{r}</td></tr>')
|
561
|
+
content.append(f'<tr><td {table_style}>{r}</td></tr>')
|
583
562
|
elem_text = '\n'.join(content)
|
584
563
|
text = f'<table {table_style}>{elem_text}</table>'
|
585
564
|
else:
|
@@ -610,6 +589,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
|
|
610
589
|
|
611
590
|
def load(self) -> List[Document]:
|
612
591
|
"""Load given path as pages."""
|
592
|
+
import pypdfium2
|
613
593
|
blob = Blob.from_path(self.file_path)
|
614
594
|
start = self.start
|
615
595
|
groups = []
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: bisheng-langchain
|
3
|
-
Version: 0.2.2.2
|
3
|
+
Version: 0.2.2.3
|
4
4
|
Summary: bisheng langchain modules
|
5
5
|
Home-page: https://github.com/dataelement/bisheng
|
6
6
|
Author: DataElem
|
@@ -26,7 +26,6 @@ Requires-Dist: bisheng-pyautogen
|
|
26
26
|
Requires-Dist: jieba ==0.42.1
|
27
27
|
Requires-Dist: pydantic ==1.10.13
|
28
28
|
Requires-Dist: pymupdf ==1.23.8
|
29
|
-
Requires-Dist: pypdfium2 ==4.25.0
|
30
29
|
Requires-Dist: shapely ==2.0.2
|
31
30
|
Requires-Dist: filetype ==1.2.0
|
32
31
|
|
@@ -25,7 +25,7 @@ bisheng_langchain/chains/router/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
25
25
|
bisheng_langchain/chains/router/multi_rule.py,sha256=BiFryj3-7rOxfttD-MyOkKWLCSGB9LVYd2rjOsIfQC8,375
|
26
26
|
bisheng_langchain/chains/router/rule_router.py,sha256=R2YRUnwn7s_7DbsSn27uPn4cIV0D-5iXEORXir0tNGM,1835
|
27
27
|
bisheng_langchain/chat_models/__init__.py,sha256=A3_KoMRp96UqHwwYX4Mt60peVNjCsuUHAixqaV44BP4,492
|
28
|
-
bisheng_langchain/chat_models/host_llm.py,sha256=
|
28
|
+
bisheng_langchain/chat_models/host_llm.py,sha256=phIBLY7AoGPIIxl9saFV2XAcRvgaJ8XUJA06bqiN8Uw,21638
|
29
29
|
bisheng_langchain/chat_models/minimax.py,sha256=JLs_f6vWD9beZYUtjD4FG28G8tZHrGUAWOwdLIuJomw,13901
|
30
30
|
bisheng_langchain/chat_models/proxy_llm.py,sha256=wzVBZik9WC3-f7kyQ1eu3Ooibqpcocln08knf5lV1Nw,17082
|
31
31
|
bisheng_langchain/chat_models/qwen.py,sha256=jGx_tW-LPxfegE6NvY6wID8ps2SsP813atjXnc04C-s,18841
|
@@ -44,7 +44,7 @@ bisheng_langchain/document_loaders/__init__.py,sha256=LuQ-zMYxde2FeiEcvVtjQqnHoz
|
|
44
44
|
bisheng_langchain/document_loaders/custom_kv.py,sha256=sUKeK0e8-cCmKyj1FsR7SzBNWjo5zRwHWVS5tKVxuPs,6656
|
45
45
|
bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
46
|
bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
|
-
bisheng_langchain/document_loaders/elem_pdf.py,sha256=
|
47
|
+
bisheng_langchain/document_loaders/elem_pdf.py,sha256=K-TXILGNFLFjavhun_MFbUF4t2_WGA3Z-kbnr75lmW8,22243
|
48
48
|
bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=N2jCmizi9gIsAO38zWL1zOzfNhoo_XmNIX7fteSeR0Q,4883
|
49
49
|
bisheng_langchain/document_loaders/universal_kv.py,sha256=dJF_GQGKBMUjB_kX9CSp7xZRhXgwVuGPbMIzJwPh-C0,4063
|
50
50
|
bisheng_langchain/document_loaders/parsers/__init__.py,sha256=OOM_FJkwaU-zNS58fASw0TH8FNT6VXKb0VrvisgdrII,171
|
@@ -69,7 +69,7 @@ bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHn
|
|
69
69
|
bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=gt_uw_fSMcEZWxbiA3V0RyA-utLOZlUY-qxdwnsfZks,12664
|
70
70
|
bisheng_langchain/vectorstores/milvus.py,sha256=44ZbDsIxdsbUnHOpEpCdrW5zvWnYvDdAVoDKjCFoyYI,34424
|
71
71
|
bisheng_langchain/vectorstores/retriever.py,sha256=hj4nAAl352EV_ANnU2OHJn7omCH3nBK82ydo14KqMH4,4353
|
72
|
-
bisheng_langchain-0.2.2.
|
73
|
-
bisheng_langchain-0.2.2.
|
74
|
-
bisheng_langchain-0.2.2.
|
75
|
-
bisheng_langchain-0.2.2.
|
72
|
+
bisheng_langchain-0.2.2.3.dist-info/METADATA,sha256=9UgVujQ3Ep2eBWGmbeoe8r-dPNqOjPxXHa0_WYSz7pw,2299
|
73
|
+
bisheng_langchain-0.2.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
74
|
+
bisheng_langchain-0.2.2.3.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
|
75
|
+
bisheng_langchain-0.2.2.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|