bisheng-langchain 0.2.2.2__py3-none-any.whl → 0.2.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -273,7 +273,7 @@ class BaseHostChatLLM(BaseChatModel):
273
273
  try:
274
274
  async with self.client.apost(url=self.host_base_url, json=kwargs) as response:
275
275
  if response.status != 200:
276
- raise ValueError(f'Error: {response.status}')
276
+ raise ValueError(f'Error: {response.status} contet: {response.text}')
277
277
  async for txt in response.content.iter_any():
278
278
  if b'\n' in txt:
279
279
  for txt_ in txt.split(b'\n'):
@@ -313,7 +313,7 @@ class BaseHostChatLLM(BaseChatModel):
313
313
  """Generate chat completion with retry."""
314
314
  message_dicts, params = self._create_message_dicts(messages, stop)
315
315
  params = {**params, **kwargs}
316
- if self.streaming:
316
+ if self.streaming and 'infer' not in self.host_base_url:
317
317
  inner_completion = ''
318
318
  role = 'assistant'
319
319
  params['stream'] = True
@@ -2,22 +2,15 @@
2
2
  """Loads PDF with semantic splilter."""
3
3
  import io
4
4
  import json
5
- import logging
6
5
  import os
7
6
  import re
8
- import tempfile
9
7
  import time
10
- from abc import ABC
11
8
  from collections import Counter
12
9
  from copy import deepcopy
13
- from pathlib import Path
14
- from typing import Any, Iterator, List, Mapping, Optional, Union
15
- from urllib.parse import urlparse
10
+ from typing import List, Optional, Union
16
11
 
17
12
  import fitz
18
13
  import numpy as np
19
- import pypdfium2
20
- import requests
21
14
  from bisheng_langchain.document_loaders.parsers import LayoutParser
22
15
  from langchain.docstore.document import Document
23
16
  from langchain.document_loaders.blob_loaders import Blob
@@ -72,8 +65,7 @@ def order_by_tbyx(block_info, th=10):
72
65
  for i in range(len(res) - 1):
73
66
  for j in range(i, 0, -1):
74
67
  # restore the order using the
75
- if (abs(res[j + 1][1] - res[j][1]) < th
76
- and (res[j + 1][0] < res[j][0])):
68
+ if (abs(res[j + 1][1] - res[j][1]) < th and (res[j + 1][0] < res[j][0])):
77
69
  tmp = deepcopy(res[j])
78
70
  res[j] = deepcopy(res[j + 1])
79
71
  res[j + 1] = deepcopy(tmp)
@@ -207,8 +199,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
207
199
  html_output_file: str = None,
208
200
  verbose: bool = False) -> None:
209
201
  """Initialize with a file path."""
210
- self.layout_parser = LayoutParser(api_key=layout_api_key,
211
- api_base_url=layout_api_url)
202
+ self.layout_parser = LayoutParser(api_key=layout_api_key, api_base_url=layout_api_url)
212
203
  self.with_columns = with_columns
213
204
  self.is_join_table = is_join_table
214
205
  self.support_rotate = support_rotate
@@ -286,8 +277,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
286
277
  texts = []
287
278
  for b in blocks:
288
279
  if b[-1] != IMG_BLOCK_TYPE:
289
- text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ', b[4]
290
- or '').strip()
280
+ text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, ' ', b[4] or '').strip()
291
281
  if text:
292
282
  texts.append(text)
293
283
  text_ploys.append(Rect(b[0], b[1], b[2], b[3]))
@@ -301,8 +291,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
301
291
  layout_info = json.loads(layout.page_content)
302
292
  for info in layout_info:
303
293
  bbs = info['bbox']
304
- coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]),
305
- (bbs[6], bbs[7]))
294
+ coords = ((bbs[0], bbs[1]), (bbs[2], bbs[3]), (bbs[4], bbs[5]), (bbs[6], bbs[7]))
306
295
  semantic_polys.append(Polygon(coords))
307
296
  semantic_labels.append(info['category_id'])
308
297
 
@@ -350,8 +339,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
350
339
  rs = text_rects[ind]
351
340
  ord_ind = np.min(ori_orders)
352
341
  mask[ind] = 1
353
- new_block_info.append(
354
- (rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
342
+ new_block_info.append((rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
355
343
 
356
344
  elif np.all(mask[start:end] == 0):
357
345
  rect = merge_rects(text_rects[start:end])
@@ -367,16 +355,14 @@ class PDFWithSemanticLoader(BasePDFLoader):
367
355
  rs = rs[arg_ind]
368
356
 
369
357
  mask[start:end] = 1
370
- new_block_info.append(
371
- (rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
358
+ new_block_info.append((rect[0], rect[1], rect[2], rect[3], ts, rs, ord_ind))
372
359
 
373
360
  for i in range(texts_cnt):
374
361
  if mask[i] == 0:
375
362
  b = blocks[i]
376
363
  r = np.asarray([b[0], b[1], b[2], b[3]])
377
364
  ord_ind = b[-2]
378
- new_block_info.append(
379
- (b[0], b[1], b[2], b[3], [texts[i]], [r], ord_ind))
365
+ new_block_info.append((b[0], b[1], b[2], b[3], [texts[i]], [r], ord_ind))
380
366
 
381
367
  if self.with_columns:
382
368
  new_blocks = sorted(new_block_info, key=lambda x: x[-1])
@@ -430,8 +416,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
430
416
  for label, b in zip(texts_labels, new_blocks):
431
417
  if label in effective_class_inds:
432
418
  text = join_lines(b[4], label == TABLE_ID)
433
- filtered_blocks.append(
434
- (b[0], b[1], b[2], b[3], text, b[5], label))
419
+ filtered_blocks.append((b[0], b[1], b[2], b[3], text, b[5], label))
435
420
 
436
421
  # print('---filtered_blocks---')
437
422
  # for b in filtered_blocks:
@@ -539,8 +524,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
539
524
  if (c0 > LINE_FULL_THRESHOLD and c1 < START_THRESHOLD
540
525
  and c2 < SIMI_HEIGHT_THRESHOLD):
541
526
  new_text = join_lines([b0[4], b1[4]])
542
- new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
543
- b0[6])
527
+ new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5], b0[6])
544
528
  groups[i - 1][-1] = new_block
545
529
  groups[i].pop(0)
546
530
 
@@ -549,8 +533,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
549
533
  c0 = (r1_w - r0_w) / r1_h
550
534
  if c0 < SIMI_WIDTH_THRESHOLD:
551
535
  new_text = join_lines([b0[4], b1[4]], True)
552
- new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5],
553
- b0[6])
536
+ new_block = (b0[0], b0[1], b0[2], b0[3], new_text, b0[5], b0[6])
554
537
  groups[i - 1][-1] = new_block
555
538
  groups[i].pop(0)
556
539
 
@@ -559,10 +542,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
559
542
  return groups
560
543
 
561
544
  def save_to_html(self, groups, output_file):
562
- styles = [
563
- 'style="background-color: #EBEBEB;"',
564
- 'style="background-color: #ABBAEA;"'
565
- ]
545
+ styles = ['style="background-color: #EBEBEB;"', 'style="background-color: #ABBAEA;"']
566
546
  idx = 0
567
547
  table_style = 'style="border:1px solid black;"'
568
548
 
@@ -578,8 +558,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
578
558
  rows = b[4].split('\n')
579
559
  content = []
580
560
  for r in rows:
581
- content.append(
582
- f'<tr><td {table_style}>{r}</td></tr>')
561
+ content.append(f'<tr><td {table_style}>{r}</td></tr>')
583
562
  elem_text = '\n'.join(content)
584
563
  text = f'<table {table_style}>{elem_text}</table>'
585
564
  else:
@@ -610,6 +589,7 @@ class PDFWithSemanticLoader(BasePDFLoader):
610
589
 
611
590
  def load(self) -> List[Document]:
612
591
  """Load given path as pages."""
592
+ import pypdfium2
613
593
  blob = Blob.from_path(self.file_path)
614
594
  start = self.start
615
595
  groups = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bisheng-langchain
3
- Version: 0.2.2.2
3
+ Version: 0.2.2.3
4
4
  Summary: bisheng langchain modules
5
5
  Home-page: https://github.com/dataelement/bisheng
6
6
  Author: DataElem
@@ -26,7 +26,6 @@ Requires-Dist: bisheng-pyautogen
26
26
  Requires-Dist: jieba ==0.42.1
27
27
  Requires-Dist: pydantic ==1.10.13
28
28
  Requires-Dist: pymupdf ==1.23.8
29
- Requires-Dist: pypdfium2 ==4.25.0
30
29
  Requires-Dist: shapely ==2.0.2
31
30
  Requires-Dist: filetype ==1.2.0
32
31
 
@@ -25,7 +25,7 @@ bisheng_langchain/chains/router/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
25
25
  bisheng_langchain/chains/router/multi_rule.py,sha256=BiFryj3-7rOxfttD-MyOkKWLCSGB9LVYd2rjOsIfQC8,375
26
26
  bisheng_langchain/chains/router/rule_router.py,sha256=R2YRUnwn7s_7DbsSn27uPn4cIV0D-5iXEORXir0tNGM,1835
27
27
  bisheng_langchain/chat_models/__init__.py,sha256=A3_KoMRp96UqHwwYX4Mt60peVNjCsuUHAixqaV44BP4,492
28
- bisheng_langchain/chat_models/host_llm.py,sha256=9FyGwpqd8SPG1UC65NBS12a6_QoRAHUbZ8kH1kFehgo,21576
28
+ bisheng_langchain/chat_models/host_llm.py,sha256=phIBLY7AoGPIIxl9saFV2XAcRvgaJ8XUJA06bqiN8Uw,21638
29
29
  bisheng_langchain/chat_models/minimax.py,sha256=JLs_f6vWD9beZYUtjD4FG28G8tZHrGUAWOwdLIuJomw,13901
30
30
  bisheng_langchain/chat_models/proxy_llm.py,sha256=wzVBZik9WC3-f7kyQ1eu3Ooibqpcocln08knf5lV1Nw,17082
31
31
  bisheng_langchain/chat_models/qwen.py,sha256=jGx_tW-LPxfegE6NvY6wID8ps2SsP813atjXnc04C-s,18841
@@ -44,7 +44,7 @@ bisheng_langchain/document_loaders/__init__.py,sha256=LuQ-zMYxde2FeiEcvVtjQqnHoz
44
44
  bisheng_langchain/document_loaders/custom_kv.py,sha256=sUKeK0e8-cCmKyj1FsR7SzBNWjo5zRwHWVS5tKVxuPs,6656
45
45
  bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
46
  bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
- bisheng_langchain/document_loaders/elem_pdf.py,sha256=64kUITkrTVJe9CH6IAVSdDVcn2Ekx2PM-jT0cdClXlo,22716
47
+ bisheng_langchain/document_loaders/elem_pdf.py,sha256=K-TXILGNFLFjavhun_MFbUF4t2_WGA3Z-kbnr75lmW8,22243
48
48
  bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=N2jCmizi9gIsAO38zWL1zOzfNhoo_XmNIX7fteSeR0Q,4883
49
49
  bisheng_langchain/document_loaders/universal_kv.py,sha256=dJF_GQGKBMUjB_kX9CSp7xZRhXgwVuGPbMIzJwPh-C0,4063
50
50
  bisheng_langchain/document_loaders/parsers/__init__.py,sha256=OOM_FJkwaU-zNS58fASw0TH8FNT6VXKb0VrvisgdrII,171
@@ -69,7 +69,7 @@ bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHn
69
69
  bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=gt_uw_fSMcEZWxbiA3V0RyA-utLOZlUY-qxdwnsfZks,12664
70
70
  bisheng_langchain/vectorstores/milvus.py,sha256=44ZbDsIxdsbUnHOpEpCdrW5zvWnYvDdAVoDKjCFoyYI,34424
71
71
  bisheng_langchain/vectorstores/retriever.py,sha256=hj4nAAl352EV_ANnU2OHJn7omCH3nBK82ydo14KqMH4,4353
72
- bisheng_langchain-0.2.2.2.dist-info/METADATA,sha256=Htg1v32q74rGVPN8Z6uSXLq7IXBSjkcmJT2ak517iJc,2333
73
- bisheng_langchain-0.2.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
74
- bisheng_langchain-0.2.2.2.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
75
- bisheng_langchain-0.2.2.2.dist-info/RECORD,,
72
+ bisheng_langchain-0.2.2.3.dist-info/METADATA,sha256=9UgVujQ3Ep2eBWGmbeoe8r-dPNqOjPxXHa0_WYSz7pw,2299
73
+ bisheng_langchain-0.2.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
74
+ bisheng_langchain-0.2.2.3.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
75
+ bisheng_langchain-0.2.2.3.dist-info/RECORD,,