bisheng-langchain 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,6 +64,10 @@ class ElemUnstructuredLoader(BasePDFLoader):
64
64
  file_path: str,
65
65
  unstructured_api_key: str = None,
66
66
  unstructured_api_url: str = None,
67
+ force_ocr: bool = False,
68
+ enable_formular: bool = True,
69
+ filter_page_header_footer: bool = False,
70
+ ocr_sdk_url: str = None,
67
71
  start: int = 0,
68
72
  n: int = None,
69
73
  verbose: bool = False,
@@ -71,6 +75,10 @@ class ElemUnstructuredLoader(BasePDFLoader):
71
75
  """Initialize with a file path."""
72
76
  self.unstructured_api_url = unstructured_api_url
73
77
  self.unstructured_api_key = unstructured_api_key
78
+ self.force_ocr = force_ocr
79
+ self.enable_formular = enable_formular
80
+ self.filter_page_header_footer = filter_page_header_footer
81
+ self.ocr_sdk_url = ocr_sdk_url,
74
82
  self.headers = {'Content-Type': 'application/json'}
75
83
  self.file_name = file_name
76
84
  self.start = start
@@ -84,9 +92,13 @@ class ElemUnstructuredLoader(BasePDFLoader):
84
92
  b64_data = base64.b64encode(open(self.file_path, 'rb').read()).decode()
85
93
  parameters = {'start': self.start, 'n': self.n}
86
94
  parameters.update(self.extra_kwargs)
95
+ # TODO: add filter_page_header_footer into payload when elt4llm is ready.
87
96
  payload = dict(filename=os.path.basename(self.file_name),
88
97
  b64_data=[b64_data],
89
98
  mode='partition',
99
+ force_ocr=self.force_ocr,
100
+ enable_formula=self.enable_formular,
101
+ ocr_sdk_url=self.ocr_sdk_url,
90
102
  parameters=parameters)
91
103
 
92
104
  resp = requests.post(self.unstructured_api_url, headers=self.headers, json=payload)
@@ -183,19 +183,20 @@ def upload_minio(
183
183
  import minio
184
184
 
185
185
  minio_client = minio.Minio(
186
- endpoint=param.get('MINIO_ENDPOINT'),
187
- access_key=param.get('MINIO_ACCESS_KEY'),
188
- secret_key=param.get('MINIO_SECRET_KEY'),
189
- secure=param.get('SCHEMA'),
190
- cert_check=param.get('CERT_CHECK'),
186
+ endpoint=param.get('endpoint'),
187
+ access_key=param.get('access_key'),
188
+ secret_key=param.get('secret_key'),
189
+ secure=param.get('schema'),
190
+ cert_check=param.get('cert_check'),
191
191
  )
192
192
  minio_share = minio.Minio(
193
- endpoint=param.get('MINIO_SHAREPOIN'),
194
- access_key=param.get('MINIO_ACCESS_KEY'),
195
- secret_key=param.get('MINIO_SECRET_KEY'),
196
- secure=param.get('SCHEMA'),
197
- cert_check=param.get('CERT_CHECK'),
193
+ endpoint=param.get('sharepoint'),
194
+ access_key=param.get('access_key'),
195
+ secret_key=param.get('secret_key'),
196
+ secure=param.get('schema'),
197
+ cert_check=param.get('cert_check'),
198
198
  )
199
+ bucket = param.get('tmp_bucket', 'tmp-dir')
199
200
  logger.debug(
200
201
  'upload_file obj={} bucket={} file_paht={}',
201
202
  object_name,
@@ -1,5 +1,5 @@
1
1
  from bisheng_langchain.chat_models import ChatQWen
2
- from langchain.chains import LLMChain
2
+ from langchain.chains.llm import LLMChain
3
3
  from langchain.prompts.chat import (
4
4
  ChatPromptTemplate,
5
5
  SystemMessagePromptTemplate,
@@ -15,22 +15,32 @@ human_template = """
15
15
  """
16
16
 
17
17
  messages = [
18
- SystemMessagePromptTemplate.from_template(system_template),
19
- HumanMessagePromptTemplate.from_template(human_template),
20
- ]
18
+ SystemMessagePromptTemplate.from_template(system_template),
19
+ HumanMessagePromptTemplate.from_template(human_template),
20
+ ]
21
21
  title_extract_prompt = ChatPromptTemplate.from_messages(messages)
22
22
 
23
23
 
24
- def extract_title(llm, text, max_length=7000) -> str:
25
- chain = LLMChain(llm=llm, prompt=title_extract_prompt)
24
+ def extract_title(llm, text, max_length=7000, abstract_prompt: str = None) -> str:
25
+ """
26
+ 此方法在bisheng_langchain模型的还有两处调用用,在不能提供abstract_propmpt的情况下
27
+ 使用原来现有提示词.
28
+ """
29
+ if abstract_prompt:
30
+ updated_messages = [
31
+ SystemMessagePromptTemplate.from_template(abstract_prompt),
32
+ HumanMessagePromptTemplate.from_template(human_template),
33
+ ]
34
+ updated_title_extract_prompt = ChatPromptTemplate.from_messages(updated_messages)
35
+ chain = LLMChain(llm=llm, prompt=updated_title_extract_prompt)
36
+ else:
37
+ chain = LLMChain(llm=llm, prompt=title_extract_prompt)
26
38
  ans = chain.run(context=text[:max_length])
27
- return ans
39
+ return ans
28
40
 
29
41
 
30
- if __name__ == '__main__':
31
- llm = ChatQWen(model_name='qwen1.5-72b-chat',
32
- api_key='',
33
- temperature=0.01)
42
+ if __name__ == "__main__":
43
+ llm = ChatQWen(model_name="qwen1.5-72b-chat", api_key="", temperature=0.01)
34
44
  text = "江苏蔚蓝锂芯股份有限公司\n2021 年年度报告 \n2022 年 03 月\n\n 第一节 重要提示、目录和释义\n公司董事会、监事会及董事、监事、高级管理人员保证年度报告内容的真实、准确、完整,不存在虚假记载、误导性陈述或重大遗漏,并承担个别和连带的法律责任。\n公司负责人 CHEN KAI、主管会计工作负责人林文华及会计机构负责人(会计主管人员)张宗红声明:保证本年度报告中财务报告的真实、准确、完整。\n所有董事均已出席了审议本报告的董事会会议。"
35
45
  ans = extract_title(llm, text)
36
- print(ans)
46
+ print(ans)
@@ -106,6 +106,7 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
106
106
  self._separator_rule = separator_rule or ['after' for _ in range(4)]
107
107
  self.separator_rule = {one: self._separator_rule[index] for index, one in enumerate(separators)}
108
108
  self._is_separator_regex = is_separator_regex
109
+ self._chunk_overlap = kwargs.get('chunk_overlap', 0)
109
110
 
110
111
  def split_documents(self, documents: Iterable[Document]) -> List[Document]:
111
112
  texts, metadatas = [], []
@@ -167,7 +168,6 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
167
168
  documents = []
168
169
  for i, text in enumerate(texts):
169
170
  index = -1
170
- # metadata = copy.deepcopy(_metadatas[i])
171
171
  indexes = metadatas[i].get('indexes', [])
172
172
  pages = metadatas[i].get('pages', [])
173
173
  types = metadatas[i].get('types', [])
@@ -215,7 +215,7 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
215
215
  # for elem in box_no_duplicates:
216
216
  # new_metadata['chunk_bboxes'].append(
217
217
  # {'page': elem[0], 'bbox': new_metadata['bboxes'][elem[1]]})
218
-
219
218
  new_doc = Document(page_content=chunk, metadata=new_metadata)
219
+ prev_document = new_doc
220
220
  documents.append(new_doc)
221
221
  return documents
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bisheng-langchain
3
- Version: 1.2.1
3
+ Version: 1.3.0
4
4
  Summary: bisheng langchain modules
5
5
  Home-page: https://github.com/dataelement/bisheng
6
6
  Author: DataElem
@@ -1,5 +1,5 @@
1
1
  bisheng_langchain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- bisheng_langchain/text_splitter.py,sha256=yYpzMa0c1tRZNGyuFFJsLvSeqberYHV2-BR28pQim8I,8794
2
+ bisheng_langchain/text_splitter.py,sha256=HJFGo6g_JGxWCAat98VIlKAKfaIYaR6f-g7SJwvhcRM,8840
3
3
  bisheng_langchain/agents/__init__.py,sha256=ctsKj77fS8qlkhz_9sS_AhCjFvFNxEpJ9KBYVrApLRg,226
4
4
  bisheng_langchain/agents/chatglm_functions_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  bisheng_langchain/agents/chatglm_functions_agent/base.py,sha256=IxPf9_atUKy8UMgIDYrgmWBkrVfOLdyJSDfVcD_rsDg,13724
@@ -54,7 +54,7 @@ bisheng_langchain/document_loaders/custom_kv.py,sha256=-7h7QqGUFPhpNYAUZBDmkr_pD
54
54
  bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
55
  bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  bisheng_langchain/document_loaders/elem_pdf.py,sha256=WpRIStBl1DUDa0NCd594gKU2NsgURRv5jnKSM71ZPI8,22273
57
- bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=SA02ClxYb7DSfBdAA9bYT8oxMETTsp5GdHPkAH1Wkk8,8075
57
+ bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=AIKnApzovohheEJFMolE7J-tGY_ClUfiGBb-SwYuC60,8697
58
58
  bisheng_langchain/document_loaders/universal_kv.py,sha256=7z19Z_NwtILmtkbIURf4qMyEJGjlE-5CkhqF2KFGc7I,4134
59
59
  bisheng_langchain/document_loaders/parsers/__init__.py,sha256=OOM_FJkwaU-zNS58fASw0TH8FNT6VXKb0VrvisgdrII,171
60
60
  bisheng_langchain/document_loaders/parsers/ellm_client.py,sha256=Y_CRYwBr-gFArOirF1b76KyI5N8eVpsLeDiIsKtYkpU,1641
@@ -104,7 +104,7 @@ bisheng_langchain/gpts/tools/bing_search/tool.py,sha256=FlaeNEiOO52YjxpXu62efaMH
104
104
  bisheng_langchain/gpts/tools/calculator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
105
  bisheng_langchain/gpts/tools/calculator/tool.py,sha256=5FFL3YAYGQqC2L7zFP3LK6zApZ4GFI9MjUa5VdQ9nvY,695
106
106
  bisheng_langchain/gpts/tools/code_interpreter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
- bisheng_langchain/gpts/tools/code_interpreter/tool.py,sha256=WK8FX1wLS8896rResE8jSZ7L5SyjAuaGiw3SWRocdQM,11428
107
+ bisheng_langchain/gpts/tools/code_interpreter/tool.py,sha256=oUhlJzn5sQzk-GTxyzq_i89Is2IYaO4hoIYcoo9N_e4,11441
108
108
  bisheng_langchain/gpts/tools/dalle_image_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
109
  bisheng_langchain/gpts/tools/dalle_image_generator/tool.py,sha256=m1c_1pVJ3cM8Qrp0wDXEgEQ_9XJo_CyCmcwRayBsIYE,7492
110
110
  bisheng_langchain/gpts/tools/get_current_time/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -126,7 +126,7 @@ bisheng_langchain/rag/bisheng_rag_chain.py,sha256=75HCIkUAMGrD1E6BaSA9dhnug137PZ
126
126
  bisheng_langchain/rag/bisheng_rag_pipeline.py,sha256=neoBK3TtuQ07_WeuJCzYlvtsDQNepUa_68NT8VCgytw,13749
127
127
  bisheng_langchain/rag/bisheng_rag_pipeline_v2.py,sha256=iOoF7mbLp9qDGPsV0fEmgph_Ba8VnECYvCPebXk8xmo,16144
128
128
  bisheng_langchain/rag/bisheng_rag_tool.py,sha256=JAxsoASwaCaGHrFlAylYOZQZ9ZdnMcfYvAaZZKvIz0g,13676
129
- bisheng_langchain/rag/extract_info.py,sha256=jtZ4Bchjv4tOaayC2MnkV-lLu3vDA0Hsk_S-ATni34g,1695
129
+ bisheng_langchain/rag/extract_info.py,sha256=QEFlAaC_9-87dl4BtTz9ciQaQhfE_XjWUnQrwnUfRiw,2215
130
130
  bisheng_langchain/rag/run_qa_gen_web.py,sha256=-fIvHNnD3lD0iNU5m0Me1GDwRjlcsB8tE5RnPtFRG2s,1840
131
131
  bisheng_langchain/rag/run_rag_evaluate_web.py,sha256=a9vMhq-ZhEiHHr43uKUzKtjdk280uAP_UHQW_eOaQMw,2224
132
132
  bisheng_langchain/rag/utils.py,sha256=ecl4sDR8iUrVCBRPAAT0hZOHkH50-TLS3567GLP1sRM,7122
@@ -166,7 +166,7 @@ bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHn
166
166
  bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=BxvT9FUTju4AZPtQFTbYLmIIKKw8bqcact5Cav_5H2I,15357
167
167
  bisheng_langchain/vectorstores/milvus.py,sha256=jWq_lce-ihOz07D1kwj5ctPzElYexNCjJ-xSv-pK1CI,37172
168
168
  bisheng_langchain/vectorstores/retriever.py,sha256=fNtk8qSwBo2Qrlt1NpZVXaNATW2tBywkyS0q0NtN5MI,4326
169
- bisheng_langchain-1.2.1.dist-info/METADATA,sha256=DFifMhbexNo46m3h6BXMn9LwUnpndls5v_X51yyu7Yk,2435
170
- bisheng_langchain-1.2.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
171
- bisheng_langchain-1.2.1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
172
- bisheng_langchain-1.2.1.dist-info/RECORD,,
169
+ bisheng_langchain-1.3.0.dist-info/METADATA,sha256=Osl1j2o-wDqBmnqjsqVEWW-sNmiNHr-mTmRpHYbFy_g,2435
170
+ bisheng_langchain-1.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
171
+ bisheng_langchain-1.3.0.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
172
+ bisheng_langchain-1.3.0.dist-info/RECORD,,