bisheng-langchain 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- bisheng_langchain/document_loaders/elem_unstrcutured_loader.py +12 -0
- bisheng_langchain/gpts/tools/code_interpreter/tool.py +11 -10
- bisheng_langchain/rag/extract_info.py +22 -12
- bisheng_langchain/text_splitter.py +2 -2
- {bisheng_langchain-1.2.1.dist-info → bisheng_langchain-1.3.0.dist-info}/METADATA +1 -1
- {bisheng_langchain-1.2.1.dist-info → bisheng_langchain-1.3.0.dist-info}/RECORD +8 -8
- {bisheng_langchain-1.2.1.dist-info → bisheng_langchain-1.3.0.dist-info}/WHEEL +0 -0
- {bisheng_langchain-1.2.1.dist-info → bisheng_langchain-1.3.0.dist-info}/top_level.txt +0 -0
@@ -64,6 +64,10 @@ class ElemUnstructuredLoader(BasePDFLoader):
|
|
64
64
|
file_path: str,
|
65
65
|
unstructured_api_key: str = None,
|
66
66
|
unstructured_api_url: str = None,
|
67
|
+
force_ocr: bool = False,
|
68
|
+
enable_formular: bool = True,
|
69
|
+
filter_page_header_footer: bool = False,
|
70
|
+
ocr_sdk_url: str = None,
|
67
71
|
start: int = 0,
|
68
72
|
n: int = None,
|
69
73
|
verbose: bool = False,
|
@@ -71,6 +75,10 @@ class ElemUnstructuredLoader(BasePDFLoader):
|
|
71
75
|
"""Initialize with a file path."""
|
72
76
|
self.unstructured_api_url = unstructured_api_url
|
73
77
|
self.unstructured_api_key = unstructured_api_key
|
78
|
+
self.force_ocr = force_ocr
|
79
|
+
self.enable_formular = enable_formular
|
80
|
+
self.filter_page_header_footer = filter_page_header_footer
|
81
|
+
self.ocr_sdk_url = ocr_sdk_url,
|
74
82
|
self.headers = {'Content-Type': 'application/json'}
|
75
83
|
self.file_name = file_name
|
76
84
|
self.start = start
|
@@ -84,9 +92,13 @@ class ElemUnstructuredLoader(BasePDFLoader):
|
|
84
92
|
b64_data = base64.b64encode(open(self.file_path, 'rb').read()).decode()
|
85
93
|
parameters = {'start': self.start, 'n': self.n}
|
86
94
|
parameters.update(self.extra_kwargs)
|
95
|
+
# TODO: add filter_page_header_footer into payload when elt4llm is ready.
|
87
96
|
payload = dict(filename=os.path.basename(self.file_name),
|
88
97
|
b64_data=[b64_data],
|
89
98
|
mode='partition',
|
99
|
+
force_ocr=self.force_ocr,
|
100
|
+
enable_formula=self.enable_formular,
|
101
|
+
ocr_sdk_url=self.ocr_sdk_url,
|
90
102
|
parameters=parameters)
|
91
103
|
|
92
104
|
resp = requests.post(self.unstructured_api_url, headers=self.headers, json=payload)
|
@@ -183,19 +183,20 @@ def upload_minio(
|
|
183
183
|
import minio
|
184
184
|
|
185
185
|
minio_client = minio.Minio(
|
186
|
-
endpoint=param.get('
|
187
|
-
access_key=param.get('
|
188
|
-
secret_key=param.get('
|
189
|
-
secure=param.get('
|
190
|
-
cert_check=param.get('
|
186
|
+
endpoint=param.get('endpoint'),
|
187
|
+
access_key=param.get('access_key'),
|
188
|
+
secret_key=param.get('secret_key'),
|
189
|
+
secure=param.get('schema'),
|
190
|
+
cert_check=param.get('cert_check'),
|
191
191
|
)
|
192
192
|
minio_share = minio.Minio(
|
193
|
-
endpoint=param.get('
|
194
|
-
access_key=param.get('
|
195
|
-
secret_key=param.get('
|
196
|
-
secure=param.get('
|
197
|
-
cert_check=param.get('
|
193
|
+
endpoint=param.get('sharepoint'),
|
194
|
+
access_key=param.get('access_key'),
|
195
|
+
secret_key=param.get('secret_key'),
|
196
|
+
secure=param.get('schema'),
|
197
|
+
cert_check=param.get('cert_check'),
|
198
198
|
)
|
199
|
+
bucket = param.get('tmp_bucket', 'tmp-dir')
|
199
200
|
logger.debug(
|
200
201
|
'upload_file obj={} bucket={} file_paht={}',
|
201
202
|
object_name,
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from bisheng_langchain.chat_models import ChatQWen
|
2
|
-
from langchain.chains import LLMChain
|
2
|
+
from langchain.chains.llm import LLMChain
|
3
3
|
from langchain.prompts.chat import (
|
4
4
|
ChatPromptTemplate,
|
5
5
|
SystemMessagePromptTemplate,
|
@@ -15,22 +15,32 @@ human_template = """
|
|
15
15
|
"""
|
16
16
|
|
17
17
|
messages = [
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
SystemMessagePromptTemplate.from_template(system_template),
|
19
|
+
HumanMessagePromptTemplate.from_template(human_template),
|
20
|
+
]
|
21
21
|
title_extract_prompt = ChatPromptTemplate.from_messages(messages)
|
22
22
|
|
23
23
|
|
24
|
-
def extract_title(llm, text, max_length=7000) -> str:
|
25
|
-
|
24
|
+
def extract_title(llm, text, max_length=7000, abstract_prompt: str = None) -> str:
|
25
|
+
"""
|
26
|
+
此方法在bisheng_langchain模型的还有两处调用用,在不能提供abstract_propmpt的情况下
|
27
|
+
使用原来现有提示词.
|
28
|
+
"""
|
29
|
+
if abstract_prompt:
|
30
|
+
updated_messages = [
|
31
|
+
SystemMessagePromptTemplate.from_template(abstract_prompt),
|
32
|
+
HumanMessagePromptTemplate.from_template(human_template),
|
33
|
+
]
|
34
|
+
updated_title_extract_prompt = ChatPromptTemplate.from_messages(updated_messages)
|
35
|
+
chain = LLMChain(llm=llm, prompt=updated_title_extract_prompt)
|
36
|
+
else:
|
37
|
+
chain = LLMChain(llm=llm, prompt=title_extract_prompt)
|
26
38
|
ans = chain.run(context=text[:max_length])
|
27
|
-
return ans
|
39
|
+
return ans
|
28
40
|
|
29
41
|
|
30
|
-
if __name__ ==
|
31
|
-
llm = ChatQWen(model_name=
|
32
|
-
api_key='',
|
33
|
-
temperature=0.01)
|
42
|
+
if __name__ == "__main__":
|
43
|
+
llm = ChatQWen(model_name="qwen1.5-72b-chat", api_key="", temperature=0.01)
|
34
44
|
text = "江苏蔚蓝锂芯股份有限公司\n2021 年年度报告 \n2022 年 03 月\n\n 第一节 重要提示、目录和释义\n公司董事会、监事会及董事、监事、高级管理人员保证年度报告内容的真实、准确、完整,不存在虚假记载、误导性陈述或重大遗漏,并承担个别和连带的法律责任。\n公司负责人 CHEN KAI、主管会计工作负责人林文华及会计机构负责人(会计主管人员)张宗红声明:保证本年度报告中财务报告的真实、准确、完整。\n所有董事均已出席了审议本报告的董事会会议。"
|
35
45
|
ans = extract_title(llm, text)
|
36
|
-
print(ans)
|
46
|
+
print(ans)
|
@@ -106,6 +106,7 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
|
106
106
|
self._separator_rule = separator_rule or ['after' for _ in range(4)]
|
107
107
|
self.separator_rule = {one: self._separator_rule[index] for index, one in enumerate(separators)}
|
108
108
|
self._is_separator_regex = is_separator_regex
|
109
|
+
self._chunk_overlap = kwargs.get('chunk_overlap', 0)
|
109
110
|
|
110
111
|
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
|
111
112
|
texts, metadatas = [], []
|
@@ -167,7 +168,6 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
|
167
168
|
documents = []
|
168
169
|
for i, text in enumerate(texts):
|
169
170
|
index = -1
|
170
|
-
# metadata = copy.deepcopy(_metadatas[i])
|
171
171
|
indexes = metadatas[i].get('indexes', [])
|
172
172
|
pages = metadatas[i].get('pages', [])
|
173
173
|
types = metadatas[i].get('types', [])
|
@@ -215,7 +215,7 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
|
215
215
|
# for elem in box_no_duplicates:
|
216
216
|
# new_metadata['chunk_bboxes'].append(
|
217
217
|
# {'page': elem[0], 'bbox': new_metadata['bboxes'][elem[1]]})
|
218
|
-
|
219
218
|
new_doc = Document(page_content=chunk, metadata=new_metadata)
|
219
|
+
prev_document = new_doc
|
220
220
|
documents.append(new_doc)
|
221
221
|
return documents
|
@@ -1,5 +1,5 @@
|
|
1
1
|
bisheng_langchain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
bisheng_langchain/text_splitter.py,sha256=
|
2
|
+
bisheng_langchain/text_splitter.py,sha256=HJFGo6g_JGxWCAat98VIlKAKfaIYaR6f-g7SJwvhcRM,8840
|
3
3
|
bisheng_langchain/agents/__init__.py,sha256=ctsKj77fS8qlkhz_9sS_AhCjFvFNxEpJ9KBYVrApLRg,226
|
4
4
|
bisheng_langchain/agents/chatglm_functions_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
bisheng_langchain/agents/chatglm_functions_agent/base.py,sha256=IxPf9_atUKy8UMgIDYrgmWBkrVfOLdyJSDfVcD_rsDg,13724
|
@@ -54,7 +54,7 @@ bisheng_langchain/document_loaders/custom_kv.py,sha256=-7h7QqGUFPhpNYAUZBDmkr_pD
|
|
54
54
|
bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
55
|
bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
bisheng_langchain/document_loaders/elem_pdf.py,sha256=WpRIStBl1DUDa0NCd594gKU2NsgURRv5jnKSM71ZPI8,22273
|
57
|
-
bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=
|
57
|
+
bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=AIKnApzovohheEJFMolE7J-tGY_ClUfiGBb-SwYuC60,8697
|
58
58
|
bisheng_langchain/document_loaders/universal_kv.py,sha256=7z19Z_NwtILmtkbIURf4qMyEJGjlE-5CkhqF2KFGc7I,4134
|
59
59
|
bisheng_langchain/document_loaders/parsers/__init__.py,sha256=OOM_FJkwaU-zNS58fASw0TH8FNT6VXKb0VrvisgdrII,171
|
60
60
|
bisheng_langchain/document_loaders/parsers/ellm_client.py,sha256=Y_CRYwBr-gFArOirF1b76KyI5N8eVpsLeDiIsKtYkpU,1641
|
@@ -104,7 +104,7 @@ bisheng_langchain/gpts/tools/bing_search/tool.py,sha256=FlaeNEiOO52YjxpXu62efaMH
|
|
104
104
|
bisheng_langchain/gpts/tools/calculator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
105
|
bisheng_langchain/gpts/tools/calculator/tool.py,sha256=5FFL3YAYGQqC2L7zFP3LK6zApZ4GFI9MjUa5VdQ9nvY,695
|
106
106
|
bisheng_langchain/gpts/tools/code_interpreter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
107
|
-
bisheng_langchain/gpts/tools/code_interpreter/tool.py,sha256=
|
107
|
+
bisheng_langchain/gpts/tools/code_interpreter/tool.py,sha256=oUhlJzn5sQzk-GTxyzq_i89Is2IYaO4hoIYcoo9N_e4,11441
|
108
108
|
bisheng_langchain/gpts/tools/dalle_image_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
109
|
bisheng_langchain/gpts/tools/dalle_image_generator/tool.py,sha256=m1c_1pVJ3cM8Qrp0wDXEgEQ_9XJo_CyCmcwRayBsIYE,7492
|
110
110
|
bisheng_langchain/gpts/tools/get_current_time/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -126,7 +126,7 @@ bisheng_langchain/rag/bisheng_rag_chain.py,sha256=75HCIkUAMGrD1E6BaSA9dhnug137PZ
|
|
126
126
|
bisheng_langchain/rag/bisheng_rag_pipeline.py,sha256=neoBK3TtuQ07_WeuJCzYlvtsDQNepUa_68NT8VCgytw,13749
|
127
127
|
bisheng_langchain/rag/bisheng_rag_pipeline_v2.py,sha256=iOoF7mbLp9qDGPsV0fEmgph_Ba8VnECYvCPebXk8xmo,16144
|
128
128
|
bisheng_langchain/rag/bisheng_rag_tool.py,sha256=JAxsoASwaCaGHrFlAylYOZQZ9ZdnMcfYvAaZZKvIz0g,13676
|
129
|
-
bisheng_langchain/rag/extract_info.py,sha256=
|
129
|
+
bisheng_langchain/rag/extract_info.py,sha256=QEFlAaC_9-87dl4BtTz9ciQaQhfE_XjWUnQrwnUfRiw,2215
|
130
130
|
bisheng_langchain/rag/run_qa_gen_web.py,sha256=-fIvHNnD3lD0iNU5m0Me1GDwRjlcsB8tE5RnPtFRG2s,1840
|
131
131
|
bisheng_langchain/rag/run_rag_evaluate_web.py,sha256=a9vMhq-ZhEiHHr43uKUzKtjdk280uAP_UHQW_eOaQMw,2224
|
132
132
|
bisheng_langchain/rag/utils.py,sha256=ecl4sDR8iUrVCBRPAAT0hZOHkH50-TLS3567GLP1sRM,7122
|
@@ -166,7 +166,7 @@ bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHn
|
|
166
166
|
bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=BxvT9FUTju4AZPtQFTbYLmIIKKw8bqcact5Cav_5H2I,15357
|
167
167
|
bisheng_langchain/vectorstores/milvus.py,sha256=jWq_lce-ihOz07D1kwj5ctPzElYexNCjJ-xSv-pK1CI,37172
|
168
168
|
bisheng_langchain/vectorstores/retriever.py,sha256=fNtk8qSwBo2Qrlt1NpZVXaNATW2tBywkyS0q0NtN5MI,4326
|
169
|
-
bisheng_langchain-1.
|
170
|
-
bisheng_langchain-1.
|
171
|
-
bisheng_langchain-1.
|
172
|
-
bisheng_langchain-1.
|
169
|
+
bisheng_langchain-1.3.0.dist-info/METADATA,sha256=Osl1j2o-wDqBmnqjsqVEWW-sNmiNHr-mTmRpHYbFy_g,2435
|
170
|
+
bisheng_langchain-1.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
171
|
+
bisheng_langchain-1.3.0.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
|
172
|
+
bisheng_langchain-1.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|