bisheng-langchain 0.3.4.dev3__py3-none-any.whl → 0.3.5.dev1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- bisheng_langchain/document_loaders/elem_unstrcutured_loader.py +24 -5
- bisheng_langchain/gpts/tools/api_tools/sina.py +8 -2
- bisheng_langchain/text_splitter.py +68 -56
- bisheng_langchain/vectorstores/milvus.py +14 -3
- {bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/METADATA +2 -2
- {bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/RECORD +8 -8
- {bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/WHEEL +0 -0
- {bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/top_level.txt +0 -0
bisheng_langchain/document_loaders/elem_unstrcutured_loader.py
@@ -34,7 +34,10 @@ def merge_partitions(partitions):
         elif label == 'Table':
             doc_content.append('\n\n' + text)
         else:
-            doc_content.append(text_elem_sep + text)
+            if last_label == 'Table':
+                doc_content.append(text_elem_sep * 2 + text)
+            else:
+                doc_content.append(text_elem_sep + text)

         last_label = label
         metadata['bboxes'].extend(list(map(lambda x: list(map(int, x)), extra_data['bboxes'])))
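In isolation, the new branch pads text that immediately follows a table with a doubled element separator. A minimal sketch of that logic, with `text_elem_sep = '\n'` assumed (the name comes from the hunk, the value does not):

    # Standalone sketch of merge_partitions' separator choice after this change.
    # Only the branch structure comes from the diff; everything else is scaffolding.
    text_elem_sep = '\n'  # assumed value

    def join_elements(labeled_texts):
        """labeled_texts: [(label, text), ...] in reading order."""
        doc_content = []
        last_label = ''
        for label, text in labeled_texts:
            if label == 'Table':
                doc_content.append('\n\n' + text)
            elif last_label == 'Table':
                # new in 0.3.5.dev1: double separator right after a table
                doc_content.append(text_elem_sep * 2 + text)
            else:
                doc_content.append(text_elem_sep + text)
            last_label = label
        return ''.join(doc_content)

    print(repr(join_elements([('Text', 'a'), ('Table', '|x|'), ('Text', 'b')])))
    # '\na\n\n|x|\n\nb': the text after the table now gets two newlines instead of one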
@@ -73,6 +76,7 @@ class ElemUnstructuredLoader(BasePDFLoader):
         self.start = start
         self.n = n
         self.extra_kwargs = kwargs
+        self.partitions = None
         super().__init__(file_path)

     def load(self) -> List[Document]:
@@ -93,18 +97,33 @@ class ElemUnstructuredLoader(BasePDFLoader):
         resp = resp.json()
         if 200 != resp.get('status_code'):
             logger.info(f'file partition {os.path.basename(self.file_name)} error resp={resp}')
+            raise Exception(f'file partition error {os.path.basename(self.file_name)} error resp={resp}')
         partitions = resp['partitions']
-        if
-        logger.info(f'
+        if partitions:
+            logger.info(f'content_from_partitions')
+            self.partitions = partitions
+            content, metadata = merge_partitions(partitions)
+        elif resp.get('text'):
+            logger.info(f'content_from_text')
+            content = resp['text']
+            metadata = {
+                "bboxes": [],
+                "pages": [],
+                "indexes": [],
+                "types": [],
+            }
+        else:
+            logger.warning(f'content_is_empty resp={resp}')
+            content = ''
+            metadata = {}
+
         logger.info(f'unstruct_return code={resp.get("status_code")}')

         if resp.get('b64_pdf'):
             with open(self.file_path, 'wb') as f:
                 f.write(base64.b64decode(resp['b64_pdf']))

-        content, metadata = merge_partitions(partitions)
         metadata['source'] = self.file_name
-
         doc = Document(page_content=content, metadata=metadata)
         return [doc]
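The three new branches in load() form an explicit fallback chain: structured partitions first, then the service's plain-text field, then empty content. A condensed sketch of just that decision, assuming resp is the parsed JSON shown above and reusing the module's merge_partitions (pick_content is a hypothetical helper name):

    def pick_content(resp):
        # Mirrors the branch order in load(): partitions, then text, then empty.
        if resp.get('partitions'):
            return merge_partitions(resp['partitions'])  # -> (content, metadata)
        if resp.get('text'):
            return resp['text'], {'bboxes': [], 'pages': [], 'indexes': [], 'types': []}
        return '', {}

Note that in the empty branch metadata is {}, so the later metadata['source'] assignment still works, but the bbox-related keys that downstream consumers may expect are absent.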
bisheng_langchain/gpts/tools/api_tools/sina.py
@@ -145,7 +145,10 @@ class StockInfo(APIToolBase):
         if resp.status_code != 200:
             logger.info('api_call_fail res={}', resp.text)
         k_data = resp.text
-
+        k_data = kLinePattern.search(k_data)
+        if not k_data:
+            return '{}'
+        data_array = json.loads(k_data.group(1))
         for item in data_array:
             if item.get('day') == date:
                 return json.dumps(item)
@@ -173,7 +176,10 @@ class StockInfo(APIToolBase):
         count = datetime.today() - date_obj
         url = self.url.format(stockName=stock_number, stock=stock, count=count.days)
         k_data = await self.async_client.aget(url)
-
+        k_data = kLinePattern.search(k_data)
+        if not k_data:
+            return '{}'
+        data_array = json.loads(k_data.group(1))
         for item in data_array:
             if item.get('day') == date:
                 return json.dumps(item)
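Both hunks add the same guard: kLinePattern.search returns None when the Sina payload does not match, and the previous code would then fail on the subsequent .group(1). The pattern itself is defined elsewhere in sina.py and is not part of this diff; a sketch of the guarded flow with a stand-in pattern:

    import json
    import re

    # Stand-in only: the real kLinePattern is defined outside this diff.
    kLinePattern = re.compile(r'=\((\[.*\])\)', re.S)

    def extract_day(payload: str, date: str) -> str:
        m = kLinePattern.search(payload)
        if not m:
            return '{}'  # 0.3.5.dev1: empty JSON object instead of an AttributeError
        data_array = json.loads(m.group(1))
        for item in data_array:
            if item.get('day') == date:
                return json.dumps(item)
        return '{}'  # scaffolding; the diff does not show the original fallthrough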
bisheng_langchain/text_splitter.py
@@ -21,17 +21,22 @@ logger = logging.getLogger(__name__)


 def _split_text_with_regex(
-    text: str, separator: str, keep_separator: bool
+    text: str, separator: str, keep_separator: bool, separator_rule: str
 ) -> List[str]:
     # Now that we have the separator, split the text
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
             _splits = re.split(f'({separator})', text)
-            splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
-            if len(_splits) % 2 == 0:
-                splits += _splits[-1:]
-            splits = [_splits[0]] + splits
+
+            if separator_rule == "before":
+                splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+                if len(_splits) % 2 == 0:
+                    splits += _splits[-1:]
+                splits = [_splits[0]] + splits
+            else:
+                splits = [_splits[i-1] + _splits[i] for i in range(1, len(_splits), 2)]
+                splits = splits + [_splits[-1]]
         else:
             splits = re.split(separator, text)
     else:
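The new separator_rule argument decides which side the kept separator attaches to: 'before' reproduces the branch that was previously the only behavior (separator glued to the front of the following piece), while 'after' glues it to the end of the preceding piece, the natural choice for sentence-ending punctuation such as '。'. A standalone copy of the two branches for a quick sanity check:

    import re
    from typing import List

    # Standalone copy of the two branches above, plus the usual empty-string filter.
    def split_keeping_separator(text: str, separator: str, rule: str) -> List[str]:
        _splits = re.split(f'({separator})', text)
        if rule == 'before':
            splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
            if len(_splits) % 2 == 0:
                splits += _splits[-1:]
            splits = [_splits[0]] + splits
        else:  # 'after'
            splits = [_splits[i - 1] + _splits[i] for i in range(1, len(_splits), 2)]
            splits = splits + [_splits[-1]]
        return [s for s in splits if s != '']

    print(split_keeping_separator('a。b。c', '。', 'before'))  # ['a', '。b', '。c']
    print(split_keeping_separator('a。b。c', '。', 'after'))   # ['a。', 'b。', 'c']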
@@ -82,11 +87,14 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
     """
     todo
     """
+
     def __init__(
-        self,
-        separators: Optional[List[str]] = None,
-        is_separator_regex: bool = False,
-        **kwargs: Any,
+        self,
+        separators: Optional[List[str]] = None,
+        separator_rule: Optional[List[str]] = None,
+        is_separator_regex: bool = False,
+        keep_separator: bool = True,
+        **kwargs: Any,
     ) -> None:
         """Create a new TextSplitter."""
         super().__init__(
@@ -95,7 +103,9 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
             **kwargs
         )
         self._separators = separators or ['\n\n', '\n', ' ', '']
-        self._is_separator_regex = is_separator_regex
+        self._separator_rule = separator_rule or ['after' for _ in range(4)]
+        self.separator_rule = {one: self._separator_rule[index] for index, one in enumerate(separators)}
+        self._is_separator_regex = is_separator_regex

     def split_documents(self, documents: Iterable[Document]) -> List[Document]:
         texts, metadatas = [], []
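Construction now pairs each separator with a rule by position. A usage sketch (the parameter names come from the hunk; the separator values and chunk sizes are illustrative):

    from bisheng_langchain.text_splitter import ElemCharacterTextSplitter

    splitter = ElemCharacterTextSplitter(
        separators=['\n\n', '\n', '。', ' '],
        separator_rule=['after', 'after', 'after', 'before'],
        chunk_size=500,
        chunk_overlap=50,
    )

Two details of the assignment block are worth noting: the dict comprehension enumerates the separators argument directly, so passing separators=None (which the signature otherwise allows) raises a TypeError, and a separators list longer than separator_rule raises an IndexError.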
@@ -110,19 +120,21 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
         final_chunks = []
         # Get appropriate separator to use
         separator = separators[-1]
+        separator_rule = 'after'
         new_separators = []
         for i, _s in enumerate(separators):
             _separator = _s if self._is_separator_regex else re.escape(_s)
+            separator_rule = self.separator_rule[_s]
             if _s == '':
                 separator = _s
                 break
             if re.search(_separator, text):
                 separator = _s
-                new_separators = separators[i + 1 :]
+                new_separators = separators[i + 1:]
                 break

         _separator = separator if self._is_separator_regex else re.escape(separator)
-        splits = _split_text_with_regex(text, _separator, self._keep_separator)
+        splits = _split_text_with_regex(text, _separator, self._keep_separator, separator_rule)

         # Now go merging things, recursively splitting longer texts.
         _good_splits = []
@@ -149,60 +161,60 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
         return self._split_text(text, self._separators)

     def create_documents(
-        self, texts: List[str], metadatas: Optional[List[dict]] = None
+        self, texts: List[str], metadatas: Optional[List[dict]] = None
     ) -> List[Document]:
         """Create documents from a list of texts."""
         documents = []
         for i, text in enumerate(texts):
             index = -1
             # metadata = copy.deepcopy(_metadatas[i])
-            indexes = metadatas[i]['indexes']
-            pages = metadatas[i]['pages']
-            types = metadatas[i]['types']
-            bboxes = metadatas[i]['bboxes']
+            indexes = metadatas[i].get('indexes', [])
+            pages = metadatas[i].get('pages', [])
+            types = metadatas[i].get('types', [])
+            bboxes = metadatas[i].get('bboxes', [])
             searcher = IntervalSearch(indexes)
             split_texts = self.split_text(text)
             for chunk in split_texts:
                 new_metadata = copy.deepcopy(metadatas[i])
-                … (39 removed lines not rendered in the registry diff view)
+                if indexes and bboxes:
+                    index = text.find(chunk, index + 1)
+                    inter0 = [index, index + len(chunk) - 1]
+                    norm_inter = searcher.find(inter0)
+                    new_metadata['chunk_bboxes'] = []
+                    for j in range(norm_inter[0], norm_inter[1] + 1):
+                        new_metadata['chunk_bboxes'].append(
+                            {'page': pages[j], 'bbox': bboxes[j]})
+
+                    c = Counter([types[j] for j in norm_inter])
+                    chunk_type = c.most_common(1)[0][0]
+                    new_metadata['chunk_type'] = chunk_type
+                    new_metadata['source'] = metadatas[i].get('source', '')
+
+                # for chunk in split_texts:
+                #     new_metadata = {}
+                #     new_metadata['chunk_type'] = metadata.get('chunk_type', 'paragraph')
+                #     new_metadata['bboxes'] = metadata.get('bboxes', [])
+                #     new_metadata['source'] = metadata.get('source', '')
+                #     # chunk's start index in text
+                #     index = text.find(chunk, index + 1)
+                #     new_metadata['start'] = metadata.get('start', 0) + index
+                #     new_metadata['end'] = metadata.get('start', 0) + index + len(chunk) - 1
+
+                #     if 'page' in metadata:
+                #         new_metadata['page'] = metadata['page'][new_metadata['start']:new_metadata['end']+1]
+                #     if 'token_to_bbox' in metadata:
+                #         new_metadata['token_to_bbox'] = metadata['token_to_bbox'][new_metadata['start']:new_metadata['end']+1]
+
+                #     if 'page' in new_metadata and 'token_to_bbox' in new_metadata:
+                #         box_no_duplicates = set()
+                #         for index in range(len(new_metadata['page'])):
+                #             box_no_duplicates.add(
+                #                 (new_metadata['page'][index], new_metadata['token_to_bbox'][index]))
+
+                #         new_metadata['chunk_bboxes'] = []
+                #         for elem in box_no_duplicates:
+                #             new_metadata['chunk_bboxes'].append(
+                #                 {'page': elem[0], 'bbox': new_metadata['bboxes'][elem[1]]})

                 new_doc = Document(page_content=chunk, metadata=new_metadata)
                 documents.append(new_doc)
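In effect, each chunk is located back in the source text with text.find, its character interval is normalized to element indexes by IntervalSearch (defined elsewhere in the package), and the overlapped elements contribute their page/bbox pairs to chunk_bboxes; chunk_type is then the most common element type, with the Counter built over the two endpoints of the normalized interval. The .get(..., []) lookups and the `if indexes and bboxes` guard make the whole block a no-op for documents whose metadata carries no layout information.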
bisheng_langchain/vectorstores/milvus.py
@@ -10,6 +10,7 @@ from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.utils import maximal_marginal_relevance
 from langchain_community.vectorstores.milvus import Milvus as MilvusLangchain
+from pymilvus.exceptions import ConnectionNotExistException

 logger = logging.getLogger(__name__)

@@ -231,7 +232,7 @@ class Milvus(MilvusLangchain):
         from pymilvus import connections
         connections.remove_connection(using)

-    def _create_connection_alias(self, connection_args: dict) -> str:
+    def _create_connection_alias(self, connection_args: dict, personal_alias: str = None) -> str:
         """Create the connection to the Milvus server."""
         from pymilvus import MilvusException, connections

@@ -269,7 +270,10 @@ class Milvus(MilvusLangchain):
                 return con[0]

         # Generate a new connection if one doesn't exist
-        alias = uuid4().hex
+        if personal_alias:
+            alias = personal_alias
+        else:
+            alias = uuid4().hex
         try:
             connections.connect(alias=alias, **connection_args)
             logger.debug('Created new connection using: %s', alias)
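The personal_alias escape hatch exists for the retry path below: add_texts can re-register the exact alias that pymilvus just reported as missing instead of minting a fresh uuid4 one, so existing references to the old alias stay valid.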
@@ -522,7 +526,14 @@ class Milvus(MilvusLangchain):
             insert_list = [insert_dict[x][i:end] for x in self.fields if x in insert_dict]
             # Insert into the collection.
             try:
-                res
+                res = self.col.insert(insert_list, timeout=timeout, **kwargs)
+                pks.extend(res.primary_keys)
+            except ConnectionNotExistException as e:
+                logger.warning("retrying connection to milvus")
+                # reconnect to milvus
+                self._create_connection_alias(self.connection_args, self.alias)
+
+                # insert data
                 res = self.col.insert(insert_list, timeout=timeout, **kwargs)
                 pks.extend(res.primary_keys)
             except MilvusException as e:
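The added handler reconnects once and retries once; a second ConnectionNotExistException raised inside the handler propagates to the caller, since exceptions raised in an except block are not caught by the later MilvusException clause of the same try. A generic sketch of the pattern with placeholder callables (not bisheng or pymilvus APIs):

    from pymilvus.exceptions import ConnectionNotExistException

    def insert_with_reconnect(do_insert, reconnect):
        # Shape of the retry above: reconnect once, then re-issue the insert.
        try:
            return do_insert()
        except ConnectionNotExistException:
            reconnect()         # re-register the alias with pymilvus
            return do_insert()  # a second failure propagates to the caller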
{bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bisheng-langchain
-Version: 0.3.4.dev3
+Version: 0.3.5.dev1
 Summary: bisheng langchain modules
 Home-page: https://github.com/dataelement/bisheng
 Author: DataElem
@@ -30,7 +30,7 @@ Requires-Dist: shapely==2.0.2
 Requires-Dist: filetype==1.2.0
 Requires-Dist: langgraph==0.0.50
 Requires-Dist: openai==1.14.3
-Requires-Dist: langchain-openai==0.1.
+Requires-Dist: langchain-openai==0.1.5
 Requires-Dist: llama-index==0.9.48
 Requires-Dist: bisheng-ragas==1.0.0

{bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/RECORD
@@ -1,5 +1,5 @@
 bisheng_langchain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bisheng_langchain/text_splitter.py,sha256=
+bisheng_langchain/text_splitter.py,sha256=yYpzMa0c1tRZNGyuFFJsLvSeqberYHV2-BR28pQim8I,8794
 bisheng_langchain/agents/__init__.py,sha256=ctsKj77fS8qlkhz_9sS_AhCjFvFNxEpJ9KBYVrApLRg,226
 bisheng_langchain/agents/chatglm_functions_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/agents/chatglm_functions_agent/base.py,sha256=tyytq0XIFXpfxDP0s5QKeprKOunMqi1fHMfQ0-kOmDE,13674
@@ -54,7 +54,7 @@ bisheng_langchain/document_loaders/custom_kv.py,sha256=xWUPhcr1hjbdya4zgEHG4Fl0s
 bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/document_loaders/elem_pdf.py,sha256=K-TXILGNFLFjavhun_MFbUF4t2_WGA3Z-kbnr75lmW8,22243
-bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=
+bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=JW87AhzCY_KS_YYszyxU3GgPjxP4vWOHDfifJEpP5CI,8055
 bisheng_langchain/document_loaders/universal_kv.py,sha256=ZdIgFIc2fH2kkvJNb7j2wi6FLS_PaaatVy6z_YNV2hw,4114
 bisheng_langchain/document_loaders/parsers/__init__.py,sha256=OOM_FJkwaU-zNS58fASw0TH8FNT6VXKb0VrvisgdrII,171
 bisheng_langchain/document_loaders/parsers/ellm_client.py,sha256=Y_CRYwBr-gFArOirF1b76KyI5N8eVpsLeDiIsKtYkpU,1641
@@ -93,7 +93,7 @@ bisheng_langchain/gpts/tools/api_tools/base.py,sha256=fWQSDIOVb4JZrtJ9ML9q2ycsAa
 bisheng_langchain/gpts/tools/api_tools/flow.py,sha256=ot2YAYgQGWgUpb2nCECAmpqHY6m0SgzwkupF9kDT3lU,2461
 bisheng_langchain/gpts/tools/api_tools/macro_data.py,sha256=FyG-qtl2ECS1CDKt6olN0eDTDM91d-UvDkMDBiVLgYQ,27429
 bisheng_langchain/gpts/tools/api_tools/openapi.py,sha256=CzKt9FRkgngBcWgabD4emPqAXkAgagkD-pMjG680MTE,3903
-bisheng_langchain/gpts/tools/api_tools/sina.py,sha256=
+bisheng_langchain/gpts/tools/api_tools/sina.py,sha256=4KpK7_HUUtjpdJ-K4LjPlb-occyAZcRtmmCWqJ2BotE,9708
 bisheng_langchain/gpts/tools/api_tools/tianyancha.py,sha256=abDAz-yAH1-2rKiSmZ6TgnrNUnpgAZpDY8oDiWfWapc,6684
 bisheng_langchain/gpts/tools/bing_search/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/gpts/tools/bing_search/tool.py,sha256=v_VlqcMplITA5go5qWA4qZ5p43E1-1s0bzmyY7H0hqY,1710
@@ -151,9 +151,9 @@ bisheng_langchain/utils/azure_dalle_image_generator.py,sha256=96-_nO4hDSwyPE4rSY
 bisheng_langchain/utils/requests.py,sha256=vWGKyNTxApVeaVdKxqACfIT1Q8wMy-jC3kUv2Ce9Mzc,8688
 bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHnqrsjx700Fy11M,213
 bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=Pm1rS50GJ0HWbjBsFDgs28SVuVbjGSRPOor6yJlnE7w,13347
-bisheng_langchain/vectorstores/milvus.py,sha256=
+bisheng_langchain/vectorstores/milvus.py,sha256=xh7NokraKg_Xc9ofz0RVfJ_I36ftnprLJtV-1NfaeyQ,37162
 bisheng_langchain/vectorstores/retriever.py,sha256=hj4nAAl352EV_ANnU2OHJn7omCH3nBK82ydo14KqMH4,4353
-bisheng_langchain-0.3.4.dev3.dist-info/METADATA,sha256=
-bisheng_langchain-0.3.4.dev3.dist-info/WHEEL,sha256=
-bisheng_langchain-0.3.4.dev3.dist-info/top_level.txt,sha256=
-bisheng_langchain-0.3.4.dev3.dist-info/RECORD,,
+bisheng_langchain-0.3.5.dev1.dist-info/METADATA,sha256=Q20qBElwEheYunRPAoIvCRj8jH4RrXId03MA-SA6JnE,2476
+bisheng_langchain-0.3.5.dev1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+bisheng_langchain-0.3.5.dev1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
+bisheng_langchain-0.3.5.dev1.dist-info/RECORD,,
{bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/WHEEL
RENAMED
File without changes

{bisheng_langchain-0.3.4.dev3.dist-info → bisheng_langchain-0.3.5.dev1.dist-info}/top_level.txt
RENAMED
File without changes