bisheng-langchain 0.3.4.dev2__py3-none-any.whl → 0.3.5.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,7 +34,10 @@ def merge_partitions(partitions):
         elif label == 'Table':
             doc_content.append('\n\n' + text)
         else:
-            doc_content.append(text_elem_sep + text)
+            if last_label == 'Table':
+                doc_content.append(text_elem_sep * 2 + text)
+            else:
+                doc_content.append(text_elem_sep + text)
 
         last_label = label
         metadata['bboxes'].extend(list(map(lambda x: list(map(int, x)), extra_data['bboxes'])))
@@ -73,6 +76,7 @@ class ElemUnstructuredLoader(BasePDFLoader):
         self.start = start
         self.n = n
         self.extra_kwargs = kwargs
+        self.partitions = None
         super().__init__(file_path)
 
     def load(self) -> List[Document]:
@@ -93,18 +97,33 @@ class ElemUnstructuredLoader(BasePDFLoader):
         resp = resp.json()
         if 200 != resp.get('status_code'):
             logger.info(f'file partition {os.path.basename(self.file_name)} error resp={resp}')
+            raise Exception(f'file partition error {os.path.basename(self.file_name)} error resp={resp}')
         partitions = resp['partitions']
-        if not partitions:
-            logger.info(f'partition_error resp={resp}')
+        if partitions:
+            logger.info(f'content_from_partitions')
+            self.partitions = partitions
+            content, metadata = merge_partitions(partitions)
+        elif resp.get('text'):
+            logger.info(f'content_from_text')
+            content = resp['text']
+            metadata = {
+                "bboxes": [],
+                "pages": [],
+                "indexes": [],
+                "types": [],
+            }
+        else:
+            logger.warning(f'content_is_empty resp={resp}')
+            content = ''
+            metadata = {}
+
         logger.info(f'unstruct_return code={resp.get("status_code")}')
 
         if resp.get('b64_pdf'):
             with open(self.file_path, 'wb') as f:
                 f.write(base64.b64decode(resp['b64_pdf']))
 
-        content, metadata = merge_partitions(partitions)
         metadata['source'] = self.file_name
-
         doc = Document(page_content=content, metadata=metadata)
         return [doc]
 
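The load() hunk above changes how the parsing service's JSON response is consumed: a non-200 status_code in the body now raises instead of only logging, populated partitions are merged as before, and plain resp['text'] with empty layout metadata is used as a fallback when no partitions come back. A minimal sketch of just that branch order (the response keys are taken from the hunk; the helper name and placeholder values are illustrative, not the package's API):

    def pick_content(resp: dict):
        # mirrors the partitions -> text -> empty fallback order used by load()
        if resp.get('partitions'):
            # the real loader calls merge_partitions(resp['partitions']) here
            return 'merged partition text', {'note': 'bbox/page metadata from merge_partitions'}
        if resp.get('text'):
            return resp['text'], {'bboxes': [], 'pages': [], 'indexes': [], 'types': []}
        return '', {}

    print(pick_content({'text': 'plain text only'}))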
 
@@ -145,7 +145,10 @@ class StockInfo(APIToolBase):
         if resp.status_code != 200:
             logger.info('api_call_fail res={}', resp.text)
         k_data = resp.text
-        data_array = json.loads(kLinePattern.search(k_data).group(1))
+        k_data = kLinePattern.search(k_data)
+        if not k_data:
+            return '{}'
+        data_array = json.loads(k_data.group(1))
         for item in data_array:
             if item.get('day') == date:
                 return json.dumps(item)
@@ -173,7 +176,10 @@ class StockInfo(APIToolBase):
         count = datetime.today() - date_obj
         url = self.url.format(stockName=stock_number, stock=stock, count=count.days)
         k_data = await self.async_client.aget(url)
-        data_array = json.loads(kLinePattern.search(k_data).group(1))
+        k_data = kLinePattern.search(k_data)
+        if not k_data:
+            return '{}'
+        data_array = json.loads(k_data.group(1))
         for item in data_array:
             if item.get('day') == date:
                 return json.dumps(item)
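
Both StockInfo hunks apply the same guard: if the K-line regex finds nothing in the Sina response, the tool now returns an empty JSON object instead of raising AttributeError on None.group(1). A small sketch of the guarded extraction (the regex below is a stand-in, not the module's actual kLinePattern):

    import json
    import re

    kLinePattern = re.compile(r'=\s*(\[.*\])', re.S)  # stand-in pattern for illustration

    def first_kline(raw: str) -> str:
        match = kLinePattern.search(raw)
        if not match:
            return '{}'  # same fallback as the patched tool
        data_array = json.loads(match.group(1))
        return json.dumps(data_array[0]) if data_array else '{}'

    print(first_kline('var hq = [{"day": "2024-05-06", "close": "10.00"}]'))
    print(first_kline('unexpected response'))  # -> {}
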
@@ -21,17 +21,22 @@ logger = logging.getLogger(__name__)
 
 
 def _split_text_with_regex(
-    text: str, separator: str, keep_separator: bool
+    text: str, separator: str, keep_separator: bool, separator_rule: str
 ) -> List[str]:
     # Now that we have the separator, split the text
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
             _splits = re.split(f'({separator})', text)
-            splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
-            if len(_splits) % 2 == 0:
-                splits += _splits[-1:]
-            splits = [_splits[0]] + splits
+
+            if separator_rule == "before":
+                splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+                if len(_splits) % 2 == 0:
+                    splits += _splits[-1:]
+                splits = [_splits[0]] + splits
+            else:
+                splits = [_splits[i-1] + _splits[i] for i in range(1, len(_splits), 2)]
+                splits = splits + [_splits[-1]]
         else:
             splits = re.split(separator, text)
     else:
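
The new separator_rule argument controls which side of the cut the separator sticks to when keep_separator is true: "before" reproduces the old behaviour (the separator opens the following piece), any other value closes the preceding piece with it. A self-contained sketch of the two branches, assuming keep_separator=True (the wrapper name and sample input are illustrative; the two list comprehensions come from the hunk):

    import re
    from typing import List

    def split_with_rule(text: str, separator: str, separator_rule: str) -> List[str]:
        _splits = re.split(f'({separator})', text)
        if separator_rule == 'before':  # old behaviour: separator starts the next piece
            splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
            if len(_splits) % 2 == 0:
                splits += _splits[-1:]
            splits = [_splits[0]] + splits
        else:  # 'after': separator ends the previous piece
            splits = [_splits[i - 1] + _splits[i] for i in range(1, len(_splits), 2)]
            splits = splits + [_splits[-1]]
        return splits

    print(split_with_rule('a.b.c', re.escape('.'), 'before'))  # ['a', '.b', '.c']
    print(split_with_rule('a.b.c', re.escape('.'), 'after'))   # ['a.', 'b.', 'c']
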
@@ -82,11 +87,14 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
     """
     todo
     """
+
     def __init__(
-        self,
-        separators: Optional[List[str]] = None,
-        keep_separator: bool = True,
-        **kwargs: Any,
+            self,
+            separators: Optional[List[str]] = None,
+            separator_rule: Optional[List[str]] = None,
+            is_separator_regex: bool = False,
+            keep_separator: bool = True,
+            **kwargs: Any,
     ) -> None:
         """Create a new TextSplitter."""
         super().__init__(
@@ -95,7 +103,9 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
             **kwargs
         )
         self._separators = separators or ['\n\n', '\n', ' ', '']
-        self._is_separator_regex = False
+        self._separator_rule = separator_rule or ['after' for _ in range(4)]
+        self.separator_rule = {one: self._separator_rule[index] for index, one in enumerate(separators)}
+        self._is_separator_regex = is_separator_regex
 
     def split_documents(self, documents: Iterable[Document]) -> List[Document]:
         texts, metadatas = [], []
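
With the constructor changes, callers can pass one rule per separator and opt into regex separators instead of having is_separator_regex pinned to False. A hedged usage sketch (the separators, rules and chunk sizes below are illustrative values, not defaults shipped by the package):

    from bisheng_langchain.text_splitter import ElemCharacterTextSplitter

    splitter = ElemCharacterTextSplitter(
        separators=['\n\n', '\n', '。', ' '],
        separator_rule=['after', 'after', 'after', 'before'],  # one rule per separator
        is_separator_regex=False,
        chunk_size=500,      # forwarded to RecursiveCharacterTextSplitter via **kwargs
        chunk_overlap=50,
    )
    chunks = splitter.split_text('第一段。第二段。\n\n第三段。')

Note that the new __init__ builds a per-separator dict directly from the separators argument, so separators should be passed explicitly whenever separator_rule is used, with the two lists aligned in length and order.
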
@@ -110,19 +120,21 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
         final_chunks = []
         # Get appropriate separator to use
         separator = separators[-1]
+        separator_rule = 'after'
         new_separators = []
         for i, _s in enumerate(separators):
             _separator = _s if self._is_separator_regex else re.escape(_s)
+            separator_rule = self.separator_rule[_s]
             if _s == '':
                 separator = _s
                 break
             if re.search(_separator, text):
                 separator = _s
-                new_separators = separators[i + 1 :]
+                new_separators = separators[i + 1:]
                 break
 
         _separator = separator if self._is_separator_regex else re.escape(separator)
-        splits = _split_text_with_regex(text, _separator, self._keep_separator)
+        splits = _split_text_with_regex(text, _separator, self._keep_separator, separator_rule)
 
         # Now go merging things, recursively splitting longer texts.
         _good_splits = []
@@ -149,60 +161,60 @@ class ElemCharacterTextSplitter(RecursiveCharacterTextSplitter):
         return self._split_text(text, self._separators)
 
     def create_documents(
-        self, texts: List[str], metadatas: Optional[List[dict]] = None
+            self, texts: List[str], metadatas: Optional[List[dict]] = None
     ) -> List[Document]:
         """Create documents from a list of texts."""
         documents = []
         for i, text in enumerate(texts):
             index = -1
             # metadata = copy.deepcopy(_metadatas[i])
-            indexes = metadatas[i]['indexes']
-            pages = metadatas[i]['pages']
-            types = metadatas[i]['types']
-            bboxes = metadatas[i]['bboxes']
+            indexes = metadatas[i].get('indexes', [])
+            pages = metadatas[i].get('pages', [])
+            types = metadatas[i].get('types', [])
+            bboxes = metadatas[i].get('bboxes', [])
             searcher = IntervalSearch(indexes)
             split_texts = self.split_text(text)
             for chunk in split_texts:
                 new_metadata = copy.deepcopy(metadatas[i])
-                index = text.find(chunk, index + 1)
-                inter0 = [index, index + len(chunk) - 1]
-                norm_inter = searcher.find(inter0)
-                new_metadata['chunk_bboxes'] = []
-                for j in range(norm_inter[0], norm_inter[1] + 1):
-                    new_metadata['chunk_bboxes'].append(
-                        {'page': pages[j], 'bbox': bboxes[j]})
-
-                c = Counter([types[j] for j in norm_inter])
-                chunk_type = c.most_common(1)[0][0]
-                new_metadata['chunk_type'] = chunk_type
-                new_metadata['source'] = metadatas[i].get('source', '')
-
-
-                # for chunk in split_texts:
-                #     new_metadata = {}
-                #     new_metadata['chunk_type'] = metadata.get('chunk_type', 'paragraph')
-                #     new_metadata['bboxes'] = metadata.get('bboxes', [])
-                #     new_metadata['source'] = metadata.get('source', '')
-                #     # chunk's start index in text
-                #     index = text.find(chunk, index + 1)
-                #     new_metadata['start'] = metadata.get('start', 0) + index
-                #     new_metadata['end'] = metadata.get('start', 0) + index + len(chunk) - 1
-
-                #     if 'page' in metadata:
-                #         new_metadata['page'] = metadata['page'][new_metadata['start']:new_metadata['end']+1]
-                #     if 'token_to_bbox' in metadata:
-                #         new_metadata['token_to_bbox'] = metadata['token_to_bbox'][new_metadata['start']:new_metadata['end']+1]
-
-                #     if 'page' in new_metadata and 'token_to_bbox' in new_metadata:
-                #         box_no_duplicates = set()
-                #         for index in range(len(new_metadata['page'])):
-                #             box_no_duplicates.add(
-                #                 (new_metadata['page'][index], new_metadata['token_to_bbox'][index]))
-
-                #         new_metadata['chunk_bboxes'] = []
-                #         for elem in box_no_duplicates:
-                #             new_metadata['chunk_bboxes'].append(
-                #                 {'page': elem[0], 'bbox': new_metadata['bboxes'][elem[1]]})
+                if indexes and bboxes:
+                    index = text.find(chunk, index + 1)
+                    inter0 = [index, index + len(chunk) - 1]
+                    norm_inter = searcher.find(inter0)
+                    new_metadata['chunk_bboxes'] = []
+                    for j in range(norm_inter[0], norm_inter[1] + 1):
+                        new_metadata['chunk_bboxes'].append(
+                            {'page': pages[j], 'bbox': bboxes[j]})
+
+                    c = Counter([types[j] for j in norm_inter])
+                    chunk_type = c.most_common(1)[0][0]
+                    new_metadata['chunk_type'] = chunk_type
+                    new_metadata['source'] = metadatas[i].get('source', '')
+
+                # for chunk in split_texts:
+                #     new_metadata = {}
+                #     new_metadata['chunk_type'] = metadata.get('chunk_type', 'paragraph')
+                #     new_metadata['bboxes'] = metadata.get('bboxes', [])
+                #     new_metadata['source'] = metadata.get('source', '')
+                #     # chunk's start index in text
+                #     index = text.find(chunk, index + 1)
+                #     new_metadata['start'] = metadata.get('start', 0) + index
+                #     new_metadata['end'] = metadata.get('start', 0) + index + len(chunk) - 1
+
+                #     if 'page' in metadata:
+                #         new_metadata['page'] = metadata['page'][new_metadata['start']:new_metadata['end']+1]
+                #     if 'token_to_bbox' in metadata:
+                #         new_metadata['token_to_bbox'] = metadata['token_to_bbox'][new_metadata['start']:new_metadata['end']+1]
+
+                #     if 'page' in new_metadata and 'token_to_bbox' in new_metadata:
+                #         box_no_duplicates = set()
+                #         for index in range(len(new_metadata['page'])):
+                #             box_no_duplicates.add(
+                #                 (new_metadata['page'][index], new_metadata['token_to_bbox'][index]))
+
+                #         new_metadata['chunk_bboxes'] = []
+                #         for elem in box_no_duplicates:
+                #             new_metadata['chunk_bboxes'].append(
+                #                 {'page': elem[0], 'bbox': new_metadata['bboxes'][elem[1]]})
 
                 new_doc = Document(page_content=chunk, metadata=new_metadata)
                 documents.append(new_doc)
@@ -10,6 +10,7 @@ from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.utils import maximal_marginal_relevance
 from langchain_community.vectorstores.milvus import Milvus as MilvusLangchain
+from pymilvus.exceptions import ConnectionNotExistException
 
 logger = logging.getLogger(__name__)
 
@@ -231,7 +232,7 @@ class Milvus(MilvusLangchain):
         from pymilvus import connections
         connections.remove_connection(using)
 
-    def _create_connection_alias(self, connection_args: dict) -> str:
+    def _create_connection_alias(self, connection_args: dict, personal_alias: str = None) -> str:
         """Create the connection to the Milvus server."""
         from pymilvus import MilvusException, connections
 
@@ -269,7 +270,10 @@ class Milvus(MilvusLangchain):
             return con[0]
 
         # Generate a new connection if one doesn't exist
-        alias = uuid4().hex
+        if personal_alias:
+            alias = personal_alias
+        else:
+            alias = uuid4().hex
         try:
             connections.connect(alias=alias, **connection_args)
             logger.debug('Created new connection using: %s', alias)
@@ -522,7 +526,14 @@ class Milvus(MilvusLangchain):
             insert_list = [insert_dict[x][i:end] for x in self.fields if x in insert_dict]
             # Insert into the collection.
             try:
-                res: Collection
+                res = self.col.insert(insert_list, timeout=timeout, **kwargs)
+                pks.extend(res.primary_keys)
+            except ConnectionNotExistException as e:
+                logger.warning("retrying connection to milvus")
+                # reconnect to milvus
+                self._create_connection_alias(self.connection_args, self.alias)
+
+                # insert data
                 res = self.col.insert(insert_list, timeout=timeout, **kwargs)
                 pks.extend(res.primary_keys)
             except MilvusException as e:
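
The insert path now retries once when pymilvus reports that the cached connection alias no longer exists, rebuilding the alias through the new personal_alias parameter of _create_connection_alias. A minimal sketch of that pattern, assuming an object exposing the same col, connection_args and alias attributes used in the hunk (the helper itself is illustrative):

    from pymilvus.exceptions import ConnectionNotExistException

    def insert_with_reconnect(store, insert_list, timeout=None, **kwargs):
        try:
            res = store.col.insert(insert_list, timeout=timeout, **kwargs)
        except ConnectionNotExistException:
            # the alias was dropped (e.g. the connection was cleaned up elsewhere):
            # recreate it under the same name, then retry the insert once
            store._create_connection_alias(store.connection_args, store.alias)
            res = store.col.insert(insert_list, timeout=timeout, **kwargs)
        return list(res.primary_keys)
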
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bisheng-langchain
-Version: 0.3.4.dev2
+Version: 0.3.5.dev1
 Summary: bisheng langchain modules
 Home-page: https://github.com/dataelement/bisheng
 Author: DataElem
@@ -30,7 +30,7 @@ Requires-Dist: shapely==2.0.2
 Requires-Dist: filetype==1.2.0
 Requires-Dist: langgraph==0.0.50
 Requires-Dist: openai==1.14.3
-Requires-Dist: langchain-openai==0.1.0
+Requires-Dist: langchain-openai==0.1.5
 Requires-Dist: llama-index==0.9.48
 Requires-Dist: bisheng-ragas==1.0.0
 
@@ -1,5 +1,5 @@
 bisheng_langchain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-bisheng_langchain/text_splitter.py,sha256=8snY_Fojh-A1EEGXBSEqCh0N77KQc_dqsgeptAlf344,7934
+bisheng_langchain/text_splitter.py,sha256=yYpzMa0c1tRZNGyuFFJsLvSeqberYHV2-BR28pQim8I,8794
 bisheng_langchain/agents/__init__.py,sha256=ctsKj77fS8qlkhz_9sS_AhCjFvFNxEpJ9KBYVrApLRg,226
 bisheng_langchain/agents/chatglm_functions_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/agents/chatglm_functions_agent/base.py,sha256=tyytq0XIFXpfxDP0s5QKeprKOunMqi1fHMfQ0-kOmDE,13674
@@ -54,7 +54,7 @@ bisheng_langchain/document_loaders/custom_kv.py,sha256=xWUPhcr1hjbdya4zgEHG4Fl0s
 bisheng_langchain/document_loaders/elem_html.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/document_loaders/elem_image.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/document_loaders/elem_pdf.py,sha256=K-TXILGNFLFjavhun_MFbUF4t2_WGA3Z-kbnr75lmW8,22243
-bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=FtoyfmE85CwZuMvr52_bqcHQCgypKCWMGwLZrzgQYbY,7353
+bisheng_langchain/document_loaders/elem_unstrcutured_loader.py,sha256=JW87AhzCY_KS_YYszyxU3GgPjxP4vWOHDfifJEpP5CI,8055
 bisheng_langchain/document_loaders/universal_kv.py,sha256=ZdIgFIc2fH2kkvJNb7j2wi6FLS_PaaatVy6z_YNV2hw,4114
 bisheng_langchain/document_loaders/parsers/__init__.py,sha256=OOM_FJkwaU-zNS58fASw0TH8FNT6VXKb0VrvisgdrII,171
 bisheng_langchain/document_loaders/parsers/ellm_client.py,sha256=Y_CRYwBr-gFArOirF1b76KyI5N8eVpsLeDiIsKtYkpU,1641
@@ -93,7 +93,7 @@ bisheng_langchain/gpts/tools/api_tools/base.py,sha256=fWQSDIOVb4JZrtJ9ML9q2ycsAa
 bisheng_langchain/gpts/tools/api_tools/flow.py,sha256=ot2YAYgQGWgUpb2nCECAmpqHY6m0SgzwkupF9kDT3lU,2461
 bisheng_langchain/gpts/tools/api_tools/macro_data.py,sha256=FyG-qtl2ECS1CDKt6olN0eDTDM91d-UvDkMDBiVLgYQ,27429
 bisheng_langchain/gpts/tools/api_tools/openapi.py,sha256=CzKt9FRkgngBcWgabD4emPqAXkAgagkD-pMjG680MTE,3903
-bisheng_langchain/gpts/tools/api_tools/sina.py,sha256=GGA4ZYvNEpqBZ_l8MUYqgkI8xZe9XcGa9-KlHZVqr6I,9542
+bisheng_langchain/gpts/tools/api_tools/sina.py,sha256=4KpK7_HUUtjpdJ-K4LjPlb-occyAZcRtmmCWqJ2BotE,9708
 bisheng_langchain/gpts/tools/api_tools/tianyancha.py,sha256=abDAz-yAH1-2rKiSmZ6TgnrNUnpgAZpDY8oDiWfWapc,6684
 bisheng_langchain/gpts/tools/bing_search/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bisheng_langchain/gpts/tools/bing_search/tool.py,sha256=v_VlqcMplITA5go5qWA4qZ5p43E1-1s0bzmyY7H0hqY,1710
@@ -151,9 +151,9 @@ bisheng_langchain/utils/azure_dalle_image_generator.py,sha256=96-_nO4hDSwyPE4rSY
 bisheng_langchain/utils/requests.py,sha256=vWGKyNTxApVeaVdKxqACfIT1Q8wMy-jC3kUv2Ce9Mzc,8688
 bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHnqrsjx700Fy11M,213
 bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=Pm1rS50GJ0HWbjBsFDgs28SVuVbjGSRPOor6yJlnE7w,13347
-bisheng_langchain/vectorstores/milvus.py,sha256=8HHbIxoSbLYDFlFJSfmjLOfqGpOSZd24iVYWSYz3TX0,36637
+bisheng_langchain/vectorstores/milvus.py,sha256=xh7NokraKg_Xc9ofz0RVfJ_I36ftnprLJtV-1NfaeyQ,37162
 bisheng_langchain/vectorstores/retriever.py,sha256=hj4nAAl352EV_ANnU2OHJn7omCH3nBK82ydo14KqMH4,4353
-bisheng_langchain-0.3.4.dev2.dist-info/METADATA,sha256=0CtYk6qb9LYHkglPIGNWk4Q7tujIsOHNM836_CtwZqs,2476
-bisheng_langchain-0.3.4.dev2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-bisheng_langchain-0.3.4.dev2.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
-bisheng_langchain-0.3.4.dev2.dist-info/RECORD,,
+bisheng_langchain-0.3.5.dev1.dist-info/METADATA,sha256=Q20qBElwEheYunRPAoIvCRj8jH4RrXId03MA-SA6JnE,2476
+bisheng_langchain-0.3.5.dev1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+bisheng_langchain-0.3.5.dev1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
+bisheng_langchain-0.3.5.dev1.dist-info/RECORD,,