bisheng-langchain 0.3.5.dev1__py3-none-any.whl → 0.3.6.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,8 +134,6 @@ class TrainsetGenerator:
134
134
  chunk_size: int = 1024,
135
135
  seed: int = 42,
136
136
  prompt: Optional[ChatPromptTemplate] = SEED_QUESTION_CHAT_PROMPT,
137
- filter_lowquality_context: bool = False,
138
- filter_lowquality_question: bool = False,
139
137
  answer_prompt: Optional[HumanMessagePromptTemplate] = ANSWER_FORMULATE,
140
138
  ) -> None:
141
139
  self.generator_llm = generator_llm
@@ -154,8 +152,6 @@ class TrainsetGenerator:
154
152
  self.threshold = 5.0
155
153
  self.rng = default_rng(seed)
156
154
  self.prompt = prompt
157
- self.filter_lowquality_context = filter_lowquality_context
158
- self.filter_lowquality_question = filter_lowquality_question
159
155
  if answer_prompt is None:
160
156
  answer_prompt = ANSWER_FORMULATE
161
157
  self.answer_prompt = answer_prompt
@@ -167,8 +163,6 @@ class TrainsetGenerator:
167
163
  chunk_size: int = 512,
168
164
  trainset_distribution: dict = DEFAULT_TRAIN_DISTRIBUTION,
169
165
  prompt: Optional[ChatPromptTemplate] = SEED_QUESTION_CHAT_PROMPT,
170
- filter_lowquality_context: bool = False,
171
- filter_lowquality_question: bool = False,
172
166
  answer_prompt: Optional[PromptTemplate] = ANSWER_FORMULATE,
173
167
  ):
174
168
  generator_llm = llm
@@ -179,8 +173,6 @@ class TrainsetGenerator:
179
173
  chunk_size=chunk_size,
180
174
  trainset_distribution=trainset_distribution,
181
175
  prompt=prompt,
182
- filter_lowquality_context=filter_lowquality_context,
183
- filter_lowquality_question=filter_lowquality_question,
184
176
  answer_prompt=answer_prompt,
185
177
  )
186
178
 
@@ -324,17 +316,14 @@ class TrainsetGenerator:
324
316
  )
325
317
 
326
318
  text_chunk = " ".join([node.get_content() for node in nodes])
327
- if self.filter_lowquality_context:
328
- score = self._filter_context(text_chunk)
329
- if not score:
330
- continue
319
+ score = self._filter_context(text_chunk)
320
+ if not score:
321
+ continue
331
322
  seed_question = self._seed_question(text_chunk)
332
323
 
333
324
  question = seed_question
334
- if self.filter_lowquality_question:
335
- is_valid_question = self._filter_question(question)
336
- else:
337
- is_valid_question = True
325
+ # is_valid_question = self._filter_question(question)
326
+ is_valid_question = True
338
327
  if is_valid_question:
339
328
  context = [text_chunk] * len(question.split("\n"))
340
329
  is_conv = len(context) > 1
@@ -372,8 +361,6 @@ class QAGenerationChainV2(Chain):
372
361
  llm: BaseLanguageModel,
373
362
  k: Optional[int] = None,
374
363
  chunk_size: int = 512,
375
- filter_lowquality_context: bool = False,
376
- filter_lowquality_question: bool = False,
377
364
  question_prompt: Optional[ChatPromptTemplate] = SEED_QUESTION_CHAT_PROMPT,
378
365
  answer_prompt: Optional[HumanMessagePromptTemplate] = ANSWER_FORMULATE,
379
366
  **kwargs: Any,
@@ -390,14 +377,8 @@ class QAGenerationChainV2(Chain):
390
377
  Returns:
391
378
  a QAGenerationChain class
392
379
  """
393
- generator = TrainsetGenerator.from_default(
394
- llm,
395
- chunk_size=chunk_size,
396
- prompt=question_prompt,
397
- answer_prompt=answer_prompt,
398
- filter_lowquality_context=filter_lowquality_context,
399
- filter_lowquality_question=filter_lowquality_question
400
- )
380
+ generator = TrainsetGenerator.from_default(llm, chunk_size=chunk_size, prompt=question_prompt,
381
+ answer_prompt=answer_prompt)
401
382
  return cls(documents=documents, generator=generator, k=k, **kwargs)
402
383
 
403
384
  @property
@@ -424,14 +405,14 @@ class QAGenerationChainV2(Chain):
424
405
  dataset = self.generator.generate(documents=self.documents, train_size=self.k)
425
406
  df = dataset.to_pandas()
426
407
  qa_pairs = df.to_dict("records")
427
- qa = []
408
+ qa = ''
428
409
  for pair in qa_pairs:
429
- qa.append({
430
- "question": pair["question"],
431
- "answer": pair["ground_truth"][0],
432
- "context": pair["ground_truth_context"][0],
433
- })
434
- qa = f'```json\n{json.dumps(qa, ensure_ascii=False, indent=4)}\n```'
410
+ qa += json.dumps(
411
+ {
412
+ "question": pair["question"],
413
+ "answer": pair["ground_truth"][0],
414
+ "context": pair["ground_truth_context"][0],
415
+ }, ensure_ascii=False)
435
416
  return {self.output_key: qa}
436
417
 
437
418
  async def _acall(
@@ -16,7 +16,7 @@ from langchain.text_splitter import TextSplitter
16
16
 
17
17
 
18
18
  class KeywordRetriever(BaseRetriever):
19
- keyword_store: VectorStore
19
+ keyword_store: ElasticKeywordsSearch
20
20
  text_splitter: TextSplitter
21
21
  search_type: str = 'similarity'
22
22
  search_kwargs: dict = Field(default_factory=dict)
@@ -13,6 +13,7 @@ from langchain.llms.base import BaseLLM
13
13
  from langchain.prompts.prompt import PromptTemplate
14
14
  from langchain.utils import get_from_dict_or_env
15
15
  from langchain.vectorstores.base import VectorStore
16
+ from loguru import logger
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from elasticsearch import Elasticsearch # noqa: F401
@@ -326,6 +327,49 @@ class ElasticKeywordsSearch(VectorStore, ABC):
326
327
  response = client.search(index=index_name, body={'query': script_query, 'size': size})
327
328
  return response
328
329
 
329
- def delete(self, **kwargs: Any) -> None:
330
+ def delete_index(self, **kwargs: Any) -> None:
330
331
  # TODO: Check if this can be done in bulk
331
332
  self.client.indices.delete(index=self.index_name)
333
+
334
+ def delete(
335
+ self,
336
+ ids: Optional[List[str]] = None,
337
+ refresh_indices: Optional[bool] = True,
338
+ **kwargs: Any,
339
+ ) -> Optional[bool]:
340
+ """Delete documents from the Elasticsearch index.
341
+
342
+ Args:
343
+ ids: List of ids of documents to delete.
344
+ refresh_indices: Whether to refresh the index
345
+ after deleting documents. Defaults to True.
346
+ """
347
+ try:
348
+ from elasticsearch.helpers import BulkIndexError, bulk
349
+ except ImportError:
350
+ raise ImportError('Could not import elasticsearch python package. '
351
+ 'Please install it with `pip install elasticsearch`.')
352
+
353
+ body = []
354
+
355
+ if ids is None:
356
+ raise ValueError('ids must be provided.')
357
+
358
+ for _id in ids:
359
+ body.append({'_op_type': 'delete', '_index': self.index_name, '_id': _id})
360
+
361
+ if len(body) > 0:
362
+ try:
363
+ bulk(self.client, body, refresh=refresh_indices, ignore_status=404)
364
+ logger.debug(f'Deleted {len(body)} texts from index')
365
+
366
+ return True
367
+ except BulkIndexError as e:
368
+ logger.error(f'Error deleting texts: {e}')
369
+ firstError = e.errors[0].get('index', {}).get('error', {})
370
+ logger.error(f"First error reason: {firstError.get('reason')}")
371
+ raise e
372
+
373
+ else:
374
+ logger.debug('No texts to delete from index')
375
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bisheng-langchain
3
- Version: 0.3.5.dev1
3
+ Version: 0.3.6.dev1
4
4
  Summary: bisheng langchain modules
5
5
  Home-page: https://github.com/dataelement/bisheng
6
6
  Author: DataElem
@@ -23,7 +23,7 @@ bisheng_langchain/chains/conversational_retrieval/__init__.py,sha256=47DEQpj8HBS
23
23
  bisheng_langchain/chains/conversational_retrieval/base.py,sha256=XiqBqov6No-wTVCou6qyMT5p2JQgoQI7OLQOYH8XUos,5313
24
24
  bisheng_langchain/chains/qa_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  bisheng_langchain/chains/qa_generation/base.py,sha256=VYGmLDB0bnlDQ6T8ivLP55wwFbMo9HOzlPEDUuRx5fU,4148
26
- bisheng_langchain/chains/qa_generation/base_v2.py,sha256=2F2kGe3ermJraQu4oC-m8vm_ENBy_Zi4uHrJDcSOeJw,15460
26
+ bisheng_langchain/chains/qa_generation/base_v2.py,sha256=ZtHEuNFwbE9txCGR3wx0oDAoj9V6bAxi3GXF8Z78cqQ,14580
27
27
  bisheng_langchain/chains/qa_generation/prompt.py,sha256=4eJk9aDUYDN1qaaYRPy9EobCIncnwS8BbQaDFzzePtM,1944
28
28
  bisheng_langchain/chains/qa_generation/prompt_v2.py,sha256=sQLanA_iOnLqrUIwzfTOTANt-1vJ44CM54HFDU8Jo1Q,8938
29
29
  bisheng_langchain/chains/question_answering/__init__.py,sha256=_gOZMc-SWprK6xc-Jj64jcr9nc-G4YkZbEYwfJNq_bY,8795
@@ -126,7 +126,7 @@ bisheng_langchain/rag/config/baseline_s2b_mix.yaml,sha256=rkPfzU2-mvjRrZ0zMHaQsn
126
126
  bisheng_langchain/rag/config/baseline_v2.yaml,sha256=RP-DwIRIS_ZK8ixbXi2Z28rKqHD56pWmr2o2WWIwq3Y,2382
127
127
  bisheng_langchain/rag/init_retrievers/__init__.py,sha256=qpLLAuqZPtumTlJj17Ie5AbDDmiUiDxYefg_pumqu-c,218
128
128
  bisheng_langchain/rag/init_retrievers/baseline_vector_retriever.py,sha256=oRKZZpxlLQAtsubIcAXeXpf1a9h6Pt6uOtNTLeD2jps,2362
129
- bisheng_langchain/rag/init_retrievers/keyword_retriever.py,sha256=Da4Q5BrfN0GckJaeAgPYMlzQAp9ll7ZGGyvs7OdCQ5c,2513
129
+ bisheng_langchain/rag/init_retrievers/keyword_retriever.py,sha256=NRT0fBx6HFR7j9IbRl_NBuqF7hnL-9v5GCqHpgnrfPQ,2523
130
130
  bisheng_langchain/rag/init_retrievers/mix_retriever.py,sha256=Whxq4kjNPLsxnHcVo60usdFFwLTCD-1jO38q08LXkVQ,4653
131
131
  bisheng_langchain/rag/init_retrievers/smaller_chunks_retriever.py,sha256=RQ7QLEOOhBrkw-EimXVJqIGa96D-KkNDik2h9hzg9fU,3805
132
132
  bisheng_langchain/rag/prompts/__init__.py,sha256=IUCq9gzqGQN_6IDk0D_F5t3mOUI_KbmSzYnnXoX4VKE,223
@@ -150,10 +150,10 @@ bisheng_langchain/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
150
150
  bisheng_langchain/utils/azure_dalle_image_generator.py,sha256=96-_nO4hDSwyPE4rSYop5SgJ-U9CE2un4bTdW0E5RGU,6582
151
151
  bisheng_langchain/utils/requests.py,sha256=vWGKyNTxApVeaVdKxqACfIT1Q8wMy-jC3kUv2Ce9Mzc,8688
152
152
  bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHnqrsjx700Fy11M,213
153
- bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=Pm1rS50GJ0HWbjBsFDgs28SVuVbjGSRPOor6yJlnE7w,13347
153
+ bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=inZarhahRaesrvLqyeRCMQvHGAASY53opEVA0_o8S14,14901
154
154
  bisheng_langchain/vectorstores/milvus.py,sha256=xh7NokraKg_Xc9ofz0RVfJ_I36ftnprLJtV-1NfaeyQ,37162
155
155
  bisheng_langchain/vectorstores/retriever.py,sha256=hj4nAAl352EV_ANnU2OHJn7omCH3nBK82ydo14KqMH4,4353
156
- bisheng_langchain-0.3.5.dev1.dist-info/METADATA,sha256=Q20qBElwEheYunRPAoIvCRj8jH4RrXId03MA-SA6JnE,2476
157
- bisheng_langchain-0.3.5.dev1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
158
- bisheng_langchain-0.3.5.dev1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
159
- bisheng_langchain-0.3.5.dev1.dist-info/RECORD,,
156
+ bisheng_langchain-0.3.6.dev1.dist-info/METADATA,sha256=KG32YRknnVoAxFzVKE_qMMQBjbhZen046fXQYyhXQvs,2476
157
+ bisheng_langchain-0.3.6.dev1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
158
+ bisheng_langchain-0.3.6.dev1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
159
+ bisheng_langchain-0.3.6.dev1.dist-info/RECORD,,