bisheng-langchain 0.3.5.dev1__py3-none-any.whl → 0.3.6.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bisheng_langchain/chains/qa_generation/base_v2.py +14 -33
- bisheng_langchain/rag/init_retrievers/keyword_retriever.py +1 -1
- bisheng_langchain/vectorstores/elastic_keywords_search.py +45 -1
- {bisheng_langchain-0.3.5.dev1.dist-info → bisheng_langchain-0.3.6.dev1.dist-info}/METADATA +1 -1
- {bisheng_langchain-0.3.5.dev1.dist-info → bisheng_langchain-0.3.6.dev1.dist-info}/RECORD +7 -7
- {bisheng_langchain-0.3.5.dev1.dist-info → bisheng_langchain-0.3.6.dev1.dist-info}/WHEEL +0 -0
- {bisheng_langchain-0.3.5.dev1.dist-info → bisheng_langchain-0.3.6.dev1.dist-info}/top_level.txt +0 -0
@@ -134,8 +134,6 @@ class TrainsetGenerator:
|
|
134
134
|
chunk_size: int = 1024,
|
135
135
|
seed: int = 42,
|
136
136
|
prompt: Optional[ChatPromptTemplate] = SEED_QUESTION_CHAT_PROMPT,
|
137
|
-
filter_lowquality_context: bool = False,
|
138
|
-
filter_lowquality_question: bool = False,
|
139
137
|
answer_prompt: Optional[HumanMessagePromptTemplate] = ANSWER_FORMULATE,
|
140
138
|
) -> None:
|
141
139
|
self.generator_llm = generator_llm
|
@@ -154,8 +152,6 @@ class TrainsetGenerator:
|
|
154
152
|
self.threshold = 5.0
|
155
153
|
self.rng = default_rng(seed)
|
156
154
|
self.prompt = prompt
|
157
|
-
self.filter_lowquality_context = filter_lowquality_context
|
158
|
-
self.filter_lowquality_question = filter_lowquality_question
|
159
155
|
if answer_prompt is None:
|
160
156
|
answer_prompt = ANSWER_FORMULATE
|
161
157
|
self.answer_prompt = answer_prompt
|
@@ -167,8 +163,6 @@ class TrainsetGenerator:
|
|
167
163
|
chunk_size: int = 512,
|
168
164
|
trainset_distribution: dict = DEFAULT_TRAIN_DISTRIBUTION,
|
169
165
|
prompt: Optional[ChatPromptTemplate] = SEED_QUESTION_CHAT_PROMPT,
|
170
|
-
filter_lowquality_context: bool = False,
|
171
|
-
filter_lowquality_question: bool = False,
|
172
166
|
answer_prompt: Optional[PromptTemplate] = ANSWER_FORMULATE,
|
173
167
|
):
|
174
168
|
generator_llm = llm
|
@@ -179,8 +173,6 @@ class TrainsetGenerator:
|
|
179
173
|
chunk_size=chunk_size,
|
180
174
|
trainset_distribution=trainset_distribution,
|
181
175
|
prompt=prompt,
|
182
|
-
filter_lowquality_context=filter_lowquality_context,
|
183
|
-
filter_lowquality_question=filter_lowquality_question,
|
184
176
|
answer_prompt=answer_prompt,
|
185
177
|
)
|
186
178
|
|
@@ -324,17 +316,14 @@ class TrainsetGenerator:
|
|
324
316
|
)
|
325
317
|
|
326
318
|
text_chunk = " ".join([node.get_content() for node in nodes])
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
continue
|
319
|
+
score = self._filter_context(text_chunk)
|
320
|
+
if not score:
|
321
|
+
continue
|
331
322
|
seed_question = self._seed_question(text_chunk)
|
332
323
|
|
333
324
|
question = seed_question
|
334
|
-
|
335
|
-
|
336
|
-
else:
|
337
|
-
is_valid_question = True
|
325
|
+
# is_valid_question = self._filter_question(question)
|
326
|
+
is_valid_question = True
|
338
327
|
if is_valid_question:
|
339
328
|
context = [text_chunk] * len(question.split("\n"))
|
340
329
|
is_conv = len(context) > 1
|
@@ -372,8 +361,6 @@ class QAGenerationChainV2(Chain):
|
|
372
361
|
llm: BaseLanguageModel,
|
373
362
|
k: Optional[int] = None,
|
374
363
|
chunk_size: int = 512,
|
375
|
-
filter_lowquality_context: bool = False,
|
376
|
-
filter_lowquality_question: bool = False,
|
377
364
|
question_prompt: Optional[ChatPromptTemplate] = SEED_QUESTION_CHAT_PROMPT,
|
378
365
|
answer_prompt: Optional[HumanMessagePromptTemplate] = ANSWER_FORMULATE,
|
379
366
|
**kwargs: Any,
|
@@ -390,14 +377,8 @@ class QAGenerationChainV2(Chain):
|
|
390
377
|
Returns:
|
391
378
|
a QAGenerationChain class
|
392
379
|
"""
|
393
|
-
generator = TrainsetGenerator.from_default(
|
394
|
-
|
395
|
-
chunk_size=chunk_size,
|
396
|
-
prompt=question_prompt,
|
397
|
-
answer_prompt=answer_prompt,
|
398
|
-
filter_lowquality_context=filter_lowquality_context,
|
399
|
-
filter_lowquality_question=filter_lowquality_question
|
400
|
-
)
|
380
|
+
generator = TrainsetGenerator.from_default(llm, chunk_size=chunk_size, prompt=question_prompt,
|
381
|
+
answer_prompt=answer_prompt)
|
401
382
|
return cls(documents=documents, generator=generator, k=k, **kwargs)
|
402
383
|
|
403
384
|
@property
|
@@ -424,14 +405,14 @@ class QAGenerationChainV2(Chain):
|
|
424
405
|
dataset = self.generator.generate(documents=self.documents, train_size=self.k)
|
425
406
|
df = dataset.to_pandas()
|
426
407
|
qa_pairs = df.to_dict("records")
|
427
|
-
qa =
|
408
|
+
qa = ''
|
428
409
|
for pair in qa_pairs:
|
429
|
-
qa.
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
410
|
+
qa += json.dumps(
|
411
|
+
{
|
412
|
+
"question": pair["question"],
|
413
|
+
"answer": pair["ground_truth"][0],
|
414
|
+
"context": pair["ground_truth_context"][0],
|
415
|
+
}, ensure_ascii=False)
|
435
416
|
return {self.output_key: qa}
|
436
417
|
|
437
418
|
async def _acall(
|
@@ -16,7 +16,7 @@ from langchain.text_splitter import TextSplitter
|
|
16
16
|
|
17
17
|
|
18
18
|
class KeywordRetriever(BaseRetriever):
|
19
|
-
keyword_store:
|
19
|
+
keyword_store: ElasticKeywordsSearch
|
20
20
|
text_splitter: TextSplitter
|
21
21
|
search_type: str = 'similarity'
|
22
22
|
search_kwargs: dict = Field(default_factory=dict)
|
@@ -13,6 +13,7 @@ from langchain.llms.base import BaseLLM
|
|
13
13
|
from langchain.prompts.prompt import PromptTemplate
|
14
14
|
from langchain.utils import get_from_dict_or_env
|
15
15
|
from langchain.vectorstores.base import VectorStore
|
16
|
+
from loguru import logger
|
16
17
|
|
17
18
|
if TYPE_CHECKING:
|
18
19
|
from elasticsearch import Elasticsearch # noqa: F401
|
@@ -326,6 +327,49 @@ class ElasticKeywordsSearch(VectorStore, ABC):
|
|
326
327
|
response = client.search(index=index_name, body={'query': script_query, 'size': size})
|
327
328
|
return response
|
328
329
|
|
329
|
-
def
|
330
|
+
def delete_index(self, **kwargs: Any) -> None:
|
330
331
|
# TODO: Check if this can be done in bulk
|
331
332
|
self.client.indices.delete(index=self.index_name)
|
333
|
+
|
334
|
+
def delete(
|
335
|
+
self,
|
336
|
+
ids: Optional[List[str]] = None,
|
337
|
+
refresh_indices: Optional[bool] = True,
|
338
|
+
**kwargs: Any,
|
339
|
+
) -> Optional[bool]:
|
340
|
+
"""Delete documents from the Elasticsearch index.
|
341
|
+
|
342
|
+
Args:
|
343
|
+
ids: List of ids of documents to delete.
|
344
|
+
refresh_indices: Whether to refresh the index
|
345
|
+
after deleting documents. Defaults to True.
|
346
|
+
"""
|
347
|
+
try:
|
348
|
+
from elasticsearch.helpers import BulkIndexError, bulk
|
349
|
+
except ImportError:
|
350
|
+
raise ImportError('Could not import elasticsearch python package. '
|
351
|
+
'Please install it with `pip install elasticsearch`.')
|
352
|
+
|
353
|
+
body = []
|
354
|
+
|
355
|
+
if ids is None:
|
356
|
+
raise ValueError('ids must be provided.')
|
357
|
+
|
358
|
+
for _id in ids:
|
359
|
+
body.append({'_op_type': 'delete', '_index': self.index_name, '_id': _id})
|
360
|
+
|
361
|
+
if len(body) > 0:
|
362
|
+
try:
|
363
|
+
bulk(self.client, body, refresh=refresh_indices, ignore_status=404)
|
364
|
+
logger.debug(f'Deleted {len(body)} texts from index')
|
365
|
+
|
366
|
+
return True
|
367
|
+
except BulkIndexError as e:
|
368
|
+
logger.error(f'Error deleting texts: {e}')
|
369
|
+
firstError = e.errors[0].get('index', {}).get('error', {})
|
370
|
+
logger.error(f"First error reason: {firstError.get('reason')}")
|
371
|
+
raise e
|
372
|
+
|
373
|
+
else:
|
374
|
+
logger.debug('No texts to delete from index')
|
375
|
+
return False
|
@@ -23,7 +23,7 @@ bisheng_langchain/chains/conversational_retrieval/__init__.py,sha256=47DEQpj8HBS
|
|
23
23
|
bisheng_langchain/chains/conversational_retrieval/base.py,sha256=XiqBqov6No-wTVCou6qyMT5p2JQgoQI7OLQOYH8XUos,5313
|
24
24
|
bisheng_langchain/chains/qa_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
bisheng_langchain/chains/qa_generation/base.py,sha256=VYGmLDB0bnlDQ6T8ivLP55wwFbMo9HOzlPEDUuRx5fU,4148
|
26
|
-
bisheng_langchain/chains/qa_generation/base_v2.py,sha256=
|
26
|
+
bisheng_langchain/chains/qa_generation/base_v2.py,sha256=ZtHEuNFwbE9txCGR3wx0oDAoj9V6bAxi3GXF8Z78cqQ,14580
|
27
27
|
bisheng_langchain/chains/qa_generation/prompt.py,sha256=4eJk9aDUYDN1qaaYRPy9EobCIncnwS8BbQaDFzzePtM,1944
|
28
28
|
bisheng_langchain/chains/qa_generation/prompt_v2.py,sha256=sQLanA_iOnLqrUIwzfTOTANt-1vJ44CM54HFDU8Jo1Q,8938
|
29
29
|
bisheng_langchain/chains/question_answering/__init__.py,sha256=_gOZMc-SWprK6xc-Jj64jcr9nc-G4YkZbEYwfJNq_bY,8795
|
@@ -126,7 +126,7 @@ bisheng_langchain/rag/config/baseline_s2b_mix.yaml,sha256=rkPfzU2-mvjRrZ0zMHaQsn
|
|
126
126
|
bisheng_langchain/rag/config/baseline_v2.yaml,sha256=RP-DwIRIS_ZK8ixbXi2Z28rKqHD56pWmr2o2WWIwq3Y,2382
|
127
127
|
bisheng_langchain/rag/init_retrievers/__init__.py,sha256=qpLLAuqZPtumTlJj17Ie5AbDDmiUiDxYefg_pumqu-c,218
|
128
128
|
bisheng_langchain/rag/init_retrievers/baseline_vector_retriever.py,sha256=oRKZZpxlLQAtsubIcAXeXpf1a9h6Pt6uOtNTLeD2jps,2362
|
129
|
-
bisheng_langchain/rag/init_retrievers/keyword_retriever.py,sha256=
|
129
|
+
bisheng_langchain/rag/init_retrievers/keyword_retriever.py,sha256=NRT0fBx6HFR7j9IbRl_NBuqF7hnL-9v5GCqHpgnrfPQ,2523
|
130
130
|
bisheng_langchain/rag/init_retrievers/mix_retriever.py,sha256=Whxq4kjNPLsxnHcVo60usdFFwLTCD-1jO38q08LXkVQ,4653
|
131
131
|
bisheng_langchain/rag/init_retrievers/smaller_chunks_retriever.py,sha256=RQ7QLEOOhBrkw-EimXVJqIGa96D-KkNDik2h9hzg9fU,3805
|
132
132
|
bisheng_langchain/rag/prompts/__init__.py,sha256=IUCq9gzqGQN_6IDk0D_F5t3mOUI_KbmSzYnnXoX4VKE,223
|
@@ -150,10 +150,10 @@ bisheng_langchain/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
150
150
|
bisheng_langchain/utils/azure_dalle_image_generator.py,sha256=96-_nO4hDSwyPE4rSYop5SgJ-U9CE2un4bTdW0E5RGU,6582
|
151
151
|
bisheng_langchain/utils/requests.py,sha256=vWGKyNTxApVeaVdKxqACfIT1Q8wMy-jC3kUv2Ce9Mzc,8688
|
152
152
|
bisheng_langchain/vectorstores/__init__.py,sha256=zCZgDe7LyQ0iDkfcm5UJ5NxwKQSRHnqrsjx700Fy11M,213
|
153
|
-
bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=
|
153
|
+
bisheng_langchain/vectorstores/elastic_keywords_search.py,sha256=inZarhahRaesrvLqyeRCMQvHGAASY53opEVA0_o8S14,14901
|
154
154
|
bisheng_langchain/vectorstores/milvus.py,sha256=xh7NokraKg_Xc9ofz0RVfJ_I36ftnprLJtV-1NfaeyQ,37162
|
155
155
|
bisheng_langchain/vectorstores/retriever.py,sha256=hj4nAAl352EV_ANnU2OHJn7omCH3nBK82ydo14KqMH4,4353
|
156
|
-
bisheng_langchain-0.3.
|
157
|
-
bisheng_langchain-0.3.
|
158
|
-
bisheng_langchain-0.3.
|
159
|
-
bisheng_langchain-0.3.
|
156
|
+
bisheng_langchain-0.3.6.dev1.dist-info/METADATA,sha256=KG32YRknnVoAxFzVKE_qMMQBjbhZen046fXQYyhXQvs,2476
|
157
|
+
bisheng_langchain-0.3.6.dev1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
158
|
+
bisheng_langchain-0.3.6.dev1.dist-info/top_level.txt,sha256=Z6pPNyCo4ihyr9iqGQbH8sJiC4dAUwA_mAyGRQB5_Fs,18
|
159
|
+
bisheng_langchain-0.3.6.dev1.dist-info/RECORD,,
|
File without changes
|
{bisheng_langchain-0.3.5.dev1.dist-info → bisheng_langchain-0.3.6.dev1.dist-info}/top_level.txt
RENAMED
File without changes
|