mb-rag 1.1.45__py3-none-any.whl → 1.1.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mb-rag might be problematic. Click here for more details.
- mb_rag/rag/embeddings.py +61 -4
- mb_rag/version.py +1 -1
- {mb_rag-1.1.45.dist-info → mb_rag-1.1.47.dist-info}/METADATA +1 -1
- {mb_rag-1.1.45.dist-info → mb_rag-1.1.47.dist-info}/RECORD +6 -6
- {mb_rag-1.1.45.dist-info → mb_rag-1.1.47.dist-info}/WHEEL +0 -0
- {mb_rag-1.1.45.dist-info → mb_rag-1.1.47.dist-info}/top_level.txt +0 -0
mb_rag/rag/embeddings.py
CHANGED
|
@@ -61,7 +61,9 @@ from langchain.text_splitter import (
|
|
|
61
61
|
CharacterTextSplitter,
|
|
62
62
|
RecursiveCharacterTextSplitter,
|
|
63
63
|
SentenceTransformersTokenTextSplitter,
|
|
64
|
-
TokenTextSplitter
|
|
64
|
+
TokenTextSplitter,
|
|
65
|
+
MarkdownHeaderTextSplitter,
|
|
66
|
+
SemanticChunker)
|
|
65
67
|
from langchain_community.document_loaders import TextLoader, FireCrawlLoader
|
|
66
68
|
from langchain_chroma import Chroma
|
|
67
69
|
from ..utils.extra import load_env_file
|
|
@@ -69,6 +71,8 @@ from langchain.chains import create_history_aware_retriever, create_retrieval_ch
|
|
|
69
71
|
from langchain.chains.combine_documents import create_stuff_documents_chain
|
|
70
72
|
from langchain_core.messages import HumanMessage, SystemMessage
|
|
71
73
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
74
|
+
from langchain.retrievers import ContextualCompressionRetriever
|
|
75
|
+
from langchain_community.document_compressors import FlashrankRerank
|
|
72
76
|
|
|
73
77
|
load_env_file()
|
|
74
78
|
|
|
@@ -181,13 +185,13 @@ class ModelProvider:
|
|
|
181
185
|
return GoogleGenerativeAIEmbeddings(**kwargs)
|
|
182
186
|
|
|
183
187
|
@staticmethod
|
|
184
|
-
def get_rag_qwen(model_name: str = "
|
|
188
|
+
def get_rag_qwen(model_name: str = "Qwen/Qwen3-Embedding-0.6B", **kwargs):
|
|
185
189
|
"""
|
|
186
190
|
Load Qwen embedding model.
|
|
187
191
|
Uses Transformers for embedding generation.
|
|
188
192
|
|
|
189
193
|
Args:
|
|
190
|
-
model_name (str): Model identifier (default: "
|
|
194
|
+
model_name (str): Model identifier (default: "Qwen/Qwen3-Embedding-0.6B")
|
|
191
195
|
**kwargs: Additional arguments for model initialization
|
|
192
196
|
|
|
193
197
|
Returns:
|
|
@@ -195,7 +199,7 @@ class ModelProvider:
|
|
|
195
199
|
"""
|
|
196
200
|
from langchain.embeddings import HuggingFaceEmbeddings
|
|
197
201
|
|
|
198
|
-
return HuggingFaceEmbeddings(model_name=
|
|
202
|
+
return HuggingFaceEmbeddings(model_name=model_name, **kwargs)
|
|
199
203
|
|
|
200
204
|
def load_embedding_model(model_name: str = 'openai', model_type: str = "text-embedding-ada-002", **kwargs):
|
|
201
205
|
"""
|
|
@@ -308,6 +312,14 @@ class TextProcessor:
|
|
|
308
312
|
'token': TokenTextSplitter(
|
|
309
313
|
chunk_size=chunk_size,
|
|
310
314
|
chunk_overlap=chunk_overlap
|
|
315
|
+
),
|
|
316
|
+
'markdown_header': MarkdownHeaderTextSplitter(
|
|
317
|
+
chunk_size=chunk_size,
|
|
318
|
+
chunk_overlap=chunk_overlap
|
|
319
|
+
),
|
|
320
|
+
'semantic_chunker': SemanticChunker(
|
|
321
|
+
chunk_size=chunk_size,
|
|
322
|
+
chunk_overlap=chunk_overlap
|
|
311
323
|
)
|
|
312
324
|
}
|
|
313
325
|
|
|
@@ -323,6 +335,7 @@ class TextProcessor:
|
|
|
323
335
|
print(f"Text data splitted into {len(docs)} chunks")
|
|
324
336
|
return docs
|
|
325
337
|
|
|
338
|
+
|
|
326
339
|
class embedding_generator:
|
|
327
340
|
"""
|
|
328
341
|
Main class for generating embeddings and managing RAG operations.
|
|
@@ -374,6 +387,7 @@ class embedding_generator:
|
|
|
374
387
|
self.vector_store = self.load_vectorstore(**(vector_store_kwargs or {}))
|
|
375
388
|
self.collection_name = collection_name
|
|
376
389
|
self.text_processor = TextProcessor(logger)
|
|
390
|
+
self.compression_retriever = None
|
|
377
391
|
|
|
378
392
|
def check_file(self, file_path: str) -> bool:
|
|
379
393
|
"""Check if file exists."""
|
|
@@ -570,6 +584,49 @@ class embedding_generator:
|
|
|
570
584
|
retriever = self.retriever
|
|
571
585
|
return retriever.get_relevant_documents(query)
|
|
572
586
|
|
|
587
|
+
def load_flashrank_compression_retriever(self, base_retriever=None, model_name: str = "flashrank/flashrank-base", top_n: int = 5):
|
|
588
|
+
"""
|
|
589
|
+
Load a ContextualCompressionRetriever using FlashrankRerank.
|
|
590
|
+
|
|
591
|
+
Args:
|
|
592
|
+
base_retriever: Existing retriever (if None, uses self.retriever)
|
|
593
|
+
model_name (str): Flashrank model identifier (default: "flashrank/flashrank-base")
|
|
594
|
+
top_n (int): Number of top documents to return after reranking
|
|
595
|
+
|
|
596
|
+
Returns:
|
|
597
|
+
ContextualCompressionRetriever: A compression-based retriever using Flashrank
|
|
598
|
+
"""
|
|
599
|
+
if base_retriever is None:
|
|
600
|
+
base_retriever = self.retriever
|
|
601
|
+
if base_retriever is None:
|
|
602
|
+
raise ValueError("Base retriever is required.")
|
|
603
|
+
|
|
604
|
+
compressor = FlashrankRerank(model=model_name, top_n=top_n)
|
|
605
|
+
self.compression_retriever = ContextualCompressionRetriever(
|
|
606
|
+
base_compressor=compressor,
|
|
607
|
+
base_retriever=base_retriever
|
|
608
|
+
)
|
|
609
|
+
|
|
610
|
+
if self.logger:
|
|
611
|
+
self.logger.info("Loaded Flashrank compression retriever.")
|
|
612
|
+
return self.compression_retriever
|
|
613
|
+
|
|
614
|
+
def compression_invoke(self, query: str):
|
|
615
|
+
"""
|
|
616
|
+
Invoke compression retriever. Only one compression retriever (Reranker) added right now.
|
|
617
|
+
|
|
618
|
+
Args:
|
|
619
|
+
query (str): Query string
|
|
620
|
+
|
|
621
|
+
Returns:
|
|
622
|
+
Any: Query results
|
|
623
|
+
"""
|
|
624
|
+
|
|
625
|
+
if self.compression_retriever is None:
|
|
626
|
+
self.compression_retriever = self.load_flashrank_compression_retriever(base_retriever=self.retriever)
|
|
627
|
+
print("Compression retriever loaded.")
|
|
628
|
+
return self.compression_retriever.invoke(query)
|
|
629
|
+
|
|
573
630
|
def generate_rag_chain(self, context_prompt: str = None, retriever=None, llm=None):
|
|
574
631
|
"""
|
|
575
632
|
Generate RAG chain for conversation.
|
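mb_rag/version.py
CHANGED
(diff not shown in this extract)
{mb_rag-1.1.45.dist-info → mb_rag-1.1.47.dist-info}/METADATA
CHANGED
(diff not shown in this extract)
{mb_rag-1.1.45.dist-info → mb_rag-1.1.47.dist-info}/RECORD
CHANGED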
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
mb_rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
mb_rag/version.py,sha256=
|
|
2
|
+
mb_rag/version.py,sha256=5oZ2NODxfaDs2uAEuvKTpFLPnK9Lnk_JL9tUdtfhFAc,207
|
|
3
3
|
mb_rag/chatbot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
mb_rag/chatbot/basic.py,sha256=8tXU_3Yiqv0J-2Bnpw8p9sQaOlZHzX-Xenjs9GmWqes,23825
|
|
5
5
|
mb_rag/chatbot/chains.py,sha256=vDbLX5R29sWN1pcFqJ5fyxJEgMCM81JAikunAEvMC9A,7223
|
|
6
6
|
mb_rag/chatbot/prompts.py,sha256=n1PyiLbU-5fkslRv6aVOzt0dDlwya_cEdQ7kRnRhMuY,1749
|
|
7
7
|
mb_rag/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
mb_rag/rag/embeddings.py,sha256=
|
|
8
|
+
mb_rag/rag/embeddings.py,sha256=uP7dlEtvI7UE7aUdFHdsRax6HaWKMMMdV5LZiG4CIZY,30515
|
|
9
9
|
mb_rag/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
mb_rag/utils/all_data_extract.py,sha256=TL6O4vNc7mPW-OGK-LhXQQIkSr0o3_7BqNAD-YpTQMU,2532
|
|
11
11
|
mb_rag/utils/bounding_box.py,sha256=G0hdDam8QmYtD9lfwMeDHGm-TTo6KZg-yK5ESFL9zaM,8366
|
|
12
12
|
mb_rag/utils/document_extract.py,sha256=vZiFB1RYm1BIEaNA0MveJ5Zp-KEi0ngKjW8xEdtPqXA,12558
|
|
13
13
|
mb_rag/utils/extra.py,sha256=spbFrGgdruNyYQ5PzgvpSIa6Nm0rn9bb4qc8W9g582o,2492
|
|
14
14
|
mb_rag/utils/pdf_extract.py,sha256=cVeMyhnAU4XZxjIZHKMYhrktTjUNOjhx2r_LZKReOZE,15598
|
|
15
|
-
mb_rag-1.1.
|
|
16
|
-
mb_rag-1.1.
|
|
17
|
-
mb_rag-1.1.
|
|
18
|
-
mb_rag-1.1.
|
|
15
|
+
mb_rag-1.1.47.dist-info/METADATA,sha256=S2KejHwSkHGmx_UzZG46mCnHjLvbkYq5K6INyqBWAtk,234
|
|
16
|
+
mb_rag-1.1.47.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
+
mb_rag-1.1.47.dist-info/top_level.txt,sha256=FIK1eAa5uYnurgXZquBG-s3PIy-HDTC5yJBW4lTH_pM,7
|
|
18
|
+
mb_rag-1.1.47.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|