mb-rag 1.1.66__tar.gz → 1.1.67__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mb_rag-1.1.66 → mb_rag-1.1.67}/PKG-INFO +1 -1
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/rag/embeddings.py +83 -88
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/version.py +1 -1
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag.egg-info/PKG-INFO +1 -1
- {mb_rag-1.1.66 → mb_rag-1.1.67}/README.md +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/__init__.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/basic.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/chatbot/__init__.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/chatbot/chains.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/chatbot/conversation.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/prompts_bank.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/rag/__init__.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/__init__.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/all_data_extract.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/bounding_box.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/document_extract.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/extra.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/llm_wrapper.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/pdf_extract.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag/utils/viewer.py +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag.egg-info/SOURCES.txt +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag.egg-info/dependency_links.txt +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag.egg-info/requires.txt +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/mb_rag.egg-info/top_level.txt +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/pyproject.toml +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/setup.cfg +0 -0
- {mb_rag-1.1.66 → mb_rag-1.1.67}/setup.py +0 -0
mb_rag/rag/embeddings.py

@@ -57,21 +57,20 @@ import os
 import shutil
 import importlib.util
 from typing import List, Dict, Optional, Union, Any
-from
+from langchain_text_splitters import (
     CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
     SentenceTransformersTokenTextSplitter,
     TokenTextSplitter,
-    MarkdownHeaderTextSplitter
-    SemanticChunker)
+    MarkdownHeaderTextSplitter)
 from langchain_community.document_loaders import TextLoader, FireCrawlLoader
 from langchain_chroma import Chroma
 from ..utils.extra import load_env_file
-from langchain.chains import create_history_aware_retriever, create_retrieval_chain
-from langchain.chains.combine_documents import create_stuff_documents_chain
+# from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+# from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain.retrievers import ContextualCompressionRetriever
+# from langchain.retrievers import ContextualCompressionRetriever
 from langchain_community.document_compressors import FlashrankRerank

 load_env_file()
@@ -317,10 +316,6 @@ class TextProcessor:
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap
             ),
-            'semantic_chunker': SemanticChunker(
-                chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap
-            )
         }

         if text_splitter_type not in splitters:
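With the 'semantic_chunker' option gone from the splitters map, users who still want semantic splitting can build the splitter themselves. A minimal sketch, assuming langchain_experimental and langchain_openai are installed; note that SemanticChunker lives in langchain_experimental and is configured with an embedding model rather than chunk_size/chunk_overlap, so it is not part of mb_rag's splitter map anymore:

```python
# Illustrative only; this is not provided by mb_rag 1.1.67.
# Assumes langchain_experimental and langchain_openai are installed.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(),           # boundaries come from embedding similarity,
    breakpoint_threshold_type="percentile",  # not from chunk_size/chunk_overlap
)
docs = splitter.create_documents(["Long text to split semantically ..."])
```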
@@ -584,32 +579,32 @@ class embedding_generator:
         retriever = self.retriever
         return retriever.get_relevant_documents(query)

-    def load_flashrank_compression_retriever(self, base_retriever=None, model_name: str = "flashrank/flashrank-base", top_n: int = 5):
-        ... 25 more removed lines (method docstring and body) ...
+    # def load_flashrank_compression_retriever(self, base_retriever=None, model_name: str = "flashrank/flashrank-base", top_n: int = 5):
+    #     """
+    #     Load a ContextualCompressionRetriever using FlashrankRerank.
+
+    #     Args:
+    #         base_retriever: Existing retriever (if None, uses self.retriever)
+    #         model_name (str): Flashrank model identifier (default: "flashrank/flashrank-base")
+    #         top_n (int): Number of top documents to return after reranking
+
+    #     Returns:
+    #         ContextualCompressionRetriever: A compression-based retriever using Flashrank
+    #     """
+    #     if base_retriever is None:
+    #         base_retriever = self.retriever
+    #     if base_retriever is None:
+    #         raise ValueError("Base retriever is required.")
+
+    #     compressor = FlashrankRerank(model=model_name, top_n=top_n)
+    #     self.compression_retriever = ContextualCompressionRetriever(
+    #         base_compressor=compressor,
+    #         base_retriever=base_retriever
+    #     )
+
+    #     if self.logger:
+    #         self.logger.info("Loaded Flashrank compression retriever.")
+    #     return self.compression_retriever

     def compression_invoke(self, query: str):
         """
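With load_flashrank_compression_retriever commented out (along with the ContextualCompressionRetriever import), the same reranking setup can be assembled directly in application code. A minimal sketch, assuming `base_retriever` is an existing retriever (e.g. from Chroma.as_retriever()) and that the flashrank package is installed for FlashrankRerank; none of these names are part of mb_rag's API in 1.1.67:

```python
# Sketch of the behaviour the commented-out helper provided; not part of mb_rag 1.1.67.
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank

compressor = FlashrankRerank(top_n=5)  # rerank and keep the 5 best documents
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever,     # assumed to exist, e.g. a Chroma retriever
)
reranked_docs = compression_retriever.invoke("What does the report say about pricing?")
```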
@@ -627,58 +622,58 @@ class embedding_generator:
         print("Compression retriever loaded.")
         return self.compression_retriever.invoke(query)

-    def generate_rag_chain(self, context_prompt: str = None, retriever=None, llm=None):
-        ... 51 more removed lines (method docstring and body) ...
+    # def generate_rag_chain(self, context_prompt: str = None, retriever=None, llm=None):
+    #     """
+    #     Generate RAG chain for conversation.
+
+    #     Args:
+    #         context_prompt (str): Optional context prompt
+    #         retriever: Optional retriever instance
+    #         llm: Optional language model instance
+
+    #     Returns:
+    #         Any: Generated RAG chain
+
+    #     Example:
+    #         ```python
+    #         rag_chain = gen.generate_rag_chain(retriever=retriever)
+    #         ```
+    #     """
+    #     if context_prompt is None:
+    #         context_prompt = ("You are an assistant for question-answering tasks. "
+    #                           "Use the following pieces of retrieved context to answer the question. "
+    #                           "If you don't know the answer, just say that you don't know. "
+    #                           "Use three sentences maximum and keep the answer concise.\n\n{context}")
+
+    #     contextualize_q_system_prompt = ("Given a chat history and the latest user question "
+    #                                      "which might reference context in the chat history, "
+    #                                      "formulate a standalone question which can be understood, "
+    #                                      "just reformulate it if needed and otherwise return it as is.")
+
+    #     contextualize_q_prompt = ChatPromptTemplate.from_messages([
+    #         ("system", contextualize_q_system_prompt),
+    #         MessagesPlaceholder("chat_history"),
+    #         ("human", "{input}"),
+    #     ])
+
+    #     if retriever is None:
+    #         retriever = self.retriever
+    #     if llm is None:
+    #         if not ModelProvider.check_package("langchain_openai"):
+    #             raise ImportError("OpenAI package not found. Please install: pip install langchain-openai")
+    #         from langchain_openai import ChatOpenAI
+    #         llm = ChatOpenAI(model="gpt-4o", temperature=0.8)
+
+    #     history_aware_retriever = create_history_aware_retriever(llm, retriever,
+    #                                                              contextualize_q_prompt)
+    #     qa_prompt = ChatPromptTemplate.from_messages([
+    #         ("system", context_prompt),
+    #         MessagesPlaceholder("chat_history"),
+    #         ("human", "{input}"),
+    #     ])
+    #     question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+    #     rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+    #     return rag_chain

     def conversation_chain(self, query: str, rag_chain, file: str = None):
         """
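For anyone who depended on generate_rag_chain, the commented-out body above is effectively the recipe, and the same chain can be built directly against LangChain. A condensed sketch, assuming `retriever` already exists and langchain-openai is installed; the mb_rag-internal ModelProvider.check_package guard is replaced here by a plain import, and all names are illustrative rather than part of mb_rag's 1.1.67 API:

```python
# Sketch reproducing the commented-out generate_rag_chain logic outside mb_rag.
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0.8)

# Rewrite a follow-up question into a standalone question using the chat history.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", "Given a chat history and the latest user question, "
               "formulate a standalone question; return it as is if no rewrite is needed."),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

# Answer from the retrieved documents, injected via the {context} placeholder.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "Use the retrieved context to answer concisely. "
               "If you don't know, say so.\n\n{context}"),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

answer = rag_chain.invoke({"input": "Summarise the document", "chat_history": []})
```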
All other files listed above are unchanged between 1.1.66 and 1.1.67.