ragmint 0.2.1__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragmint/app.py +512 -0
- ragmint/autotuner.py +201 -17
- ragmint/core/chunking.py +68 -4
- ragmint/core/embeddings.py +46 -10
- ragmint/core/evaluation.py +33 -14
- ragmint/core/pipeline.py +34 -10
- ragmint/core/retriever.py +152 -20
- ragmint/experiments/validation_qa.json +1 -14
- ragmint/explainer.py +47 -20
- ragmint/integrations/__init__.py +0 -0
- ragmint/integrations/config_adapter.py +96 -0
- ragmint/integrations/langchain_prebuilder.py +99 -0
- ragmint/leaderboard.py +41 -35
- ragmint/qa_generator.py +190 -0
- ragmint/tests/test_autotuner.py +52 -30
- ragmint/tests/test_config_adapter.py +39 -0
- ragmint/tests/test_embeddings.py +46 -0
- ragmint/tests/test_explainer.py +28 -12
- ragmint/tests/test_integration_autotuner_ragmint.py +39 -52
- ragmint/tests/test_langchain_prebuilder.py +82 -0
- ragmint/tests/test_leaderboard.py +78 -25
- ragmint/tests/test_pipeline.py +3 -2
- ragmint/tests/test_qa_generator.py +66 -0
- ragmint/tests/test_retriever.py +3 -2
- ragmint/tests/test_tuner.py +1 -1
- ragmint/tuner.py +109 -22
- ragmint-0.4.6.data/data/README.md +485 -0
- ragmint-0.4.6.dist-info/METADATA +530 -0
- ragmint-0.4.6.dist-info/RECORD +48 -0
- ragmint-0.4.6.dist-info/licenses/LICENSE +19 -0
- ragmint/tests/test_explainer_integration.py +0 -18
- ragmint-0.2.1.dist-info/METADATA +0 -27
- ragmint-0.2.1.dist-info/RECORD +0 -38
- {ragmint-0.2.1.dist-info/licenses → ragmint-0.4.6.data/data}/LICENSE +0 -0
- {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/WHEEL +0 -0
- {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/top_level.txt +0 -0
ragmint/core/retriever.py
CHANGED
@@ -1,33 +1,165 @@
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional
 import numpy as np
+from .embeddings import Embeddings
+
+# Optional imports
+try:
+    import faiss
+except ImportError:
+    faiss = None
+
+try:
+    import chromadb
+except ImportError:
+    chromadb = None
+
+try:
+    from sklearn.neighbors import BallTree
+except ImportError:
+    BallTree = None
+
+try:
+    from rank_bm25 import BM25Okapi
+except ImportError:
+    BM25Okapi = None


 class Retriever:
     """
-
+    Multi-backend retriever supporting:
+    - "numpy"  : basic cosine similarity (dense)
+    - "faiss"  : high-performance dense retriever
+    - "chroma" : persistent vector DB
+    - "sklearn": BallTree (cosine or Euclidean)
+    - "bm25"   : lexical retriever using Rank-BM25
+
+    Example:
+        retriever = Retriever(embedder, documents=["A", "B", "C"], backend="bm25")
+        results = retriever.retrieve("example query", top_k=3)
     """

-    def __init__(
-
-
-
-
-
+    def __init__(
+        self,
+        embedder: Optional[Embeddings] = None,
+        documents: Optional[List[str]] = None,
+        embeddings: Optional[np.ndarray] = None,
+        backend: str = "numpy",
+    ):
+        self.embedder = embedder
+        self.documents = documents or []
+        self.backend = backend.lower()
+        self.embeddings = None
+        self.index = None
+        self.client = None
+        self.bm25 = None
+
+        # Initialize embeddings for dense backends
+        if self.backend not in ["bm25"]:
+            if embeddings is not None:
+                self.embeddings = np.array(embeddings)
+            elif self.documents and self.embedder:
+                self.embeddings = self.embedder.encode(self.documents)
+            else:
+                self.embeddings = np.zeros((0, getattr(self.embedder, "dim", 768)))
+
+            # Normalize for cosine
+            if self.embeddings.size > 0:
+                self.embeddings = self._normalize(self.embeddings)
+
+        # Initialize backend
+        self._init_backend()
+
+    # ------------------------
+    # Backend Initialization
+    # ------------------------
+    def _init_backend(self):
+        if self.backend == "faiss":
+            if faiss is None:
+                raise ImportError("faiss not installed. Run `pip install faiss-cpu`.")
+            self.index = faiss.IndexFlatIP(self.embedder.dim)
+            self.index.add(self.embeddings.astype("float32"))
+
+        elif self.backend == "chroma":
+            if chromadb is None:
+                raise ImportError("chromadb not installed. Run `pip install chromadb`.")
+            self.client = chromadb.Client()
+            self.collection = self.client.create_collection(name="ragmint_retriever")
+            for i, doc in enumerate(self.documents):
+                self.collection.add(
+                    ids=[str(i)],
+                    documents=[doc],
+                    embeddings=[self.embeddings[i].tolist()],
+                )

+        elif self.backend == "sklearn":
+            if BallTree is None:
+                raise ImportError("scikit-learn not installed. Run `pip install scikit-learn`.")
+            self.index = BallTree(self.embeddings)
+
+        elif self.backend == "bm25":
+            if BM25Okapi is None:
+                raise ImportError("rank-bm25 not installed. Run `pip install rank-bm25`.")
+            tokenized_corpus = [doc.lower().split() for doc in self.documents]
+            self.bm25 = BM25Okapi(tokenized_corpus)
+
+        elif self.backend != "numpy":
+            raise ValueError(f"Unsupported retriever backend: {self.backend}")
+
+    # ------------------------
+    # Retrieval
+    # ------------------------
     def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
-        if
+        if len(self.documents) == 0:
+            return [{"text": "", "score": 0.0}]
+
+        # BM25 retrieval (lexical)
+        if self.backend == "bm25":
+            tokenized_query = query.lower().split()
+            scores = self.bm25.get_scores(tokenized_query)
+            top_indices = np.argsort(scores)[::-1][:top_k]
+            return [
+                {"text": self.documents[i], "score": float(scores[i])}
+                for i in top_indices
+            ]
+
+        # Dense retrieval (others)
+        if self.embeddings is None or self.embeddings.size == 0:
             return [{"text": "", "score": 0.0}]

-        query_vec = self.
-
-        top_indices = np.argsort(scores)[::-1][:min(top_k, len(scores))]
-        return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]
+        query_vec = self.embedder.encode([query])[0]
+        query_vec = self._normalize(query_vec)

-
-
-
+        if self.backend == "numpy":
+            scores = np.dot(self.embeddings, query_vec)
+            top_indices = np.argsort(scores)[::-1][:top_k]
+            return [{"text": self.documents[i], "score": float(scores[i])} for i in top_indices]
+
+        elif self.backend == "faiss":
+            query_vec = np.expand_dims(query_vec.astype("float32"), axis=0)
+            scores, indices = self.index.search(query_vec, top_k)
+            return [{"text": self.documents[int(i)], "score": float(scores[0][j])} for j, i in enumerate(indices[0])]
+
+        elif self.backend == "chroma":
+            results = self.collection.query(query_texts=[query], n_results=top_k)
+            docs = results["documents"][0]
+            scores = results["distances"][0]
+            return [{"text": d, "score": 1 - s} for d, s in zip(docs, scores)]
+
+        elif self.backend == "sklearn":
+            distances, indices = self.index.query([query_vec], k=top_k)
+            scores = 1 - distances[0]
+            return [{"text": self.documents[int(i)], "score": float(scores[j])} for j, i in enumerate(indices[0])]
+
+        else:
+            raise ValueError(f"Unknown backend: {self.backend}")

-
-
-
-
+    # ------------------------
+    # Utils
+    # ------------------------
+    @staticmethod
+    def _normalize(vectors: np.ndarray) -> np.ndarray:
+        if vectors.ndim == 1:
+            norm = np.linalg.norm(vectors)
+            return vectors / norm if norm > 0 else vectors
+        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+        return np.divide(vectors, norms, out=np.zeros_like(vectors), where=norms != 0)
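For orientation, a minimal usage sketch (not part of the diff) based on the Retriever docstring above. The sample documents and query are illustrative; dense backends ("numpy", "faiss", "chroma", "sklearn") additionally need an Embeddings instance from ragmint/core/embeddings.py, whose constructor arguments are not shown in this diff.

    # Illustrative only: exercises the new BM25 backend, which needs no embedder.
    from ragmint.core.retriever import Retriever

    docs = [
        "RAG combines retrieval with text generation.",
        "Embeddings map text to dense vectors.",
        "BM25 is a lexical ranking function.",
    ]
    retriever = Retriever(documents=docs, backend="bm25")  # requires rank-bm25
    for hit in retriever.retrieve("what is rag", top_k=2):
        print(hit["score"], hit["text"])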
ragmint/experiments/validation_qa.json
CHANGED

@@ -1,14 +1 @@
-[
-    {
-        "query": "What is Retrieval-Augmented Generation?",
-        "expected_answer": "A technique that combines information retrieval with language generation to improve factual accuracy."
-    },
-    {
-        "query": "What is the role of embeddings in a RAG system?",
-        "expected_answer": "They represent text as numerical vectors for similarity-based retrieval."
-    },
-    {
-        "query": "What is Maximal Marginal Relevance used for?",
-        "expected_answer": "To select diverse and relevant documents during reranking."
-    }
-]
+[]
ragmint/explainer.py
CHANGED
@@ -1,49 +1,76 @@
 """
 Interpretability Layer
 ----------------------
-Uses Gemini or Anthropic Claude to explain why
-
+Uses Gemini or Anthropic Claude to explain why a particular RAG configuration
+performed best, considering both optimizer results and corpus characteristics.
 """

 import os
 import json
+from dotenv import load_dotenv

+# Load .env if available
+load_dotenv()

-def explain_results(
+def explain_results(best_result: dict, all_results: list, corpus_stats: dict = None,
+                    model: str = "gemini-2.5-flash-lite") -> str:
     """
-    Generate a natural-language explanation
-
-
-
-
-
-
-
-
-
-    Metrics B: {json.dumps(results_b, indent=2)}
-    Provide a concise, human-friendly explanation and practical improvement tips.
+    Generate a detailed natural-language explanation for RAG optimization results.
+
+    Parameters:
+    - best_result: dict containing the best configuration and metrics.
+    - all_results: list of all trial results with metrics and configs.
+    - corpus_stats: optional dict with corpus info (size, avg_len, num_docs).
+    - model: LLM model name (Gemini or Claude).
+
+    Returns:
+        A natural-language explanation string.
     """

     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
-    google_key = os.getenv("
+    google_key = os.getenv("GOOGLE_API_KEY")
+
+    # Build dynamic context
+    corpus_info = json.dumps(corpus_stats or {}, indent=2)
+    best_json = json.dumps(best_result, indent=2)
+    all_json = json.dumps(list(all_results)[:10], indent=2)  # cap for safety

+    prompt = f"""
+    You are an expert AI researcher specializing in Retrieval-Augmented Generation (RAG) optimization.
+
+    A RAG auto-tuner was run on a corpus with these characteristics:
+    {corpus_info}
+
+    The tuner evaluated multiple configurations and metrics. Below are:
+    - The BEST configuration:
+    {best_json}
+
+    - A sample of ALL evaluated configurations:
+    {all_json}
+
+    Please:
+    1. Explain WHY this best configuration likely performs better than others.
+    2. Highlight trade-offs between accuracy, latency, and resource usage.
+    3. Suggest potential improvements (different chunking, embedding, retriever, etc.).
+    4. Provide a concise summary of which setup you recommend for this corpus.
+    Keep it structured, under 300 words, and easy to read.
+    """

-    # 1️⃣
+    # --- 1️⃣ Anthropic Claude first ---
     if anthropic_key:
         try:
             from anthropic import Anthropic
             client = Anthropic(api_key=anthropic_key)
             response = client.messages.create(
                 model="claude-3-opus-20240229",
-                max_tokens=
+                max_tokens=500,
                 messages=[{"role": "user", "content": prompt}],
             )
             return response.content[0].text
         except Exception as e:
             return f"[Claude unavailable] {e}"

-    # 2️⃣
+    # --- 2️⃣ Gemini fallback ---
     elif google_key:
         try:
             import google.generativeai as genai

@@ -53,7 +80,7 @@ def explain_results(results_a: dict, results_b: dict, model: str = "gemini-1.5-p
         except Exception as e:
             return f"[Gemini unavailable] {e}"

-    # 3️⃣ Fallback
+    # --- 3️⃣ Fallback message ---
     else:
         return (
             "[No LLM available] Please set ANTHROPIC_API_KEY or GOOGLE_API_KEY "
ragmint/integrations/__init__.py
File without changes
ragmint/integrations/config_adapter.py
ADDED

@@ -0,0 +1,96 @@
+"""
+RAGMint → LangChain Config Adapter
+----------------------------------
+Takes RAGMint or AutoRAGTuner recommendations and converts them into
+a normalized, pickle-safe configuration that can be used to build
+a LangChain RAG pipeline later.
+"""
+
+import json
+import pickle
+from pathlib import Path
+from typing import Dict, Any
+
+
+class LangchainConfigAdapter:
+    """
+    Converts RAGMint recommendations into LangChain-compatible configs.
+
+    Example:
+        adapter = LangChainConfigAdapter()
+        cfg = adapter.prepare(recommendation)
+        adapter.save(cfg, "best_config.pkl")
+    """
+
+    DEFAULT_EMBEDDINGS = {
+        "OpenAI": "sentence-transformers/all-MiniLM-L6-v2",
+        "SentenceTransformers": "sentence-transformers/all-MiniLM-L6-v2",
+        "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
+        "InstructorXL": "hkunlp/instructor-xl"
+    }
+
+    SUPPORTED_RETRIEVERS = {"faiss", "chroma", "bm25", "numpy", "sklearn"}
+
+    def __init__(self, recommendation: Dict[str, Any] | None = None):
+        self.recommendation = recommendation
+
+    def prepare(self, recommendation: Dict[str, Any] | None = None) -> Dict[str, Any]:
+        recommendation = recommendation or self.recommendation or {}
+        """
+        Normalize and validate configuration for LangChain use.
+
+        Returns:
+            dict with clean retriever, embedding, and chunking settings.
+        """
+        retriever = recommendation.get("retriever", "faiss").lower()
+        embedding_model = recommendation.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2")
+        chunk_size = recommendation.get("chunk_size", 400)
+        overlap = recommendation.get("overlap", 100)
+
+        # Normalize embedding model names
+        embedding_model = self.DEFAULT_EMBEDDINGS.get(embedding_model, embedding_model)
+
+        # Validate retriever backend
+        if retriever not in self.SUPPORTED_RETRIEVERS:
+            raise ValueError(f"Unsupported retriever backend: {retriever}")
+
+        config = {
+            "retriever": retriever,
+            "embedding_model": embedding_model,
+            "chunk_size": int(chunk_size),
+            "overlap": int(overlap),
+        }
+
+        return config
+
+    def save(self, config: Dict[str, Any], path: str):
+        """
+        Save configuration to a pickle file.
+        """
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "wb") as f:
+            pickle.dump(config, f)
+        print(f"💾 Saved LangChain config → {path}")
+
+    def load(self, path: str) -> Dict[str, Any]:
+        """
+        Load configuration from a pickle file.
+        """
+        with open(path, "rb") as f:
+            cfg = pickle.load(f)
+        print(f"✅ Loaded LangChain config ← {path}")
+        return cfg
+
+    def to_json(self, config: Dict[str, Any], path: str):
+        """
+        Save configuration as JSON (for human readability).
+        """
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(config, f, indent=2)
+        print(f"📝 Exported LangChain config → {path}")
+
+    # Alias for backward compatibility
+    def to_standard_config(self, recommendation: Dict[str, Any] | None = None) -> Dict[str, Any]:
+        """Alias for backward compatibility with older test suites."""
+        return self.prepare(recommendation)
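A short sketch (not part of the diff) of the adapter round-trip described in its docstring; the recommendation dict and output paths are arbitrary examples.

    from ragmint.integrations.config_adapter import LangchainConfigAdapter

    recommendation = {"retriever": "faiss", "embedding_model": "all-MiniLM-L6-v2",
                      "chunk_size": 400, "overlap": 100}

    adapter = LangchainConfigAdapter()
    cfg = adapter.prepare(recommendation)   # normalizes model name, validates backend
    adapter.save(cfg, "artifacts/best_config.pkl")
    adapter.to_json(cfg, "artifacts/best_config.json")
    reloaded = adapter.load("artifacts/best_config.pkl")

Note that the class is exported as LangchainConfigAdapter, while its own docstring example spells it LangChainConfigAdapter.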
ragmint/integrations/langchain_prebuilder.py
ADDED

@@ -0,0 +1,99 @@
+"""
+LangChain Pre-Build Integration
+-------------------------------
+This module bridges RAGMint's auto-tuning system with LangChain,
+returning retriever and embedding components that can plug directly
+into any LangChain RAG pipeline.
+
+Example:
+    from ragmint.integrations.langchain_prebuilder import LangChainPrebuilder
+    from langchain.chains import RetrievalQA
+    from langchain_openai import ChatOpenAI
+
+    prebuilder = LangChainPrebuilder(best_cfg)
+    retriever, embeddings = prebuilder.prepare(documents)
+
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
+"""
+
+from typing import List, Tuple, Dict, Any
+
+
+try:
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+except ImportError:
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS, Chroma
+from langchain_community.retrievers import BM25Retriever
+
+
+class LangchainPrebuilder:
+    """
+    Dynamically builds LangChain retriever and embedding objects
+    based on a RAGMint configuration dictionary.
+    """
+
+    def __init__(self, cfg: Dict[str, Any]):
+        """
+        Args:
+            cfg (dict): RAGMint configuration with keys:
+                - retriever: "faiss" | "chroma" | "bm25"
+                - embedding_model: HuggingFace model name
+                - chunk_size: int (default=500)
+                - overlap: int (default=100)
+        """
+        self.cfg = cfg
+        self.retriever_backend = cfg.get("retriever", "faiss").lower()
+        self.embedding_model = cfg.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2")
+        self.chunk_size = int(cfg.get("chunk_size", 500))
+        self.overlap = int(cfg.get("overlap", 100))
+
+    def prepare(self, documents: List[str]) -> Tuple[Any, Any]:
+        """
+        Prepares LangChain-compatible retriever and embeddings.
+
+        Args:
+            documents (list[str]): Corpus texts
+
+        Returns:
+            (retriever, embeddings): Tuple of initialized LangChain retriever and embedding model
+        """
+        # 1️⃣ Split into chunks
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.overlap
+        )
+        docs = splitter.create_documents(documents)
+
+        # 2️⃣ Create embeddings
+        embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model)
+
+        # 3️⃣ Build retriever
+        retriever = self._build_retriever(docs, embeddings)
+        return retriever, embeddings
+
+    def _build_retriever(self, docs, embeddings):
+        """Internal helper for building retriever backend."""
+        backend = self.retriever_backend
+
+        if backend == "faiss":
+            db = FAISS.from_documents(docs, embeddings)
+            return db.as_retriever(search_kwargs={"k": 5})
+
+        elif backend == "chroma":
+            db = Chroma.from_documents(docs, embeddings, collection_name="ragmint_docs")
+            return db.as_retriever(search_kwargs={"k": 5})
+
+
+        elif backend == "bm25":
+            # Support both Document objects and raw text strings
+            texts = [getattr(d, "page_content", d) for d in docs]
+            retriever = BM25Retriever.from_texts(texts)
+            retriever.k = 5
+            return retriever
+
+        else:
+            raise ValueError(f"Unsupported retriever backend: {backend}")
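A hedged end-to-end sketch (not part of the diff) chaining the adapter and prebuilder; the documents are placeholders, langchain-community plus the chosen backend's dependencies must be installed, and the final call assumes LangChain's standard retriever interface.

    from ragmint.integrations.config_adapter import LangchainConfigAdapter
    from ragmint.integrations.langchain_prebuilder import LangchainPrebuilder

    docs = ["First source document ...", "Second source document ..."]
    cfg = LangchainConfigAdapter().prepare({"retriever": "bm25", "chunk_size": 300})
    retriever, embeddings = LangchainPrebuilder(cfg).prepare(docs)
    print(retriever.get_relevant_documents("example query"))

As with the adapter, the class name is LangchainPrebuilder even though the module docstring imports LangChainPrebuilder.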
ragmint/leaderboard.py
CHANGED
@@ -1,45 +1,51 @@
 import os
 import json
 from datetime import datetime
-from typing import Dict, Any, Optional
-
+from typing import Dict, Any, List, Optional
+

 class Leaderboard:
-    def __init__(self, storage_path: Optional[str] =
+    def __init__(self, storage_path: Optional[str] = "leaderboard.jsonl"):
         self.storage_path = storage_path
-
-
-        self.
-
-
-
-
-
-
+        os.makedirs(os.path.dirname(self.storage_path) or ".", exist_ok=True)
+
+        if not os.path.exists(self.storage_path):
+            open(self.storage_path, "w", encoding="utf-8").close()
+
+    def upload(
+        self,
+        run_id: str,
+        best_config: Dict[str, Any],
+        best_score: float,
+        all_results: List[Dict[str, Any]],
+        documents: List[str],
+        model: str,
+        corpus_stats: Optional[Dict[str, Any]] = None,
+    ):
+        """Persist a full experiment run to local leaderboard."""
         data = {
             "run_id": run_id,
-            "config": config,
-            "score": score,
             "timestamp": datetime.utcnow().isoformat(),
+            "best_config": best_config,
+            "best_score": best_score,
+            "all_results": all_results,
+            "documents": [os.path.basename(d) for d in documents],
+            "model": model,
+            "corpus_stats": corpus_stats or {},
         }
-
-
-
-
-
-
-
-
-
-
-        return (
-
-
-
-
-
-        )
-        else:
-            with open(self.storage_path, "r", encoding="utf-8") as f:
-                lines = [json.loads(line) for line in f]
-            return sorted(lines, key=lambda x: x["score"], reverse=True)[:limit]
+
+        with open(self.storage_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(data) + "\n")
+
+        return data
+
+    def all_results(self) -> List[Dict[str, Any]]:
+        if not os.path.exists(self.storage_path):
+            return []
+        with open(self.storage_path, "r", encoding="utf-8") as f:
+            return [json.loads(line) for line in f if line.strip()]
+
+    def top_results(self, limit: int = 10) -> List[Dict[str, Any]]:
+        """Return top experiments by score."""
+        results = self.all_results()
+        return sorted(results, key=lambda x: x.get("best_score", 0.0), reverse=True)[:limit]
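Finally, a minimal sketch (not part of the diff) of the new Leaderboard API; the run data below is fabricated for illustration.

    from ragmint.leaderboard import Leaderboard

    lb = Leaderboard("runs/leaderboard.jsonl")
    lb.upload(
        run_id="run-001",
        best_config={"retriever": "faiss", "chunk_size": 400},
        best_score=0.81,
        all_results=[{"retriever": "faiss", "score": 0.81}],
        documents=["corpus/a.txt", "corpus/b.txt"],
        model="gemini-2.5-flash-lite",
        corpus_stats={"num_docs": 2},
    )
    for entry in lb.top_results(limit=5):
        print(entry["run_id"], entry["best_score"])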
|