ragmint 0.2.3__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragmint/app.py +512 -0
- ragmint/autotuner.py +201 -17
- ragmint/core/chunking.py +68 -4
- ragmint/core/embeddings.py +46 -10
- ragmint/core/evaluation.py +33 -14
- ragmint/core/pipeline.py +34 -10
- ragmint/core/retriever.py +152 -20
- ragmint/experiments/validation_qa.json +1 -14
- ragmint/explainer.py +47 -20
- ragmint/integrations/__init__.py +0 -0
- ragmint/integrations/config_adapter.py +96 -0
- ragmint/integrations/langchain_prebuilder.py +99 -0
- ragmint/leaderboard.py +41 -35
- ragmint/qa_generator.py +190 -0
- ragmint/tests/test_autotuner.py +52 -30
- ragmint/tests/test_config_adapter.py +39 -0
- ragmint/tests/test_embeddings.py +46 -0
- ragmint/tests/test_explainer.py +28 -12
- ragmint/tests/test_integration_autotuner_ragmint.py +39 -52
- ragmint/tests/test_langchain_prebuilder.py +82 -0
- ragmint/tests/test_leaderboard.py +78 -25
- ragmint/tests/test_pipeline.py +3 -2
- ragmint/tests/test_qa_generator.py +66 -0
- ragmint/tests/test_retriever.py +3 -2
- ragmint/tests/test_tuner.py +1 -1
- ragmint/tuner.py +109 -22
- ragmint-0.4.6.data/data/README.md +485 -0
- ragmint-0.4.6.dist-info/METADATA +530 -0
- ragmint-0.4.6.dist-info/RECORD +48 -0
- ragmint/tests/test_explainer_integration.py +0 -18
- ragmint-0.2.3.data/data/README.md +0 -284
- ragmint-0.2.3.dist-info/METADATA +0 -312
- ragmint-0.2.3.dist-info/RECORD +0 -40
- {ragmint-0.2.3.data → ragmint-0.4.6.data}/data/LICENSE +0 -0
- {ragmint-0.2.3.dist-info → ragmint-0.4.6.dist-info}/WHEEL +0 -0
- {ragmint-0.2.3.dist-info → ragmint-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {ragmint-0.2.3.dist-info → ragmint-0.4.6.dist-info}/top_level.txt +0 -0
ragmint/autotuner.py
CHANGED
```diff
@@ -1,33 +1,217 @@
 """
 Auto-RAG Tuner
 --------------
-
-
+Automatically recommends and optimizes RAG configurations based on corpus statistics.
+Integrates with RAGMint to perform full end-to-end tuning.
 """

-
+import os
+import logging
+from statistics import mean
+from typing import Dict, Any, Tuple, List, Optional
+import random
+
+from sentence_transformers import SentenceTransformer
+from .tuner import RAGMint
+
+
+logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


 class AutoRAGTuner:
-
+    DEFAULT_EMBEDDINGS = "sentence-transformers/all-MiniLM-L6-v2"
+
+    def __init__(self, docs_path: str):
+        """
+        AutoRAGTuner automatically analyzes a corpus and runs an optimized RAG tuning pipeline.
+
+        Args:
+            docs_path (str): Path to the directory containing documents (.txt, .md, .rst)
         """
-
-
+        self.docs_path = docs_path
+        self.corpus_stats = self._analyze_corpus()
+
+    # -----------------------------
+    # Corpus Analysis
+    # -----------------------------
+    def _analyze_corpus(self) -> Dict[str, Any]:
+        """Compute corpus size, average length, and number of documents."""
+        docs = []
+        total_chars = 0
+        num_docs = 0
+
+        if not os.path.exists(self.docs_path):
+            logging.warning(f"⚠️ Corpus path not found: {self.docs_path}")
+            return {"size": 0, "avg_len": 0, "num_docs": 0}
+
+        for file in os.listdir(self.docs_path):
+            if file.endswith((".txt", ".md", ".rst")):
+                with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
+                    content = f.read()
+                    docs.append(content)
+                    total_chars += len(content)
+                    num_docs += 1
+
+        avg_len = int(mean([len(d) for d in docs])) if docs else 0
+        stats = {"size": total_chars, "avg_len": avg_len, "num_docs": num_docs}
+        logging.info(f"📊 Corpus stats: {stats}")
+        return stats
+
+    # -----------------------------
+    # Chunk Size Suggestion
+    # -----------------------------
+    def suggest_chunk_sizes(
+        self,
+        model_name: Optional[str] = None,
+        num_pairs: Optional[int] = None,
+        step: int = 10
+    ) -> List[Tuple[int, int]]:
+        if num_pairs is None:
+            raise ValueError("⚠️ You must specify the number of pairs you want (num_pairs).")
+
+        if model_name is None:
+            model_name = self.DEFAULT_EMBEDDINGS
+            logging.warning(f"⚠️ No embedding model provided. Using default: {model_name}")
+
+        model = SentenceTransformer(model_name)
+        max_tokens = getattr(model, "max_seq_length", 256)
+        approx_words = max(1, int(max_tokens * 0.75))
+        avg_len = self.corpus_stats.get("avg_len", 400)
+
+        max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
+
+        # Safe chunk and overlap ranges
+        chunk_sizes = list(range(50, max_chunk + 1, step))
+        overlaps = list(range(10, min(300, max_chunk // 2) + 1, step))
+        if not overlaps:
+            overlaps = [max(1, max_chunk // 4)]
+
+        candidates = [(c, o) for c in chunk_sizes for o in overlaps if o < c]
+
+        # Randomly sample requested number of pairs
+        if num_pairs >= len(candidates):
+            sampled = candidates
+        else:
+            sampled = random.sample(candidates, num_pairs)
+
+        logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {sampled}")
+        return sampled
+
+    # -----------------------------
+    # Recommendation Logic
+    # -----------------------------
+    def recommend(
+        self,
+        embedding_model: Optional[str] = None,
+        num_chunk_pairs: Optional[int] = 5
+    ) -> Dict[str, Any]:
         """
-
+        Recommend retriever, embedding, chunking, and strategy based on corpus stats.
+
+        Args:
+            embedding_model (str, optional): User-provided embedding model.
+            num_chunk_pairs (int, optional): Number of (chunk_size, overlap) pairs to generate.

-
+        Returns:
+            Dict[str, Any]: Recommended RAG configuration
+        """
         size = self.corpus_stats.get("size", 0)
         avg_len = self.corpus_stats.get("avg_len", 0)

-
-
-
-
+        # Determine retriever
+        if size <= 2000:
+            retriever = "BM25"
+            if embedding_model is None:
+                embedding_model = self.DEFAULT_EMBEDDINGS
+        elif size <= 10000:
+            retriever = "Chroma"
+            if embedding_model is None:
+                embedding_model = "sentence-transformers/paraphrase-MiniLM-L6-v2"
         else:
-
+            retriever = "FAISS"
+            if embedding_model is None:
+                embedding_model = "sentence-transformers/all-mpnet-base-v2"
+
+        if embedding_model is None:
+            embedding_model = self.DEFAULT_EMBEDDINGS
+            logging.warning(f"⚠️ Using default embedding model: {embedding_model}")
+
+        # Suggest chunk sizes
+        # Inside auto_tune, replace fixed chunk_sizes/overlaps with all candidates:
+        chunk_candidates = self.suggest_chunk_sizes(
+            model_name=embedding_model,
+            num_pairs=num_chunk_pairs
+        )
+
+        # Safety check
+        if not chunk_candidates:
+            raise RuntimeError("No chunk candidates generated.")
+
+        # Pick the first pair as default recommendation
+        chunk_size, overlap = chunk_candidates[0]
+
+        strategy = "fixed" if avg_len < 400 else "sentence"
+
+        recommendation = {
+            "retriever": retriever,
+            "embedding_model": embedding_model,
+            "chunk_size": chunk_size,
+            "overlap": overlap,
+            "strategy": strategy,
+            "chunk_candidates": chunk_candidates,
+        }
+
+        logging.info(f"🔮 AutoRAG Recommendation: {recommendation}")
+        return recommendation
+
+    # -----------------------------
+    # Full Auto-Tuning
+    # -----------------------------
+    def auto_tune(
+        self,
+        validation_set: str = None,
+        metric: str = "faithfulness",
+        trials: int = 5,
+        search_type: str = "random",
+        embedding_model: Optional[str] = None,
+        num_chunk_pairs: Optional[int] = 5
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Run a full automatic optimization using RAGMint.
+
+        Args:
+            validation_set (str): Path to validation set.
+            metric (str): Metric to optimize.
+            trials (int): Number of optimization trials.
+            search_type (str): Search strategy.
+            embedding_model (str, optional): User-provided embedding model.
+            num_chunk_pairs (int, optional): Number of chunk pairs to try.
+
+        Returns:
+            Tuple[Dict[str, Any], List[Dict[str, Any]]]: Best configuration and all trial results.
+        """
+        rec = self.recommend(embedding_model=embedding_model, num_chunk_pairs=num_chunk_pairs)
+
+        chunk_candidates = rec["chunk_candidates"]
+
+        logging.info("🚀 Launching full AutoRAG optimization with RAGMint")
+
+        tuner = RAGMint(
+            docs_path=self.docs_path,
+            retrievers=[rec["retriever"]],
+            embeddings=[rec["embedding_model"]],
+            rerankers=["mmr"],
+            chunk_sizes=[c[0] for c in chunk_candidates],
+            overlaps=[c[1] for c in chunk_candidates],
+            strategies=[rec["strategy"]],
+        )
+
+        best, results = tuner.optimize(
+            validation_set=validation_set,
+            metric=metric,
+            trials=trials,
+            search_type=search_type,
+        )

-
-
-        results = evaluate_config(config, validation_data)
-        return {"recommended": config, "results": results}
+        logging.info(f"🏁 AutoRAG tuning complete. Best: {best}")
+        return best, results
```
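For orientation, a minimal usage sketch of the `AutoRAGTuner` API introduced above; the corpus directory and validation-set path are placeholders, not files shipped with the wheel.

```python
# Hypothetical usage of the 0.4.6 AutoRAGTuner; "corpus/" and the validation
# path are placeholders.
from ragmint.autotuner import AutoRAGTuner

tuner = AutoRAGTuner("corpus/")             # corpus stats are computed on init
rec = tuner.recommend(num_chunk_pairs=5)    # heuristic config from corpus size
best, results = tuner.auto_tune(
    validation_set="validation_qa.json",    # placeholder path
    metric="faithfulness",
    trials=5,
    search_type="random",
)
```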
ragmint/core/chunking.py
CHANGED
```diff
@@ -1,18 +1,45 @@
 from typing import List
+import re
+
+try:
+    import tiktoken
+except ImportError:
+    tiktoken = None
+
+try:
+    import nltk
+    nltk.download("punkt", quiet=True)
+    from nltk.tokenize import sent_tokenize
+except ImportError:
+    sent_tokenize = None


 class Chunker:
     """
-    Handles text chunking
-    -
-    -
+    Handles text chunking strategies:
+    - fixed: character-based
+    - token: token-based (requires tiktoken)
+    - sentence: splits by full sentences (requires nltk)
     """

-    def __init__(self, chunk_size: int = 500, overlap: int = 100):
+    def __init__(self, chunk_size: int = 500, overlap: int = 100, strategy: str = "fixed"):
         self.chunk_size = chunk_size
         self.overlap = overlap
+        self.strategy = strategy

     def chunk_text(self, text: str) -> List[str]:
+        """Dispatches to the correct chunking strategy."""
+        if self.strategy == "token" and tiktoken:
+            return self._chunk_by_tokens(text)
+        elif self.strategy == "sentence" and sent_tokenize:
+            return self._chunk_by_sentences(text)
+        else:
+            return self._chunk_fixed(text)
+
+    # -------------------------------
+    # Fixed-length (default)
+    # -------------------------------
+    def _chunk_fixed(self, text: str) -> List[str]:
         chunks = []
         start = 0
         while start < len(text):
@@ -20,3 +47,40 @@ class Chunker:
             chunks.append(text[start:end])
             start += self.chunk_size - self.overlap
         return chunks
+
+    # -------------------------------
+    # Token-based (for LLM embedding)
+    # -------------------------------
+    def _chunk_by_tokens(self, text: str) -> List[str]:
+        if not tiktoken:
+            raise ImportError("tiktoken is required for token-based chunking.")
+        enc = tiktoken.get_encoding("cl100k_base")
+        tokens = enc.encode(text)
+
+        chunks = []
+        for i in range(0, len(tokens), self.chunk_size - self.overlap):
+            chunk_tokens = tokens[i:i + self.chunk_size]
+            chunks.append(enc.decode(chunk_tokens))
+        return chunks
+
+    # -------------------------------
+    # Sentence-based
+    # -------------------------------
+    def _chunk_by_sentences(self, text: str) -> List[str]:
+        if not sent_tokenize:
+            raise ImportError("nltk is required for sentence-based chunking.")
+        sentences = sent_tokenize(text)
+        chunks = []
+        current_chunk = ""
+
+        for sentence in sentences:
+            if len(current_chunk) + len(sentence) <= self.chunk_size:
+                current_chunk += " " + sentence
+            else:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
```
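A short sketch of the three strategies the reworked `Chunker` dispatches between. Note that `chunk_text()` silently falls back to fixed chunking when the optional dependency (`tiktoken` or `nltk`) is not installed, and that `chunk_size` counts characters for the fixed and sentence strategies but tokens for the token strategy.

```python
from ragmint.core.chunking import Chunker

text = "First sentence of the document. Second sentence. " * 40

# "token" requires tiktoken and "sentence" requires nltk; without the optional
# dependency, chunk_text() falls back to the fixed (character) strategy.
for strategy in ("fixed", "token", "sentence"):
    chunker = Chunker(chunk_size=200, overlap=20, strategy=strategy)
    print(strategy, len(chunker.chunk_text(text)))
```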
ragmint/core/embeddings.py
CHANGED
```diff
@@ -1,19 +1,55 @@
 import numpy as np
+from dotenv import load_dotenv

+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None

-
+
+class Embeddings:
     """
-    Wrapper for embedding backends (
+    Wrapper for embedding backends: HuggingFace (SentenceTransformers) or Dummy.
+
+    Example:
+        model = Embeddings("huggingface", model_name="all-MiniLM-L6-v2")
+        embeddings = model.encode(["example text"])
     """

-    def __init__(self, backend: str = "
-
+    def __init__(self, backend: str = "huggingface", model_name: str = None):
+        load_dotenv()
+        self.backend = backend.lower()
+        self.model_name = model_name or "all-MiniLM-L6-v2"
+
+        if self.backend == "huggingface":
+            if SentenceTransformer is None:
+                raise ImportError("Please install `sentence-transformers` to use HuggingFace embeddings.")
+            self.model = SentenceTransformer(self.model_name)
+            self.dim = self.model.get_sentence_embedding_dimension()
+
+        elif self.backend == "dummy":
+            self.model = None
+            self.dim = 768  # Default embedding dimension for dummy backend
+
+        else:
+            raise ValueError(f"Unsupported embedding backend: {backend}")

     def encode(self, texts):
-        if
-
-
-
-
+        if isinstance(texts, str):
+            texts = [texts]
+
+        if self.backend == "huggingface":
+            embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+
+        elif self.backend == "dummy":
+            # Return a NumPy array of shape (len(texts), dim)
+            embeddings = np.random.rand(len(texts), self.dim).astype(np.float32)
+
         else:
-
+            raise ValueError(f"Unknown embedding backend: {self.backend}")
+
+        # ✅ Always ensure NumPy array output
+        if not isinstance(embeddings, np.ndarray):
+            embeddings = np.array(embeddings, dtype=np.float32)
+
+        return embeddings
```
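The `dummy` backend makes the new `Embeddings` wrapper testable offline; a minimal sketch of both entry points, with the output shape taken from the code above.

```python
import numpy as np
from ragmint.core.embeddings import Embeddings

# The dummy backend needs no model download: it returns random float32
# vectors of shape (n_texts, 768), which suits unit tests.
emb = Embeddings(backend="dummy")
vecs = emb.encode(["hello", "world"])   # a bare string would be wrapped in a list
assert isinstance(vecs, np.ndarray) and vecs.shape == (2, 768)
```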
ragmint/core/evaluation.py
CHANGED
```diff
@@ -1,33 +1,53 @@
 import time
-from typing import Dict, Any
-
+from typing import Dict, Any, List
+import numpy as np
+from .embeddings import Embeddings


 class Evaluator:
     """
-
-    - Faithfulness
-    - Latency
+    Semantic evaluation of generated answers:
+    - Faithfulness: cosine similarity between answer and context embeddings
+    - Latency: time to compute embeddings and similarity
     """

-    def __init__(self):
-
+    def __init__(self, embeddings: Embeddings = None):
+        self.embeddings = embeddings or Embeddings()  # default to HuggingFace all-MiniLM-L6-v2

     def evaluate(self, query: str, answer: str, context: str) -> Dict[str, Any]:
         start = time.time()
-        faithfulness = self._similarity(answer, context)
-        latency = time.time() - start

+        # Compute embeddings
+        emb_answer = self.embeddings.encode(answer)
+        emb_context = self.embeddings.encode(context)
+
+        # Compute cosine similarity
+        faithfulness = self._cosine_similarity(emb_answer, emb_context)
+
+        faithfulness = np.clip(faithfulness, 0.0, 1.0)
+
+        latency = time.time() - start
         return {
             "faithfulness": faithfulness,
             "latency": latency,
         }

-
-
+    @staticmethod
+    def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+        # Ensure vectors are 1D
+        a = a.flatten()
+        b = b.flatten()
+        if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
+            return 0.0
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

-
-
+
+def evaluate_config(config: Dict[str, Any], validation_data: List[Dict[str, str]], embeddings: Embeddings = None) -> \
+        List[Dict[str, Any]]:
+    """
+    Evaluate a set of model outputs against validation data.
+    """
+    evaluator = Evaluator(embeddings=embeddings)
     results = []
     for sample in validation_data:
         query = sample.get("query", "")
@@ -35,4 +55,3 @@ def evaluate_config(config, validation_data):
         context = sample.get("context", "")
         results.append(evaluator.evaluate(query, answer, context))
     return results
-
```
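Since `Evaluator` now defaults to a HuggingFace model, passing a dummy `Embeddings` keeps a quick check offline; a sketch under that assumption:

```python
from ragmint.core.embeddings import Embeddings
from ragmint.core.evaluation import Evaluator

# With the dummy backend the score is random, but the call path
# (encode -> cosine similarity -> clip to [0, 1]) is the one shown above.
evaluator = Evaluator(embeddings=Embeddings(backend="dummy"))
scores = evaluator.evaluate(
    query="What is RAG?",
    answer="Retrieval-augmented generation grounds answers in retrieved text.",
    context="RAG retrieves documents and feeds them to a generator.",
)
print(scores)   # {"faithfulness": ..., "latency": ...}
```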
ragmint/core/pipeline.py
CHANGED
```diff
@@ -1,33 +1,57 @@
-from typing import Any, Dict,
+from typing import Any, Dict, Optional
 from .retriever import Retriever
 from .reranker import Reranker
 from .evaluation import Evaluator
+from .chunking import Chunker


 class RAGPipeline:
     """
     Core Retrieval-Augmented Generation pipeline.
-
+    Retrieves, reranks, and evaluates a query given the configured backends.
+    Supports text chunking for optimal retrieval performance.
     """

-    def __init__(
+    def __init__(
+        self,
+        retriever: Retriever,
+        reranker: Reranker,
+        evaluator: Evaluator,
+        chunk_size: int = 500,
+        overlap: int = 100,
+        chunking_strategy: str = "fixed"
+    ):
         self.retriever = retriever
         self.reranker = reranker
         self.evaluator = evaluator

-
+        # Initialize chunker for preprocessing
+        self.chunker = Chunker(chunk_size=chunk_size, overlap=overlap, strategy=chunking_strategy)
+
+    def preprocess_docs(self, documents):
+        """Applies the selected chunking strategy to the document set."""
+        all_chunks = []
+        for doc in documents:
+            chunks = self.chunker.chunk_text(doc)
+            all_chunks.extend(chunks)
+        return all_chunks
+
+    def run(self, query: str, top_k: int = 5, use_chunking: bool = True) -> Dict[str, Any]:
+        # Optional preprocessing step
+        if use_chunking and hasattr(self.retriever, "documents") and self.retriever.documents:
+            self.retriever.documents = self.preprocess_docs(self.retriever.documents)
+
         # Retrieve documents
         retrieved_docs = self.retriever.retrieve(query, top_k=top_k)
+
         # Rerank
         reranked_docs = self.reranker.rerank(query, retrieved_docs)

-        #
-        if reranked_docs
-            answer = reranked_docs[0]["text"]
-        else:
-            answer = ""
-
+        # Construct pseudo-answer
+        answer = reranked_docs[0]["text"] if reranked_docs else ""
         context = "\n".join([d["text"] for d in reranked_docs])
+
+        # Evaluate
         metrics = self.evaluator.evaluate(query, answer, context)

         return {
```