ragmint 0.2.1__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ragmint/app.py +512 -0
  2. ragmint/autotuner.py +201 -17
  3. ragmint/core/chunking.py +68 -4
  4. ragmint/core/embeddings.py +46 -10
  5. ragmint/core/evaluation.py +33 -14
  6. ragmint/core/pipeline.py +34 -10
  7. ragmint/core/retriever.py +152 -20
  8. ragmint/experiments/validation_qa.json +1 -14
  9. ragmint/explainer.py +47 -20
  10. ragmint/integrations/__init__.py +0 -0
  11. ragmint/integrations/config_adapter.py +96 -0
  12. ragmint/integrations/langchain_prebuilder.py +99 -0
  13. ragmint/leaderboard.py +41 -35
  14. ragmint/qa_generator.py +190 -0
  15. ragmint/tests/test_autotuner.py +52 -30
  16. ragmint/tests/test_config_adapter.py +39 -0
  17. ragmint/tests/test_embeddings.py +46 -0
  18. ragmint/tests/test_explainer.py +28 -12
  19. ragmint/tests/test_integration_autotuner_ragmint.py +39 -52
  20. ragmint/tests/test_langchain_prebuilder.py +82 -0
  21. ragmint/tests/test_leaderboard.py +78 -25
  22. ragmint/tests/test_pipeline.py +3 -2
  23. ragmint/tests/test_qa_generator.py +66 -0
  24. ragmint/tests/test_retriever.py +3 -2
  25. ragmint/tests/test_tuner.py +1 -1
  26. ragmint/tuner.py +109 -22
  27. ragmint-0.4.6.data/data/README.md +485 -0
  28. ragmint-0.4.6.dist-info/METADATA +530 -0
  29. ragmint-0.4.6.dist-info/RECORD +48 -0
  30. ragmint-0.4.6.dist-info/licenses/LICENSE +19 -0
  31. ragmint/tests/test_explainer_integration.py +0 -18
  32. ragmint-0.2.1.dist-info/METADATA +0 -27
  33. ragmint-0.2.1.dist-info/RECORD +0 -38
  34. {ragmint-0.2.1.dist-info/licenses → ragmint-0.4.6.data/data}/LICENSE +0 -0
  35. {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/WHEEL +0 -0
  36. {ragmint-0.2.1.dist-info → ragmint-0.4.6.dist-info}/top_level.txt +0 -0
ragmint/autotuner.py CHANGED
@@ -1,33 +1,217 @@
 """
 Auto-RAG Tuner
 --------------
-Recommends retriever–embedding pairs dynamically based on corpus size
-and dataset characteristics. Integrates seamlessly with RAGMint evaluator.
+Automatically recommends and optimizes RAG configurations based on corpus statistics.
+Integrates with RAGMint to perform full end-to-end tuning.
 """

-from .core.evaluation import evaluate_config
+import os
+import logging
+from statistics import mean
+from typing import Dict, Any, Tuple, List, Optional
+import random
+
+from sentence_transformers import SentenceTransformer
+from .tuner import RAGMint
+
+
+logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


 class AutoRAGTuner:
-    def __init__(self, corpus_stats: dict):
+    DEFAULT_EMBEDDINGS = "sentence-transformers/all-MiniLM-L6-v2"
+
+    def __init__(self, docs_path: str):
+        """
+        AutoRAGTuner automatically analyzes a corpus and runs an optimized RAG tuning pipeline.
+
+        Args:
+            docs_path (str): Path to the directory containing documents (.txt, .md, .rst)
         """
-        corpus_stats: dict
-        Example: {'size': 12000, 'avg_len': 240}
+        self.docs_path = docs_path
+        self.corpus_stats = self._analyze_corpus()
+
+    # -----------------------------
+    # Corpus Analysis
+    # -----------------------------
+    def _analyze_corpus(self) -> Dict[str, Any]:
+        """Compute corpus size, average length, and number of documents."""
+        docs = []
+        total_chars = 0
+        num_docs = 0
+
+        if not os.path.exists(self.docs_path):
+            logging.warning(f"⚠️ Corpus path not found: {self.docs_path}")
+            return {"size": 0, "avg_len": 0, "num_docs": 0}
+
+        for file in os.listdir(self.docs_path):
+            if file.endswith((".txt", ".md", ".rst")):
+                with open(os.path.join(self.docs_path, file), "r", encoding="utf-8") as f:
+                    content = f.read()
+                    docs.append(content)
+                    total_chars += len(content)
+                    num_docs += 1
+
+        avg_len = int(mean([len(d) for d in docs])) if docs else 0
+        stats = {"size": total_chars, "avg_len": avg_len, "num_docs": num_docs}
+        logging.info(f"📊 Corpus stats: {stats}")
+        return stats
+
+    # -----------------------------
+    # Chunk Size Suggestion
+    # -----------------------------
+    def suggest_chunk_sizes(
+        self,
+        model_name: Optional[str] = None,
+        num_pairs: Optional[int] = None,
+        step: int = 10
+    ) -> List[Tuple[int, int]]:
+        if num_pairs is None:
+            raise ValueError("⚠️ You must specify the number of pairs you want (num_pairs).")
+
+        if model_name is None:
+            model_name = self.DEFAULT_EMBEDDINGS
+            logging.warning(f"⚠️ No embedding model provided. Using default: {model_name}")
+
+        model = SentenceTransformer(model_name)
+        max_tokens = getattr(model, "max_seq_length", 256)
+        approx_words = max(1, int(max_tokens * 0.75))
+        avg_len = self.corpus_stats.get("avg_len", 400)
+
+        max_chunk = max(50, min(approx_words, max(avg_len * 2, 50)))
+
+        # Safe chunk and overlap ranges
+        chunk_sizes = list(range(50, max_chunk + 1, step))
+        overlaps = list(range(10, min(300, max_chunk // 2) + 1, step))
+        if not overlaps:
+            overlaps = [max(1, max_chunk // 4)]
+
+        candidates = [(c, o) for c in chunk_sizes for o in overlaps if o < c]
+
+        # Randomly sample requested number of pairs
+        if num_pairs >= len(candidates):
+            sampled = candidates
+        else:
+            sampled = random.sample(candidates, num_pairs)
+
+        logging.info(f"📦 Suggested {num_pairs} (chunk_size, overlap) pairs: {sampled}")
+        return sampled
+
+    # -----------------------------
+    # Recommendation Logic
+    # -----------------------------
+    def recommend(
+        self,
+        embedding_model: Optional[str] = None,
+        num_chunk_pairs: Optional[int] = 5
+    ) -> Dict[str, Any]:
         """
-        self.corpus_stats = corpus_stats
+        Recommend retriever, embedding, chunking, and strategy based on corpus stats.
+
+        Args:
+            embedding_model (str, optional): User-provided embedding model.
+            num_chunk_pairs (int, optional): Number of (chunk_size, overlap) pairs to generate.

-    def recommend(self):
+        Returns:
+            Dict[str, Any]: Recommended RAG configuration
+        """
         size = self.corpus_stats.get("size", 0)
         avg_len = self.corpus_stats.get("avg_len", 0)

-        if size < 1000:
-            return {"retriever": "BM25", "embedding_model": "OpenAI"}
-        elif size < 10000:
-            return {"retriever": "Chroma", "embedding_model": "SentenceTransformers"}
+        # Determine retriever
+        if size <= 2000:
+            retriever = "BM25"
+            if embedding_model is None:
+                embedding_model = self.DEFAULT_EMBEDDINGS
+        elif size <= 10000:
+            retriever = "Chroma"
+            if embedding_model is None:
+                embedding_model = "sentence-transformers/paraphrase-MiniLM-L6-v2"
         else:
-            return {"retriever": "FAISS", "embedding_model": "InstructorXL"}
+            retriever = "FAISS"
+            if embedding_model is None:
+                embedding_model = "sentence-transformers/all-mpnet-base-v2"
+
+        if embedding_model is None:
+            embedding_model = self.DEFAULT_EMBEDDINGS
+            logging.warning(f"⚠️ Using default embedding model: {embedding_model}")
+
+        # Suggest chunk sizes
+        # Inside auto_tune, replace fixed chunk_sizes/overlaps with all candidates:
+        chunk_candidates = self.suggest_chunk_sizes(
+            model_name=embedding_model,
+            num_pairs=num_chunk_pairs
+        )
+
+        # Safety check
+        if not chunk_candidates:
+            raise RuntimeError("No chunk candidates generated.")
+
+        # Pick the first pair as default recommendation
+        chunk_size, overlap = chunk_candidates[0]
+
+        strategy = "fixed" if avg_len < 400 else "sentence"
+
+        recommendation = {
+            "retriever": retriever,
+            "embedding_model": embedding_model,
+            "chunk_size": chunk_size,
+            "overlap": overlap,
+            "strategy": strategy,
+            "chunk_candidates": chunk_candidates,
+        }
+
+        logging.info(f"🔮 AutoRAG Recommendation: {recommendation}")
+        return recommendation
+
+    # -----------------------------
+    # Full Auto-Tuning
+    # -----------------------------
+    def auto_tune(
+        self,
+        validation_set: str = None,
+        metric: str = "faithfulness",
+        trials: int = 5,
+        search_type: str = "random",
+        embedding_model: Optional[str] = None,
+        num_chunk_pairs: Optional[int] = 5
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Run a full automatic optimization using RAGMint.
+
+        Args:
+            validation_set (str): Path to validation set.
+            metric (str): Metric to optimize.
+            trials (int): Number of optimization trials.
+            search_type (str): Search strategy.
+            embedding_model (str, optional): User-provided embedding model.
+            num_chunk_pairs (int, optional): Number of chunk pairs to try.
+
+        Returns:
+            Tuple[Dict[str, Any], List[Dict[str, Any]]]: Best configuration and all trial results.
+        """
+        rec = self.recommend(embedding_model=embedding_model, num_chunk_pairs=num_chunk_pairs)
+
+        chunk_candidates = rec["chunk_candidates"]
+
+        logging.info("🚀 Launching full AutoRAG optimization with RAGMint")
+
+        tuner = RAGMint(
+            docs_path=self.docs_path,
+            retrievers=[rec["retriever"]],
+            embeddings=[rec["embedding_model"]],
+            rerankers=["mmr"],
+            chunk_sizes=[c[0] for c in chunk_candidates],
+            overlaps=[c[1] for c in chunk_candidates],
+            strategies=[rec["strategy"]],
+        )
+
+        best, results = tuner.optimize(
+            validation_set=validation_set,
+            metric=metric,
+            trials=trials,
+            search_type=search_type,
+        )

-    def auto_tune(self, validation_data):
-        config = self.recommend()
-        results = evaluate_config(config, validation_data)
-        return {"recommended": config, "results": results}
+        logging.info(f"🏁 AutoRAG tuning complete. Best: {best}")
+        return best, results
ragmint/core/chunking.py CHANGED
@@ -1,18 +1,45 @@
 from typing import List
+import re
+
+try:
+    import tiktoken
+except ImportError:
+    tiktoken = None
+
+try:
+    import nltk
+    nltk.download("punkt", quiet=True)
+    from nltk.tokenize import sent_tokenize
+except ImportError:
+    sent_tokenize = None


 class Chunker:
     """
-    Handles text chunking and splitting strategies:
-    - Fixed size chunks
-    - Overlapping windows
+    Handles text chunking strategies:
+    - fixed: character-based
+    - token: token-based (requires tiktoken)
+    - sentence: splits by full sentences (requires nltk)
     """

-    def __init__(self, chunk_size: int = 500, overlap: int = 100):
+    def __init__(self, chunk_size: int = 500, overlap: int = 100, strategy: str = "fixed"):
         self.chunk_size = chunk_size
         self.overlap = overlap
+        self.strategy = strategy

     def chunk_text(self, text: str) -> List[str]:
+        """Dispatches to the correct chunking strategy."""
+        if self.strategy == "token" and tiktoken:
+            return self._chunk_by_tokens(text)
+        elif self.strategy == "sentence" and sent_tokenize:
+            return self._chunk_by_sentences(text)
+        else:
+            return self._chunk_fixed(text)
+
+    # -------------------------------
+    # Fixed-length (default)
+    # -------------------------------
+    def _chunk_fixed(self, text: str) -> List[str]:
         chunks = []
         start = 0
         while start < len(text):
@@ -20,3 +47,40 @@ class Chunker:
             chunks.append(text[start:end])
             start += self.chunk_size - self.overlap
         return chunks
+
+    # -------------------------------
+    # Token-based (for LLM embedding)
+    # -------------------------------
+    def _chunk_by_tokens(self, text: str) -> List[str]:
+        if not tiktoken:
+            raise ImportError("tiktoken is required for token-based chunking.")
+        enc = tiktoken.get_encoding("cl100k_base")
+        tokens = enc.encode(text)
+
+        chunks = []
+        for i in range(0, len(tokens), self.chunk_size - self.overlap):
+            chunk_tokens = tokens[i:i + self.chunk_size]
+            chunks.append(enc.decode(chunk_tokens))
+        return chunks
+
+    # -------------------------------
+    # Sentence-based
+    # -------------------------------
+    def _chunk_by_sentences(self, text: str) -> List[str]:
+        if not sent_tokenize:
+            raise ImportError("nltk is required for sentence-based chunking.")
+        sentences = sent_tokenize(text)
+        chunks = []
+        current_chunk = ""
+
+        for sentence in sentences:
+            if len(current_chunk) + len(sentence) <= self.chunk_size:
+                current_chunk += " " + sentence
+            else:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
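A small sketch of how the three chunking strategies added here are selected; the sample text is invented. `chunk_text()` quietly falls back to the fixed strategy when tiktoken or nltk is missing, since the dispatch also checks that the optional import succeeded:

```python
from ragmint.core.chunking import Chunker

text = (
    "RAGMint tunes retrieval pipelines. It compares chunk sizes, embeddings, "
    "and retrievers against a validation set."
)

fixed = Chunker(chunk_size=60, overlap=10, strategy="fixed").chunk_text(text)        # character windows
sentences = Chunker(chunk_size=80, overlap=0, strategy="sentence").chunk_text(text)  # requires nltk
tokens = Chunker(chunk_size=20, overlap=5, strategy="token").chunk_text(text)        # requires tiktoken

print(len(fixed), len(sentences), len(tokens))
```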
ragmint/core/embeddings.py CHANGED
@@ -1,19 +1,55 @@
 import numpy as np
+from dotenv import load_dotenv

+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None

-class EmbeddingModel:
+
+class Embeddings:
     """
-    Wrapper for embedding backends (OpenAI, HuggingFace, etc.)
+    Wrapper for embedding backends: HuggingFace (SentenceTransformers) or Dummy.
+
+    Example:
+        model = Embeddings("huggingface", model_name="all-MiniLM-L6-v2")
+        embeddings = model.encode(["example text"])
     """

-    def __init__(self, backend: str = "dummy"):
-        self.backend = backend
+    def __init__(self, backend: str = "huggingface", model_name: str = None):
+        load_dotenv()
+        self.backend = backend.lower()
+        self.model_name = model_name or "all-MiniLM-L6-v2"
+
+        if self.backend == "huggingface":
+            if SentenceTransformer is None:
+                raise ImportError("Please install `sentence-transformers` to use HuggingFace embeddings.")
+            self.model = SentenceTransformer(self.model_name)
+            self.dim = self.model.get_sentence_embedding_dimension()
+
+        elif self.backend == "dummy":
+            self.model = None
+            self.dim = 768  # Default embedding dimension for dummy backend
+
+        else:
+            raise ValueError(f"Unsupported embedding backend: {backend}")

     def encode(self, texts):
-        if self.backend == "openai":
-            # Example placeholder — integrate with actual OpenAI API
-            return [np.random.rand(768) for _ in texts]
-        elif self.backend == "huggingface":
-            return [np.random.rand(768) for _ in texts]
+        if isinstance(texts, str):
+            texts = [texts]
+
+        if self.backend == "huggingface":
+            embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+
+        elif self.backend == "dummy":
+            # Return a NumPy array of shape (len(texts), dim)
+            embeddings = np.random.rand(len(texts), self.dim).astype(np.float32)
+
         else:
-            return [np.random.rand(768) for _ in texts]
+            raise ValueError(f"Unknown embedding backend: {self.backend}")
+
+        # ✅ Always ensure NumPy array output
+        if not isinstance(embeddings, np.ndarray):
+            embeddings = np.array(embeddings, dtype=np.float32)
+
+        return embeddings
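A short sketch of the renamed `Embeddings` wrapper (formerly `EmbeddingModel`). The model names are the defaults from the diff; `encode()` now always returns a NumPy array, even for a single string:

```python
from ragmint.core.embeddings import Embeddings

# Real sentence-transformers backend (downloads the model on first use).
hf = Embeddings(backend="huggingface", model_name="all-MiniLM-L6-v2")
vecs = hf.encode(["What does RAGMint optimize?", "Chunk size and retriever choice."])
print(vecs.shape)  # (2, hf.dim)

# Dummy backend: no model download, random 768-dimensional vectors, handy in tests.
dummy = Embeddings(backend="dummy")
print(dummy.encode("hello").shape)  # (1, 768)
```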
ragmint/core/evaluation.py CHANGED
@@ -1,33 +1,53 @@
 import time
-from typing import Dict, Any
-from difflib import SequenceMatcher
+from typing import Dict, Any, List
+import numpy as np
+from .embeddings import Embeddings


 class Evaluator:
     """
-    Simple evaluation of generated answers:
-    - Faithfulness (similarity between answer and context)
-    - Latency
+    Semantic evaluation of generated answers:
+    - Faithfulness: cosine similarity between answer and context embeddings
+    - Latency: time to compute embeddings and similarity
     """

-    def __init__(self):
-        pass
+    def __init__(self, embeddings: Embeddings = None):
+        self.embeddings = embeddings or Embeddings()  # default to HuggingFace all-MiniLM-L6-v2

     def evaluate(self, query: str, answer: str, context: str) -> Dict[str, Any]:
         start = time.time()
-        faithfulness = self._similarity(answer, context)
-        latency = time.time() - start

+        # Compute embeddings
+        emb_answer = self.embeddings.encode(answer)
+        emb_context = self.embeddings.encode(context)
+
+        # Compute cosine similarity
+        faithfulness = self._cosine_similarity(emb_answer, emb_context)
+
+        faithfulness = np.clip(faithfulness, 0.0, 1.0)
+
+        latency = time.time() - start
         return {
             "faithfulness": faithfulness,
             "latency": latency,
         }

-    def _similarity(self, a: str, b: str) -> float:
-        return SequenceMatcher(None, a, b).ratio()
+    @staticmethod
+    def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+        # Ensure vectors are 1D
+        a = a.flatten()
+        b = b.flatten()
+        if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
+            return 0.0
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

-def evaluate_config(config, validation_data):
-    evaluator = Evaluator()
+
+def evaluate_config(config: Dict[str, Any], validation_data: List[Dict[str, str]], embeddings: Embeddings = None) -> \
+        List[Dict[str, Any]]:
+    """
+    Evaluate a set of model outputs against validation data.
+    """
+    evaluator = Evaluator(embeddings=embeddings)
     results = []
     for sample in validation_data:
         query = sample.get("query", "")
@@ -35,4 +55,3 @@ def evaluate_config(config, validation_data):
         context = sample.get("context", "")
         results.append(evaluator.evaluate(query, answer, context))
     return results
-
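A sketch of the new embedding-based faithfulness scoring. The dummy backend is used so the example runs without downloading a model (its score is meaningless, but the call flow matches the code above); with the default HuggingFace backend the score is the clipped cosine similarity between answer and context embeddings:

```python
from ragmint.core.embeddings import Embeddings
from ragmint.core.evaluation import Evaluator

evaluator = Evaluator(embeddings=Embeddings(backend="dummy"))
metrics = evaluator.evaluate(
    query="Which retriever does AutoRAGTuner pick for small corpora?",
    answer="BM25 for corpora up to roughly 2000 characters.",
    context="AutoRAGTuner selects BM25 when the corpus size is at most 2000 characters.",
)
print(metrics)  # {'faithfulness': <0..1>, 'latency': <seconds>}
```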
ragmint/core/pipeline.py CHANGED
@@ -1,33 +1,57 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, Optional
 from .retriever import Retriever
 from .reranker import Reranker
 from .evaluation import Evaluator
+from .chunking import Chunker


 class RAGPipeline:
     """
     Core Retrieval-Augmented Generation pipeline.
-    Simplified (no generator). It retrieves, reranks, and evaluates.
+    Retrieves, reranks, and evaluates a query given the configured backends.
+    Supports text chunking for optimal retrieval performance.
     """

-    def __init__(self, retriever: Retriever, reranker: Reranker, evaluator: Evaluator):
+    def __init__(
+        self,
+        retriever: Retriever,
+        reranker: Reranker,
+        evaluator: Evaluator,
+        chunk_size: int = 500,
+        overlap: int = 100,
+        chunking_strategy: str = "fixed"
+    ):
         self.retriever = retriever
         self.reranker = reranker
         self.evaluator = evaluator

-    def run(self, query: str, top_k: int = 5) -> Dict[str, Any]:
+        # Initialize chunker for preprocessing
+        self.chunker = Chunker(chunk_size=chunk_size, overlap=overlap, strategy=chunking_strategy)
+
+    def preprocess_docs(self, documents):
+        """Applies the selected chunking strategy to the document set."""
+        all_chunks = []
+        for doc in documents:
+            chunks = self.chunker.chunk_text(doc)
+            all_chunks.extend(chunks)
+        return all_chunks
+
+    def run(self, query: str, top_k: int = 5, use_chunking: bool = True) -> Dict[str, Any]:
+        # Optional preprocessing step
+        if use_chunking and hasattr(self.retriever, "documents") and self.retriever.documents:
+            self.retriever.documents = self.preprocess_docs(self.retriever.documents)
+
         # Retrieve documents
         retrieved_docs = self.retriever.retrieve(query, top_k=top_k)
+
         # Rerank
         reranked_docs = self.reranker.rerank(query, retrieved_docs)

-        # Use top document as pseudo-answer
-        if reranked_docs:
-            answer = reranked_docs[0]["text"]
-        else:
-            answer = ""
-
+        # Construct pseudo-answer
+        answer = reranked_docs[0]["text"] if reranked_docs else ""
         context = "\n".join([d["text"] for d in reranked_docs])
+
+        # Evaluate
         metrics = self.evaluator.evaluate(query, answer, context)

         return {