ragmint 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ragmint might be problematic; see the registry's advisory page for more details.

Files changed (35)
  1. {ragmint-0.1.0/src/ragmint.egg-info → ragmint-0.1.1}/PKG-INFO +34 -1
  2. {ragmint-0.1.0 → ragmint-0.1.1}/README.md +33 -0
  3. {ragmint-0.1.0 → ragmint-0.1.1}/pyproject.toml +1 -1
  4. ragmint-0.1.1/src/ragmint/tests/test_tuner.py +71 -0
  5. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/tuner.py +1 -1
  6. ragmint-0.1.1/src/ragmint/utils/data_loader.py +65 -0
  7. {ragmint-0.1.0 → ragmint-0.1.1/src/ragmint.egg-info}/PKG-INFO +34 -1
  8. ragmint-0.1.0/src/ragmint/tests/test_tuner.py +0 -38
  9. ragmint-0.1.0/src/ragmint/utils/data_loader.py +0 -35
  10. {ragmint-0.1.0 → ragmint-0.1.1}/LICENSE +0 -0
  11. {ragmint-0.1.0 → ragmint-0.1.1}/setup.cfg +0 -0
  12. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/__init__.py +0 -0
  13. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/__main__.py +0 -0
  14. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/__init__.py +0 -0
  15. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/chunking.py +0 -0
  16. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/embeddings.py +0 -0
  17. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/evaluation.py +0 -0
  18. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/pipeline.py +0 -0
  19. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/reranker.py +0 -0
  20. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/core/retriever.py +0 -0
  21. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/experiments/__init__.py +0 -0
  22. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/optimization/__init__.py +0 -0
  23. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/optimization/search.py +0 -0
  24. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/tests/__init__.py +0 -0
  25. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/tests/test_pipeline.py +0 -0
  26. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/tests/test_retriever.py +0 -0
  27. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/tests/test_search.py +0 -0
  28. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/utils/__init__.py +0 -0
  29. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/utils/caching.py +0 -0
  30. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/utils/logger.py +0 -0
  31. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint/utils/metrics.py +0 -0
  32. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint.egg-info/SOURCES.txt +0 -0
  33. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint.egg-info/dependency_links.txt +0 -0
  34. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint.egg-info/requires.txt +0 -0
  35. {ragmint-0.1.0 → ragmint-0.1.1}/src/ragmint.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragmint
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
5
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
6
  License: Apache License 2.0
@@ -101,6 +101,39 @@ result = pipeline.run("What is retrieval-augmented generation?")
101
101
  print(result)
102
102
  ```
103
103
 
104
+ ---
105
+ ## 🧪 Dataset Options
106
+
107
+ Ragmint can automatically load evaluation datasets for your RAG pipeline:
108
+
109
+ | Mode | Example | Description |
110
+ |------|----------|-------------|
111
+ | 🧱 **Default** | `validation_set=None` | Uses built-in `experiments/validation_qa.json` |
112
+ | 📁 **Custom File** | `validation_set="data/my_eval.json"` | Load your own QA dataset (JSON or CSV) |
113
+ | 🌐 **Hugging Face Dataset** | `validation_set="squad"` | Automatically downloads benchmark datasets (requires `pip install datasets`) |
114
+
115
+ ### Example
116
+
117
+ ```python
118
+ from ragmint.tuner import RAGMint
119
+
120
+ ragmint = RAGMint(
121
+ docs_path="data/docs/",
122
+ retrievers=["faiss", "chroma"],
123
+ embeddings=["text-embedding-3-small"],
124
+ rerankers=["mmr"],
125
+ )
126
+
127
+ # Use built-in default
128
+ ragmint.optimize(validation_set=None)
129
+
130
+ # Use Hugging Face benchmark
131
+ ragmint.optimize(validation_set="squad")
132
+
133
+ # Use your own dataset
134
+ ragmint.optimize(validation_set="data/custom_qa.json")
135
+ ```
136
+
104
137
  ---
105
138
 
106
139
  ## 🧩 Folder Structure
@@ -75,6 +75,39 @@ result = pipeline.run("What is retrieval-augmented generation?")
75
75
  print(result)
76
76
  ```
77
77
 
78
+ ---
79
+ ## 🧪 Dataset Options
80
+
81
+ Ragmint can automatically load evaluation datasets for your RAG pipeline:
82
+
83
+ | Mode | Example | Description |
84
+ |------|----------|-------------|
85
+ | 🧱 **Default** | `validation_set=None` | Uses built-in `experiments/validation_qa.json` |
86
+ | 📁 **Custom File** | `validation_set="data/my_eval.json"` | Load your own QA dataset (JSON or CSV) |
87
+ | 🌐 **Hugging Face Dataset** | `validation_set="squad"` | Automatically downloads benchmark datasets (requires `pip install datasets`) |
88
+
89
+ ### Example
90
+
91
+ ```python
92
+ from ragmint.tuner import RAGMint
93
+
94
+ ragmint = RAGMint(
95
+ docs_path="data/docs/",
96
+ retrievers=["faiss", "chroma"],
97
+ embeddings=["text-embedding-3-small"],
98
+ rerankers=["mmr"],
99
+ )
100
+
101
+ # Use built-in default
102
+ ragmint.optimize(validation_set=None)
103
+
104
+ # Use Hugging Face benchmark
105
+ ragmint.optimize(validation_set="squad")
106
+
107
+ # Use your own dataset
108
+ ragmint.optimize(validation_set="data/custom_qa.json")
109
+ ```
110
+
78
111
  ---
79
112
 
80
113
  ## 🧩 Folder Structure
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ragmint"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "A modular framework for evaluating and optimizing RAG pipelines."
9
9
  readme = "README.md"
10
10
  license = { text = "Apache License 2.0" }
@@ -0,0 +1,71 @@
1
+ import os
2
+ import json
3
+ import pytest
4
+ from ragmint.tuner import RAGMint
5
+
6
+
7
+ def setup_validation_file(tmp_path):
8
+ """Create a temporary validation QA dataset."""
9
+ data = [
10
+ {"question": "What is AI?", "answer": "Artificial Intelligence"},
11
+ {"question": "Define ML", "answer": "Machine Learning"}
12
+ ]
13
+ file = tmp_path / "validation_qa.json"
14
+ with open(file, "w", encoding="utf-8") as f:
15
+ json.dump(data, f)
16
+ return str(file)
17
+
18
+
19
+ def setup_docs(tmp_path):
20
+ """Create a small document corpus for testing."""
21
+ corpus = tmp_path / "corpus"
22
+ corpus.mkdir()
23
+ (corpus / "doc1.txt").write_text("This is about Artificial Intelligence.")
24
+ (corpus / "doc2.txt").write_text("This text explains Machine Learning.")
25
+ return str(corpus)
26
+
27
+
28
+ @pytest.mark.parametrize("validation_mode", [
29
+ None, # Built-in dataset
30
+ "data/custom_eval.json", # Custom dataset path (mocked below)
31
+ ])
32
+ def test_optimize_ragmint(tmp_path, validation_mode, monkeypatch):
33
+ """Test RAGMint.optimize() with different dataset modes."""
34
+ docs_path = setup_docs(tmp_path)
35
+ val_file = setup_validation_file(tmp_path)
36
+
37
+ # If using custom dataset, mock the path
38
+ if validation_mode and "custom_eval" in validation_mode:
39
+ custom_path = tmp_path / "custom_eval.json"
40
+ os.rename(val_file, custom_path)
41
+ validation_mode = str(custom_path)
42
+
43
+ metric = "faithfulness"
44
+
45
+ # Initialize RAGMint
46
+ rag = RAGMint(
47
+ docs_path=docs_path,
48
+ retrievers=["faiss"],
49
+ embeddings=["text-embedding-3-small"],
50
+ rerankers=["mmr"]
51
+ )
52
+
53
+ # Run optimization
54
+ best, results = rag.optimize(
55
+ validation_set=validation_mode,
56
+ metric=metric,
57
+ trials=2
58
+ )
59
+
60
+ # Validate results
61
+ assert isinstance(best, dict), "Best config should be a dict"
62
+ assert isinstance(results, list), "Results should be a list of trials"
63
+ assert len(results) > 0, "Optimization should produce results"
64
+
65
+ # The best result can expose either 'score' or the metric name (e.g. 'faithfulness')
66
+ assert any(k in best for k in ("score", metric)), \
67
+ f"Best config should include either 'score' or '{metric}'"
68
+
69
+ # Ensure the metric value is valid
70
+ assert best.get(metric, best.get("score")) >= 0, \
71
+ f"{metric} score should be non-negative"
@@ -90,7 +90,7 @@ class RAGMint:
90
90
  search_type: str = "random",
91
91
  trials: int = 10,
92
92
  ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
93
- validation = load_validation_set(validation_set)
93
+ validation = load_validation_set(validation_set or "default")
94
94
 
95
95
  search_space = {
96
96
  "retriever": self.retrievers,
@@ -0,0 +1,65 @@
1
+ import json
2
+ import csv
3
+ from typing import List, Dict
4
+ from pathlib import Path
5
+ import os
6
+
7
+ try:
8
+ from datasets import load_dataset
9
+ except ImportError:
10
+ load_dataset = None # optional dependency
11
+
12
+ DEFAULT_VALIDATION_PATH = Path(__file__).parent.parent / "experiments" / "validation_qa.json"
13
+
14
+
15
+ def load_json(path: str) -> List[Dict]:
16
+ with open(path, "r", encoding="utf-8") as f:
17
+ return json.load(f)
18
+
19
+
20
+ def load_csv(path: str) -> List[Dict]:
21
+ with open(path, newline="", encoding="utf-8") as csvfile:
22
+ reader = csv.DictReader(csvfile)
23
+ return list(reader)
24
+
25
+
26
+ def save_json(path: str, data: Dict):
27
+ with open(path, "w", encoding="utf-8") as f:
28
+ json.dump(data, f, ensure_ascii=False, indent=2)
29
+
30
+ def load_validation_set(path: str | None = None) -> List[Dict]:
31
+ """
32
+ Loads a validation dataset (QA pairs) from:
33
+ - Built-in default JSON file
34
+ - User-provided JSON or CSV
35
+ - Hugging Face dataset by name
36
+ """
37
+ # Default behavior
38
+ if path is None or path == "default":
39
+ if not DEFAULT_VALIDATION_PATH.exists():
40
+ raise FileNotFoundError(f"Default validation set not found at {DEFAULT_VALIDATION_PATH}")
41
+ return load_json(DEFAULT_VALIDATION_PATH)
42
+
43
+ # Hugging Face dataset
44
+ if not os.path.exists(path) and load_dataset:
45
+ try:
46
+ dataset = load_dataset(path, split="validation")
47
+ data = [
48
+ {"question": q, "answer": a}
49
+ for q, a in zip(dataset["question"], dataset["answers"])
50
+ ]
51
+ return data
52
+ except Exception:
53
+ pass # fall through to file loading
54
+
55
+ # Local file
56
+ p = Path(path)
57
+ if not p.exists():
58
+ raise FileNotFoundError(f"Validation file not found: {path}")
59
+
60
+ if p.suffix.lower() == ".json":
61
+ return load_json(path)
62
+ elif p.suffix.lower() in [".csv", ".tsv"]:
63
+ return load_csv(path)
64
+ else:
65
+ raise ValueError("Unsupported validation set format. Use JSON, CSV, or a Hugging Face dataset name.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragmint
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: A modular framework for evaluating and optimizing RAG pipelines.
5
5
  Author-email: Andre Oliveira <oandreoliveira@outlook.com>
6
6
  License: Apache License 2.0
@@ -101,6 +101,39 @@ result = pipeline.run("What is retrieval-augmented generation?")
101
101
  print(result)
102
102
  ```
103
103
 
104
+ ---
105
+ ## 🧪 Dataset Options
106
+
107
+ Ragmint can automatically load evaluation datasets for your RAG pipeline:
108
+
109
+ | Mode | Example | Description |
110
+ |------|----------|-------------|
111
+ | 🧱 **Default** | `validation_set=None` | Uses built-in `experiments/validation_qa.json` |
112
+ | 📁 **Custom File** | `validation_set="data/my_eval.json"` | Load your own QA dataset (JSON or CSV) |
113
+ | 🌐 **Hugging Face Dataset** | `validation_set="squad"` | Automatically downloads benchmark datasets (requires `pip install datasets`) |
114
+
115
+ ### Example
116
+
117
+ ```python
118
+ from ragmint.tuner import RAGMint
119
+
120
+ ragmint = RAGMint(
121
+ docs_path="data/docs/",
122
+ retrievers=["faiss", "chroma"],
123
+ embeddings=["text-embedding-3-small"],
124
+ rerankers=["mmr"],
125
+ )
126
+
127
+ # Use built-in default
128
+ ragmint.optimize(validation_set=None)
129
+
130
+ # Use Hugging Face benchmark
131
+ ragmint.optimize(validation_set="squad")
132
+
133
+ # Use your own dataset
134
+ ragmint.optimize(validation_set="data/custom_qa.json")
135
+ ```
136
+
104
137
  ---
105
138
 
106
139
  ## 🧩 Folder Structure
@@ -1,38 +0,0 @@
1
- import os
2
- import json
3
- from ragmint.tuner import RAGMint
4
-
5
-
6
- def setup_validation_file(tmp_path):
7
- data = [
8
- {"question": "What is AI?", "answer": "Artificial Intelligence"},
9
- {"question": "Define ML", "answer": "Machine Learning"}
10
- ]
11
- file = tmp_path / "validation_qa.json"
12
- with open(file, "w", encoding="utf-8") as f:
13
- json.dump(data, f)
14
- return str(file)
15
-
16
-
17
- def setup_docs(tmp_path):
18
- corpus = tmp_path / "corpus"
19
- corpus.mkdir()
20
- (corpus / "doc1.txt").write_text("This is about Artificial Intelligence.")
21
- (corpus / "doc2.txt").write_text("This text explains Machine Learning.")
22
- return str(corpus)
23
-
24
-
25
- def test_optimize_random(tmp_path):
26
- docs_path = setup_docs(tmp_path)
27
- val_file = setup_validation_file(tmp_path)
28
-
29
- rag = RAGMint(
30
- docs_path=docs_path,
31
- retrievers=["faiss"],
32
- embeddings=["openai/text-embedding-3-small"],
33
- rerankers=["mmr"]
34
- )
35
-
36
- best, results = rag.optimize(validation_set=val_file, metric="faithfulness", trials=2)
37
- assert isinstance(best, dict)
38
- assert isinstance(results, list)
@@ -1,35 +0,0 @@
1
- import json
2
- import csv
3
- from typing import List, Dict
4
- from pathlib import Path
5
-
6
-
7
- def load_json(path: str) -> List[Dict]:
8
- with open(path, "r", encoding="utf-8") as f:
9
- return json.load(f)
10
-
11
-
12
- def load_csv(path: str) -> List[Dict]:
13
- with open(path, newline="", encoding="utf-8") as csvfile:
14
- reader = csv.DictReader(csvfile)
15
- return list(reader)
16
-
17
-
18
- def save_json(path: str, data: Dict):
19
- with open(path, "w", encoding="utf-8") as f:
20
- json.dump(data, f, ensure_ascii=False, indent=2)
21
-
22
- def load_validation_set(path: str) -> List[Dict]:
23
- """
24
- Loads a validation dataset (QA pairs) from JSON or CSV.
25
- """
26
- p = Path(path)
27
- if not p.exists():
28
- raise FileNotFoundError(f"Validation file not found: {path}")
29
-
30
- if p.suffix.lower() == ".json":
31
- return load_json(path)
32
- elif p.suffix.lower() in [".csv", ".tsv"]:
33
- return load_csv(path)
34
- else:
35
- raise ValueError("Unsupported validation set format. Use JSON or CSV.")
File without changes
File without changes
File without changes
File without changes