ragbits-evaluate 1.2.1__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ragbits-evaluate might be problematic.
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/.gitignore +4 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/CHANGELOG.md +15 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/PKG-INFO +2 -2
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/pyproject.toml +2 -2
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/evaluator.py +26 -6
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_evaluator.py +77 -7
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/README.md +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/cli.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/config.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/question_answer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/optimizer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/py.typed +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/utils.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/cli/test_run_evaluation.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_metrics.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_optimizer.py +0 -0
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/CHANGELOG.md

@@ -2,6 +2,20 @@

 ## Unreleased

+## 1.3.0 (2025-09-11)
+
+### Changed
+
+- ragbits-core updated to version v1.3.0
+
+- Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
+
+## 1.2.2 (2025-08-08)
+
+### Changed
+
+- ragbits-core updated to version v1.2.2
+
 ## 1.2.1 (2025-08-04)

 ### Changed
@@ -136,6 +150,7 @@
 - ragbits-core updated to version v0.10.1

 ## 0.10.0 (2025-03-17)
+
 ### Changed

 - ragbits-core updated to version v0.10.0
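The headline change in this release is the optional parallel batch execution added to ragbits.evaluate.Evaluator (#769). A minimal usage sketch of the new flag, modelled on the unit tests further down in this diff (the my_* objects are placeholders for an already configured evaluation pipeline, data loader, and metric set, not names from this package):

import asyncio

from ragbits.evaluate.evaluator import Evaluator


async def main() -> None:
    # With parallelize_batches=True, the samples inside each batch are
    # dispatched concurrently via asyncio.gather instead of as one call.
    evaluator = Evaluator(batch_size=2, parallelize_batches=True)
    results = await evaluator.compute(
        pipeline=my_pipeline,      # placeholder: an EvaluationPipeline instance
        dataloader=my_dataloader,  # placeholder: a data loader instance
        metricset=my_metrics,      # placeholder: a MetricSet instance
    )
    print(results.metrics)


asyncio.run(main())

The default (parallelize_batches=False) keeps the previous behaviour of one pipeline call per batch.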
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 1.2.1
+Version: 1.3.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==1.2.1
+Requires-Dist: ragbits-core==1.3.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "1.2.1"
+version = "1.3.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.2.1"]
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.3.0"]

 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
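Both metadata files bump the package to 1.3.0 and keep the exact pin ragbits-core==1.3.0, so the two distributions are expected to move in lockstep. A quick sanity check for an installed environment, a sketch using only the standard library (not part of the package):

from importlib.metadata import version

# ragbits-evaluate 1.3.0 pins ragbits-core==1.3.0 exactly, so after an
# upgrade both distributions should report the same version string.
evaluate_version = version("ragbits-evaluate")
core_version = version("ragbits-core")

assert evaluate_version == core_version == "1.3.0", (evaluate_version, core_version)
print(f"ragbits-evaluate {evaluate_version} / ragbits-core {core_version}")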
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/evaluator.py

@@ -1,7 +1,7 @@
 import asyncio
 import random
 import time
-from collections.abc import Awaitable, Callable, Iterable
+from collections.abc import Awaitable, Callable, Iterable, Sized
 from dataclasses import dataclass
 from typing import Generic, ParamSpec, TypeVar

@@ -71,6 +71,7 @@ class Evaluator(WithConstructionConfig):
         num_retries: int = 3,
         backoff_multiplier: int = 1,
         backoff_max: int = 60,
+        parallelize_batches: bool = False,
     ) -> None:
         """
         Initialize the Evaluator instance.
@@ -80,11 +81,13 @@ class Evaluator(WithConstructionConfig):
             num_retries: The number of retries per evaluation pipeline inference error.
             backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
             backoff_max: The maximum allowed delay (in seconds) between retries.
+            parallelize_batches: Whether to process samples within each batch in parallel (asyncio.gather).
         """
         self.batch_size = batch_size
         self.num_retries = num_retries
         self.backoff_multiplier = backoff_multiplier
         self.backoff_max = backoff_max
+        self.parallelize_batches = parallelize_batches

     @classmethod
     async def run_from_config(cls, config: dict) -> EvaluatorResult:
@@ -156,16 +159,33 @@
             The evaluation results and performance metrics.
         """
         start_time = time.perf_counter()
-
-
-
-        ]
+
+        total_samples = len(dataset) if isinstance(dataset, Sized) else None
+        batches = batched(dataset, self.batch_size)
+        outputs: list[Iterable[EvaluationResultT] | Exception] = []
+
+        with tqdm(total=total_samples, desc="Evaluation", unit="sample") as progress_bar:
+            for batch in batches:
+                batch_list = list(batch)
+
+                if self.parallelize_batches:
+                    tasks = [self._call_with_error_handling(pipeline, [sample]) for sample in batch_list]
+                    batch_results = await asyncio.gather(*tasks)
+
+                    for result in batch_results:
+                        outputs.append(result)
+                        progress_bar.update(1)
+                else:
+                    result = await self._call_with_error_handling(pipeline, batch_list)
+                    outputs.append(result)
+                    progress_bar.update(len(batch_list))
+
         end_time = time.perf_counter()

         errors = [output for output in outputs if isinstance(output, Exception)]
         results = [item for output in outputs if not isinstance(output, Exception) for item in output]

-        return results, errors, self._compute_time_perf(start_time, end_time, len(
+        return results, errors, self._compute_time_perf(start_time, end_time, len(results))

     async def _call_with_error_handling(
         self,
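Since every call goes through _call_with_error_handling, whose return value may be an Exception (the outputs list is typed to hold either results or exceptions), the two modes also differ in failure granularity: a single error in sequential mode discards the whole batch's results, while per-sample dispatch only loses the failing sample. A standalone sketch of that pattern in plain asyncio (not ragbits code; the helper names are illustrative):

import asyncio


async def call_with_error_handling(samples: list[int]) -> list[int] | Exception:
    # Stand-in for the evaluator's retry wrapper: it returns the exception
    # instead of raising, so the caller can keep collecting other outputs.
    try:
        if 3 in samples:
            raise RuntimeError("sample 3 failed")
        return [s * 10 for s in samples]
    except Exception as exc:
        return exc


async def run(batch: list[int], parallelize: bool) -> list[list[int] | Exception]:
    if parallelize:
        # One call per sample: a failure only discards that sample's output.
        return list(await asyncio.gather(*(call_with_error_handling([s]) for s in batch)))
    # One call for the whole batch: a failure discards every output in it.
    return [await call_with_error_handling(batch)]


async def main() -> None:
    batch = [1, 2, 3, 4]
    for flag in (False, True):
        outputs = await run(batch, parallelize=flag)
        errors = [o for o in outputs if isinstance(o, Exception)]
        results = [item for o in outputs if not isinstance(o, Exception) for item in o]
        print(f"parallelize={flag}: results={results}, errors={len(errors)}")


asyncio.run(main())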
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_evaluator.py

@@ -1,3 +1,5 @@
+import asyncio
+import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, cast
@@ -31,15 +33,23 @@ class MockEvaluationTarget(WithConstructionConfig):


 class MockEvaluationPipeline(EvaluationPipeline[MockEvaluationTarget, MockEvaluationData, MockEvaluationResult]):
+    def __init__(self, evaluation_target: MockEvaluationTarget, slow: bool = False):
+        super().__init__(evaluation_target)
+        self._slow = slow
+
     async def __call__(self, data: Iterable[MockEvaluationData]) -> Iterable[MockEvaluationResult]:
-
-
-
-
-
+        results = []
+        for row in data:
+            if self._slow:
+                await asyncio.sleep(0.5)
+            results.append(
+                MockEvaluationResult(
+                    input_data=row.input_data,
+                    processed_output=f"{self.evaluation_target.model_name}_{row.input_data}",
+                    is_correct=row.input_data % 2 == 0,
+                )
             )
-
-        ]
+        return results

     @classmethod
     def from_config(cls, config: dict) -> "MockEvaluationPipeline":
@@ -102,6 +112,66 @@ async def test_run_evaluation(
     assert all("test_model_" in r.processed_output for r in results.results)


+@pytest.mark.parametrize(
+    ("parallelize_batches", "expected_results", "expected_accuracy"),
+    [(False, 4, 0.5), (True, 4, 0.5)],
+)
+async def test_run_evaluation_with_parallel_batches(
+    parallelize_batches: bool,
+    expected_results: int,
+    expected_accuracy: float,
+) -> None:
+    target = MockEvaluationTarget(model_name="parallel_test_model")
+    pipeline = MockEvaluationPipeline(target)
+    dataloader = MockDataLoader()
+    metrics = MetricSet(*[MockMetric()])
+    evaluator = Evaluator(batch_size=2, parallelize_batches=parallelize_batches)
+
+    results = await evaluator.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+
+    assert len(results.results) == expected_results
+    assert len(results.errors) == 0
+    assert results.metrics["accuracy"] == expected_accuracy
+    assert all("parallel_test_model_" in r.processed_output for r in results.results)
+
+
+async def test_parallel_batches_performance() -> None:
+    """Test that parallel processing is faster than sequential processing."""
+    target = MockEvaluationTarget(model_name="timing_test_model")
+    pipeline = MockEvaluationPipeline(target, slow=True)
+    dataloader = MockDataLoader(dataset_size=4)
+    metrics = MetricSet(*[MockMetric()])
+
+    # Test sequential processing
+    evaluator_sequential = Evaluator(batch_size=2, parallelize_batches=False)
+    start_time = time.perf_counter()
+    results_sequential = await evaluator_sequential.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    sequential_time = time.perf_counter() - start_time
+
+    evaluator_parallel = Evaluator(batch_size=2, parallelize_batches=True)
+    start_time = time.perf_counter()
+    results_parallel = await evaluator_parallel.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    parallel_time = time.perf_counter() - start_time
+
+    assert len(results_sequential.results) == len(results_parallel.results)
+    assert results_sequential.metrics == results_parallel.metrics
+
+    # Parallel processing should be roughly 2x faster, but we add some margin
+    assert parallel_time < sequential_time * 0.7
+
+
 async def test_run_from_config() -> None:
     config = {
         "evaluation": {
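The timing assertion at the end rests on simple arithmetic: the slow mock sleeps 0.5 s per row, so four samples processed as two sequential batches take roughly 2 s, while per-sample fan-out with batch_size=2 takes roughly two batches of 0.5 s each, about 1 s, comfortably under the 0.7x margin. A standalone sketch of that arithmetic in plain asyncio (not part of the test suite):

import asyncio
import time


async def pipeline_call(rows: int) -> None:
    # Mirrors the slow mock above: roughly 0.5 s of awaited latency per row.
    await asyncio.sleep(0.5 * rows)


async def main() -> None:
    n_samples, batch_size = 4, 2
    n_batches = n_samples // batch_size

    # Sequential mode: one call per two-row batch -> about 2 x 1.0 s.
    t0 = time.perf_counter()
    for _ in range(n_batches):
        await pipeline_call(batch_size)
    sequential = time.perf_counter() - t0

    # Parallel mode: two single-row calls gathered per batch -> about 2 x 0.5 s.
    t0 = time.perf_counter()
    for _ in range(n_batches):
        await asyncio.gather(*(pipeline_call(1) for _ in range(batch_size)))
    parallel = time.perf_counter() - t0

    print(f"sequential ~{sequential:.2f}s, parallel ~{parallel:.2f}s")
    assert parallel < sequential * 0.7  # same margin as the unit test


asyncio.run(main())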