ragbits-evaluate 1.2.1__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ragbits-evaluate might be problematic.
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/.gitignore +4 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/CHANGELOG.md +15 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/PKG-INFO +2 -2
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/pyproject.toml +2 -2
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/evaluator.py +26 -6
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_evaluator.py +77 -7
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/README.md +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/cli.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/config.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/question_answer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/optimizer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/py.typed +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/utils.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/cli/test_run_evaluation.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_metrics.py +0 -0
- {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_optimizer.py +0 -0
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/CHANGELOG.md

@@ -2,6 +2,20 @@

 ## Unreleased

+## 1.3.0 (2025-09-11)
+
+### Changed
+
+- ragbits-core updated to version v1.3.0
+
+- Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
+
+## 1.2.2 (2025-08-08)
+
+### Changed
+
+- ragbits-core updated to version v1.2.2
+
 ## 1.2.1 (2025-08-04)

 ### Changed
@@ -136,6 +150,7 @@
 - ragbits-core updated to version v0.10.1

 ## 0.10.0 (2025-03-17)
+
 ### Changed

 - ragbits-core updated to version v0.10.0
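The headline change in this release is the optional parallel batch execution added to ragbits.evaluate.Evaluator (#769). A minimal usage sketch of the new flag, modelled on the unit tests further down in this diff (the my_* objects are placeholders for an already configured evaluation pipeline, data loader, and metric set, not names from this package):

import asyncio

from ragbits.evaluate.evaluator import Evaluator


async def main() -> None:
    # With parallelize_batches=True, the samples inside each batch are
    # dispatched concurrently via asyncio.gather instead of as one call.
    evaluator = Evaluator(batch_size=2, parallelize_batches=True)
    results = await evaluator.compute(
        pipeline=my_pipeline,      # placeholder: an EvaluationPipeline instance
        dataloader=my_dataloader,  # placeholder: a data loader instance
        metricset=my_metrics,      # placeholder: a MetricSet instance
    )
    print(results.metrics)


asyncio.run(main())

The default (parallelize_batches=False) keeps the previous behaviour of one pipeline call per batch.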
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 1.2.1
+Version: 1.3.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==1.2.1
+Requires-Dist: ragbits-core==1.3.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "1.2.1"
+version = "1.3.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.2.1"]
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.3.0"]

 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
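Both metadata files bump the package to 1.3.0 and keep the exact pin ragbits-core==1.3.0, so the two distributions are expected to move in lockstep. A quick sanity check for an installed environment, a sketch using only the standard library (not part of the package):

from importlib.metadata import version

# ragbits-evaluate 1.3.0 pins ragbits-core==1.3.0 exactly, so after an
# upgrade both distributions should report the same version string.
evaluate_version = version("ragbits-evaluate")
core_version = version("ragbits-core")

assert evaluate_version == core_version == "1.3.0", (evaluate_version, core_version)
print(f"ragbits-evaluate {evaluate_version} / ragbits-core {core_version}")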
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/evaluator.py

@@ -1,7 +1,7 @@
 import asyncio
 import random
 import time
-from collections.abc import Awaitable, Callable, Iterable
+from collections.abc import Awaitable, Callable, Iterable, Sized
 from dataclasses import dataclass
 from typing import Generic, ParamSpec, TypeVar

@@ -71,6 +71,7 @@ class Evaluator(WithConstructionConfig):
         num_retries: int = 3,
         backoff_multiplier: int = 1,
         backoff_max: int = 60,
+        parallelize_batches: bool = False,
     ) -> None:
         """
         Initialize the Evaluator instance.
@@ -80,11 +81,13 @@ class Evaluator(WithConstructionConfig):
             num_retries: The number of retries per evaluation pipeline inference error.
             backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
             backoff_max: The maximum allowed delay (in seconds) between retries.
+            parallelize_batches: Whether to process samples within each batch in parallel (asyncio.gather).
         """
         self.batch_size = batch_size
         self.num_retries = num_retries
         self.backoff_multiplier = backoff_multiplier
         self.backoff_max = backoff_max
+        self.parallelize_batches = parallelize_batches

     @classmethod
     async def run_from_config(cls, config: dict) -> EvaluatorResult:
@@ -156,16 +159,33 @@
             The evaluation results and performance metrics.
         """
         start_time = time.perf_counter()
-
-
-
-        ]
+
+        total_samples = len(dataset) if isinstance(dataset, Sized) else None
+        batches = batched(dataset, self.batch_size)
+        outputs: list[Iterable[EvaluationResultT] | Exception] = []
+
+        with tqdm(total=total_samples, desc="Evaluation", unit="sample") as progress_bar:
+            for batch in batches:
+                batch_list = list(batch)
+
+                if self.parallelize_batches:
+                    tasks = [self._call_with_error_handling(pipeline, [sample]) for sample in batch_list]
+                    batch_results = await asyncio.gather(*tasks)
+
+                    for result in batch_results:
+                        outputs.append(result)
+                        progress_bar.update(1)
+                else:
+                    result = await self._call_with_error_handling(pipeline, batch_list)
+                    outputs.append(result)
+                    progress_bar.update(len(batch_list))
+
         end_time = time.perf_counter()

         errors = [output for output in outputs if isinstance(output, Exception)]
         results = [item for output in outputs if not isinstance(output, Exception) for item in output]

-        return results, errors, self._compute_time_perf(start_time, end_time, len(
+        return results, errors, self._compute_time_perf(start_time, end_time, len(results))

     async def _call_with_error_handling(
         self,
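Since every call goes through _call_with_error_handling, whose return value may be an Exception (the outputs list is typed to hold either results or exceptions), the two modes also differ in failure granularity: a single error in sequential mode discards the whole batch's results, while per-sample dispatch only loses the failing sample. A standalone sketch of that pattern in plain asyncio (not ragbits code; the helper names are illustrative):

import asyncio


async def call_with_error_handling(samples: list[int]) -> list[int] | Exception:
    # Stand-in for the evaluator's retry wrapper: it returns the exception
    # instead of raising, so the caller can keep collecting other outputs.
    try:
        if 3 in samples:
            raise RuntimeError("sample 3 failed")
        return [s * 10 for s in samples]
    except Exception as exc:
        return exc


async def run(batch: list[int], parallelize: bool) -> list[list[int] | Exception]:
    if parallelize:
        # One call per sample: a failure only discards that sample's output.
        return list(await asyncio.gather(*(call_with_error_handling([s]) for s in batch)))
    # One call for the whole batch: a failure discards every output in it.
    return [await call_with_error_handling(batch)]


async def main() -> None:
    batch = [1, 2, 3, 4]
    for flag in (False, True):
        outputs = await run(batch, parallelize=flag)
        errors = [o for o in outputs if isinstance(o, Exception)]
        results = [item for o in outputs if not isinstance(o, Exception) for item in o]
        print(f"parallelize={flag}: results={results}, errors={len(errors)}")


asyncio.run(main())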
{ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_evaluator.py

@@ -1,3 +1,5 @@
+import asyncio
+import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, cast
@@ -31,15 +33,23 @@ class MockEvaluationTarget(WithConstructionConfig):


 class MockEvaluationPipeline(EvaluationPipeline[MockEvaluationTarget, MockEvaluationData, MockEvaluationResult]):
+    def __init__(self, evaluation_target: MockEvaluationTarget, slow: bool = False):
+        super().__init__(evaluation_target)
+        self._slow = slow
+
     async def __call__(self, data: Iterable[MockEvaluationData]) -> Iterable[MockEvaluationResult]:
-
-
-
-
-
+        results = []
+        for row in data:
+            if self._slow:
+                await asyncio.sleep(0.5)
+            results.append(
+                MockEvaluationResult(
+                    input_data=row.input_data,
+                    processed_output=f"{self.evaluation_target.model_name}_{row.input_data}",
+                    is_correct=row.input_data % 2 == 0,
+                )
             )
-
-        ]
+        return results

     @classmethod
     def from_config(cls, config: dict) -> "MockEvaluationPipeline":
@@ -102,6 +112,66 @@ async def test_run_evaluation(
     assert all("test_model_" in r.processed_output for r in results.results)


+@pytest.mark.parametrize(
+    ("parallelize_batches", "expected_results", "expected_accuracy"),
+    [(False, 4, 0.5), (True, 4, 0.5)],
+)
+async def test_run_evaluation_with_parallel_batches(
+    parallelize_batches: bool,
+    expected_results: int,
+    expected_accuracy: float,
+) -> None:
+    target = MockEvaluationTarget(model_name="parallel_test_model")
+    pipeline = MockEvaluationPipeline(target)
+    dataloader = MockDataLoader()
+    metrics = MetricSet(*[MockMetric()])
+    evaluator = Evaluator(batch_size=2, parallelize_batches=parallelize_batches)
+
+    results = await evaluator.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+
+    assert len(results.results) == expected_results
+    assert len(results.errors) == 0
+    assert results.metrics["accuracy"] == expected_accuracy
+    assert all("parallel_test_model_" in r.processed_output for r in results.results)
+
+
+async def test_parallel_batches_performance() -> None:
+    """Test that parallel processing is faster than sequential processing."""
+    target = MockEvaluationTarget(model_name="timing_test_model")
+    pipeline = MockEvaluationPipeline(target, slow=True)
+    dataloader = MockDataLoader(dataset_size=4)
+    metrics = MetricSet(*[MockMetric()])
+
+    # Test sequential processing
+    evaluator_sequential = Evaluator(batch_size=2, parallelize_batches=False)
+    start_time = time.perf_counter()
+    results_sequential = await evaluator_sequential.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    sequential_time = time.perf_counter() - start_time
+
+    evaluator_parallel = Evaluator(batch_size=2, parallelize_batches=True)
+    start_time = time.perf_counter()
+    results_parallel = await evaluator_parallel.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    parallel_time = time.perf_counter() - start_time
+
+    assert len(results_sequential.results) == len(results_parallel.results)
+    assert results_sequential.metrics == results_parallel.metrics
+
+    # Parallel processing should be roughly 2x faster, but we add some margin
+    assert parallel_time < sequential_time * 0.7
+
+
 async def test_run_from_config() -> None:
     config = {
         "evaluation": {
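The timing assertion at the end rests on simple arithmetic: the slow mock sleeps 0.5 s per row, so four samples processed as two sequential batches take roughly 2 s, while per-sample fan-out with batch_size=2 takes roughly two batches of 0.5 s each, about 1 s, comfortably under the 0.7x margin. A standalone sketch of that arithmetic in plain asyncio (not part of the test suite):

import asyncio
import time


async def pipeline_call(rows: int) -> None:
    # Mirrors the slow mock above: roughly 0.5 s of awaited latency per row.
    await asyncio.sleep(0.5 * rows)


async def main() -> None:
    n_samples, batch_size = 4, 2
    n_batches = n_samples // batch_size

    # Sequential mode: one call per two-row batch -> about 2 x 1.0 s.
    t0 = time.perf_counter()
    for _ in range(n_batches):
        await pipeline_call(batch_size)
    sequential = time.perf_counter() - t0

    # Parallel mode: two single-row calls gathered per batch -> about 2 x 0.5 s.
    t0 = time.perf_counter()
    for _ in range(n_batches):
        await asyncio.gather(*(pipeline_call(1) for _ in range(batch_size)))
    parallel = time.perf_counter() - t0

    print(f"sequential ~{sequential:.2f}s, parallel ~{parallel:.2f}s")
    assert parallel < sequential * 0.7  # same margin as the unit test


asyncio.run(main())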