ragbits-evaluate 1.2.1.tar.gz → 1.3.0.tar.gz

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of ragbits-evaluate might be problematic.

Files changed (44)
  1. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/.gitignore +4 -0
  2. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/CHANGELOG.md +15 -0
  3. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/PKG-INFO +2 -2
  4. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/pyproject.toml +2 -2
  5. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/evaluator.py +26 -6
  6. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_evaluator.py +77 -7
  7. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/README.md +0 -0
  8. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/__init__.py +0 -0
  9. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/cli.py +0 -0
  10. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/config.py +0 -0
  11. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
  12. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
  13. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
  14. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
  15. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -0
  16. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
  17. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
  18. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  19. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
  20. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
  21. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  22. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
  23. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  24. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
  25. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
  26. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  27. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
  28. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
  29. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
  30. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
  31. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
  32. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/base.py +0 -0
  33. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
  34. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/metrics/question_answer.py +0 -0
  35. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/optimizer.py +0 -0
  36. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
  37. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
  38. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
  39. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
  40. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/py.typed +0 -0
  41. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/src/ragbits/evaluate/utils.py +0 -0
  42. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/cli/test_run_evaluation.py +0 -0
  43. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_metrics.py +0 -0
  44. {ragbits_evaluate-1.2.1 → ragbits_evaluate-1.3.0}/tests/unit/test_optimizer.py +0 -0
.gitignore
@@ -105,3 +105,7 @@ qdrant/
 
 .DS_Store
 node_modules/
+
+lazygit
+
+lazygit.tar.gz
CHANGELOG.md
@@ -2,6 +2,20 @@
 
 ## Unreleased
 
+## 1.3.0 (2025-09-11)
+
+### Changed
+
+- ragbits-core updated to version v1.3.0
+
+- Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
+
+## 1.2.2 (2025-08-08)
+
+### Changed
+
+- ragbits-core updated to version v1.2.2
+
 ## 1.2.1 (2025-08-04)
 
 ### Changed
@@ -136,6 +150,7 @@
 - ragbits-core updated to version v0.10.1
 
 ## 0.10.0 (2025-03-17)
+
 ### Changed
 
 - ragbits-core updated to version v0.10.0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 1.2.1
+Version: 1.3.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==1.2.1
+Requires-Dist: ragbits-core==1.3.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "1.2.1"
+version = "1.3.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.2.1"]
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.3.0"]
 
 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
src/ragbits/evaluate/evaluator.py
@@ -1,7 +1,7 @@
 import asyncio
 import random
 import time
-from collections.abc import Awaitable, Callable, Iterable
+from collections.abc import Awaitable, Callable, Iterable, Sized
 from dataclasses import dataclass
 from typing import Generic, ParamSpec, TypeVar
 
@@ -71,6 +71,7 @@ class Evaluator(WithConstructionConfig):
         num_retries: int = 3,
         backoff_multiplier: int = 1,
         backoff_max: int = 60,
+        parallelize_batches: bool = False,
     ) -> None:
         """
         Initialize the Evaluator instance.
@@ -80,11 +81,13 @@ class Evaluator(WithConstructionConfig):
             num_retries: The number of retries per evaluation pipeline inference error.
             backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
             backoff_max: The maximum allowed delay (in seconds) between retries.
+            parallelize_batches: Whether to process samples within each batch in parallel (asyncio.gather).
         """
         self.batch_size = batch_size
         self.num_retries = num_retries
         self.backoff_multiplier = backoff_multiplier
         self.backoff_max = backoff_max
+        self.parallelize_batches = parallelize_batches
 
     @classmethod
     async def run_from_config(cls, config: dict) -> EvaluatorResult:
@@ -156,16 +159,33 @@ class Evaluator(WithConstructionConfig):
             The evaluation results and performance metrics.
         """
         start_time = time.perf_counter()
-        outputs = [
-            await self._call_with_error_handling(pipeline, data)
-            for data in tqdm(batched(dataset, self.batch_size), desc="Evaluation")
-        ]
+
+        total_samples = len(dataset) if isinstance(dataset, Sized) else None
+        batches = batched(dataset, self.batch_size)
+        outputs: list[Iterable[EvaluationResultT] | Exception] = []
+
+        with tqdm(total=total_samples, desc="Evaluation", unit="sample") as progress_bar:
+            for batch in batches:
+                batch_list = list(batch)
+
+                if self.parallelize_batches:
+                    tasks = [self._call_with_error_handling(pipeline, [sample]) for sample in batch_list]
+                    batch_results = await asyncio.gather(*tasks)
+
+                    for result in batch_results:
+                        outputs.append(result)
+                        progress_bar.update(1)
+                else:
+                    result = await self._call_with_error_handling(pipeline, batch_list)
+                    outputs.append(result)
+                    progress_bar.update(len(batch_list))
+
         end_time = time.perf_counter()
 
         errors = [output for output in outputs if isinstance(output, Exception)]
         results = [item for output in outputs if not isinstance(output, Exception) for item in output]
 
-        return results, errors, self._compute_time_perf(start_time, end_time, len(outputs))
+        return results, errors, self._compute_time_perf(start_time, end_time, len(results))
 
     async def _call_with_error_handling(
         self,
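
For readers skimming the evaluator.py change above: the only behavioural difference is how samples inside a batch are awaited. With parallelize_batches=True, each sample is wrapped in its own single-element pipeline call and the calls are awaited concurrently via asyncio.gather; otherwise the whole batch goes to the pipeline in one call, as before (#769). The snippet below is a minimal, self-contained sketch of that pattern only, not ragbits code: run_pipeline and evaluate are hypothetical stand-ins for the pipeline call and the evaluation loop.

```python
import asyncio
import time


# Hypothetical stand-in for a pipeline call: each sample costs ~0.1 s of
# simulated I/O-bound work, so a batch of N takes ~0.1 * N s per call.
async def run_pipeline(batch: list[int]) -> list[int]:
    for _ in batch:
        await asyncio.sleep(0.1)
    return [sample * 2 for sample in batch]


async def evaluate(dataset: list[int], batch_size: int, parallelize_batches: bool) -> list[int]:
    outputs: list[int] = []
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start : start + batch_size]
        if parallelize_batches:
            # One single-sample call per task, awaited concurrently -- mirrors the
            # asyncio.gather branch added in the diff above.
            per_sample = await asyncio.gather(*(run_pipeline([sample]) for sample in batch))
            for result in per_sample:
                outputs.extend(result)
        else:
            # Sequential path: one call for the whole batch, as before.
            outputs.extend(await run_pipeline(batch))
    return outputs


async def main() -> None:
    dataset = list(range(8))
    for parallel in (False, True):
        started = time.perf_counter()
        await evaluate(dataset, batch_size=4, parallelize_batches=parallel)
        print(f"parallelize_batches={parallel}: {time.perf_counter() - started:.2f}s")


asyncio.run(main())
```

Because the parallel path issues one call per sample, each entry in outputs corresponds to a single sample rather than a whole batch; switching the throughput computation from len(outputs) to len(results) keeps the reported time-performance metric comparable between the two modes.
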
tests/unit/test_evaluator.py
@@ -1,3 +1,5 @@
+import asyncio
+import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, cast
@@ -31,15 +33,23 @@ class MockEvaluationTarget(WithConstructionConfig):
 
 
 class MockEvaluationPipeline(EvaluationPipeline[MockEvaluationTarget, MockEvaluationData, MockEvaluationResult]):
+    def __init__(self, evaluation_target: MockEvaluationTarget, slow: bool = False):
+        super().__init__(evaluation_target)
+        self._slow = slow
+
     async def __call__(self, data: Iterable[MockEvaluationData]) -> Iterable[MockEvaluationResult]:
-        return [
-            MockEvaluationResult(
-                input_data=row.input_data,
-                processed_output=f"{self.evaluation_target.model_name}_{row.input_data}",
-                is_correct=row.input_data % 2 == 0,
+        results = []
+        for row in data:
+            if self._slow:
+                await asyncio.sleep(0.5)
+            results.append(
+                MockEvaluationResult(
+                    input_data=row.input_data,
+                    processed_output=f"{self.evaluation_target.model_name}_{row.input_data}",
+                    is_correct=row.input_data % 2 == 0,
+                )
             )
-            for row in data
-        ]
+        return results
 
     @classmethod
     def from_config(cls, config: dict) -> "MockEvaluationPipeline":
@@ -102,6 +112,66 @@ async def test_run_evaluation(
     assert all("test_model_" in r.processed_output for r in results.results)
 
 
+@pytest.mark.parametrize(
+    ("parallelize_batches", "expected_results", "expected_accuracy"),
+    [(False, 4, 0.5), (True, 4, 0.5)],
+)
+async def test_run_evaluation_with_parallel_batches(
+    parallelize_batches: bool,
+    expected_results: int,
+    expected_accuracy: float,
+) -> None:
+    target = MockEvaluationTarget(model_name="parallel_test_model")
+    pipeline = MockEvaluationPipeline(target)
+    dataloader = MockDataLoader()
+    metrics = MetricSet(*[MockMetric()])
+    evaluator = Evaluator(batch_size=2, parallelize_batches=parallelize_batches)
+
+    results = await evaluator.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+
+    assert len(results.results) == expected_results
+    assert len(results.errors) == 0
+    assert results.metrics["accuracy"] == expected_accuracy
+    assert all("parallel_test_model_" in r.processed_output for r in results.results)
+
+
+async def test_parallel_batches_performance() -> None:
+    """Test that parallel processing is faster than sequential processing."""
+    target = MockEvaluationTarget(model_name="timing_test_model")
+    pipeline = MockEvaluationPipeline(target, slow=True)
+    dataloader = MockDataLoader(dataset_size=4)
+    metrics = MetricSet(*[MockMetric()])
+
+    # Test sequential processing
+    evaluator_sequential = Evaluator(batch_size=2, parallelize_batches=False)
+    start_time = time.perf_counter()
+    results_sequential = await evaluator_sequential.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    sequential_time = time.perf_counter() - start_time
+
+    evaluator_parallel = Evaluator(batch_size=2, parallelize_batches=True)
+    start_time = time.perf_counter()
+    results_parallel = await evaluator_parallel.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    parallel_time = time.perf_counter() - start_time
+
+    assert len(results_sequential.results) == len(results_parallel.results)
+    assert results_sequential.metrics == results_parallel.metrics
+
+    # Parallel processing should be roughly 2x faster, but we add some margin
+    assert parallel_time < sequential_time * 0.7
+
+
 async def test_run_from_config() -> None:
     config = {
         "evaluation": {