ragbits-evaluate 0.17.0__tar.gz → 0.18.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ragbits-evaluate has been flagged as potentially problematic.

Files changed (43)
  1. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/CHANGELOG.md +18 -0
  2. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/PKG-INFO +2 -2
  3. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/pyproject.toml +2 -2
  4. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/cli.py +2 -2
  5. ragbits_evaluate-0.18.0/src/ragbits/evaluate/dataloaders/document_search.py +73 -0
  6. ragbits_evaluate-0.18.0/src/ragbits/evaluate/evaluator.py +224 -0
  7. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/factories/__init__.py +11 -26
  8. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/metrics/base.py +8 -4
  9. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/metrics/document_search.py +13 -2
  10. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/optimizer.py +9 -9
  11. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/pipelines/base.py +2 -1
  12. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/pipelines/document_search.py +25 -15
  13. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/utils.py +48 -14
  14. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/tests/cli/test_run_evaluation.py +4 -4
  15. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/tests/unit/test_evaluator.py +37 -17
  16. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/tests/unit/test_metrics.py +59 -25
  17. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/tests/unit/test_optimizer.py +10 -7
  18. ragbits_evaluate-0.17.0/src/ragbits/evaluate/dataloaders/document_search.py +0 -45
  19. ragbits_evaluate-0.17.0/src/ragbits/evaluate/evaluator.py +0 -179
  20. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/.gitignore +0 -0
  21. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/README.md +0 -0
  22. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/__init__.py +0 -0
  23. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/config.py +0 -0
  24. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
  25. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
  26. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
  27. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
  28. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
  29. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  30. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
  31. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
  32. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  33. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
  34. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  35. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
  36. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
  37. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  38. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
  39. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
  40. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
  41. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
  42. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
  43. {ragbits_evaluate-0.17.0 → ragbits_evaluate-0.18.0}/src/ragbits/evaluate/py.typed +0 -0
CHANGELOG.md
@@ -2,6 +2,24 @@
 
 ## Unreleased
 
+## 0.18.0 (2025-05-22)
+
+### Changed
+
+- ragbits-core updated to version v0.18.0
+
+- Add support for custom column names in evaluation dataset (#566)
+- Add support for reference document ids and page numbers in evaluation dataset (#566)
+- BREAKING CHANGE: Adjust eval pipline interface to batch processing (#555)
+- Rename DocumentMeta create_text_document_from_literal to from_literal (#561)
+- Adjust typing for DocumentSearch (#554)
+
+## 0.17.1 (2025-05-09)
+
+### Changed
+
+- ragbits-core updated to version v0.17.1
+
 ## 0.17.0 (2025-05-06)
 
 ### Changed
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 0.17.0
+Version: 0.18.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.4.1
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==0.17.0
+Requires-Dist: ragbits-core==0.18.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "0.17.0"
+version = "0.18.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.4.1,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==0.17.0"]
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.4.1,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==0.18.0"]
 
 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
src/ragbits/evaluate/cli.py
@@ -140,10 +140,10 @@ def run() -> None:
         metric_results = await evaluator.compute(
             pipeline=state.pipeline,
             dataloader=state.dataloader,
-            metrics=state.metrics,
+            metricset=state.metrics,
         )
         evaluation_results = EvaluationResult(
-            metrics={"metrics": metric_results["metrics"], "time_perf": metric_results["time_perf"]}
+            metrics={"metrics": metric_results.metrics, "time_perf": metric_results.time_perf}
         )
         print_output(evaluation_results)
 
src/ragbits/evaluate/dataloaders/document_search.py (new file)
@@ -0,0 +1,73 @@
+from collections.abc import Iterable
+
+from datasets import load_dataset
+
+from ragbits.core.sources.base import Source
+from ragbits.evaluate.dataloaders.base import DataLoader
+from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
+from ragbits.evaluate.pipelines.document_search import DocumentSearchData
+
+
+class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
+    """
+    Document search evaluation data loader.
+
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files)
+    and contain the following features: "question, "passages".
+    """
+
+    def __init__(
+        self,
+        source: Source,
+        question_key: str = "question",
+        document_ids_key: str = "document_ids",
+        passages_key: str = "passages",
+        page_numbers_key: str = "page_numbers",
+    ) -> None:
+        """
+        Initialize the document search data loader.
+
+        Args:
+            source: The source to load the data from.
+            question_key: The dataset column name that contains the question.
+            document_ids_key: The dataset column name that contains the document ids. Document ids are optional.
+            passages_key: The dataset column name that contains the passages. Passages are optional.
+            page_numbers_key: The dataset column name that contains the page numbers. Page numbers are optional.
+        """
+        super().__init__(source)
+        self.question_key = question_key
+        self.document_ids_key = document_ids_key
+        self.passages_key = passages_key
+        self.page_numbers_key = page_numbers_key
+
+    async def load(self) -> Iterable[DocumentSearchData]:
+        """
+        Load the data from source and format them.
+
+        Returns:
+            The document search evaluation data.
+
+        Raises:
+            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+        """
+        data_path = await self.source.fetch()
+        dataset = load_dataset(
+            path=str(data_path.parent),
+            split="train",
+            data_files={"train": str(data_path.name)},
+        )
+        if self.question_key not in dataset.features:
+            raise DataLoaderIncorrectFormatDataError(
+                required_features=[self.question_key],
+                data_path=data_path,
+            )
+
+        return [
+            DocumentSearchData(
+                question=data.get(self.question_key),
+                reference_document_ids=data.get(self.document_ids_key),
+                reference_passages=data.get(self.passages_key),
+                reference_page_numbers=data.get(self.page_numbers_key),
+            )
+            for data in dataset
+        ]
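A minimal usage sketch (not part of the diff) of the custom column-name support added in 0.18.0. The dataset path and column names below are hypothetical; HuggingFaceSource is the source type used in the factories module further down, and only the question column is required by the loader.

import asyncio

from ragbits.core.sources.hf import HuggingFaceSource
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader

# Hypothetical dataset and column names; defaults are "question", "document_ids",
# "passages" and "page_numbers".
loader = DocumentSearchDataLoader(
    source=HuggingFaceSource(path="my-org/my-eval-dataset"),
    question_key="query",
    passages_key="ground_truth",
    document_ids_key="doc_ids",
    page_numbers_key="pages",
)

# load() fetches the source and returns an iterable of DocumentSearchData;
# optional columns that are missing simply come back as None on each record.
dataset = asyncio.run(loader.load())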
src/ragbits/evaluate/evaluator.py (new file)
@@ -0,0 +1,224 @@
+import asyncio
+import random
+import time
+from collections.abc import Awaitable, Callable, Iterable
+from dataclasses import dataclass
+from typing import Generic, ParamSpec, TypeVar
+
+from pydantic import BaseModel
+from tqdm import tqdm
+
+from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
+from ragbits.core.utils.helpers import batched
+from ragbits.evaluate.dataloaders.base import DataLoader
+from ragbits.evaluate.metrics.base import MetricSet
+from ragbits.evaluate.pipelines.base import EvaluationDataT, EvaluationPipeline, EvaluationResultT, EvaluationTargetT
+
+_CallP = ParamSpec("_CallP")
+_CallReturnT = TypeVar("_CallReturnT")
+
+
+@dataclass
+class EvaluationTimePerf:
+    """
+    Container for evaluation time performance metrics.
+    """
+
+    total_time_in_seconds: float
+    samples_per_second: float
+    latency_in_seconds: float
+
+
+@dataclass
+class EvaluatorResult(Generic[EvaluationResultT]):
+    """
+    Container for evaluation results.
+    """
+
+    metrics: dict[str, int | float]
+    results: list[EvaluationResultT]
+    errors: list[Exception]
+    time_perf: EvaluationTimePerf
+
+
+class EvaluationConfig(BaseModel):
+    """
+    Schema for the evaluation run config.
+    """
+
+    pipeline: ObjectConstructionConfig
+    dataloader: ObjectConstructionConfig
+    metrics: dict[str, ObjectConstructionConfig]
+
+
+class EvaluatorConfig(BaseModel):
+    """
+    Schema for the evaluator config.
+    """
+
+    evaluation: EvaluationConfig
+    evaluator: dict | None = None
+
+
+class Evaluator(WithConstructionConfig):
+    """
+    Evaluator class.
+    """
+
+    def __init__(
+        self,
+        batch_size: int = 10,
+        num_retries: int = 3,
+        backoff_multiplier: int = 1,
+        backoff_max: int = 60,
+    ) -> None:
+        """
+        Initialize the Evaluator instance.
+
+        Args:
+            batch_size: batch size for the evaluation pipeline inference.
+            num_retries: The number of retries per evaluation pipeline inference error.
+            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+            backoff_max: The maximum allowed delay (in seconds) between retries.
+        """
+        self.batch_size = batch_size
+        self.num_retries = num_retries
+        self.backoff_multiplier = backoff_multiplier
+        self.backoff_max = backoff_max
+
+    @classmethod
+    async def run_from_config(cls, config: dict) -> EvaluatorResult:
+        """
+        Run the evaluation based on configuration.
+
+        Args:
+            config: Evaluation config.
+
+        Returns:
+            The evaluation results.
+        """
+        evaluator_config = EvaluatorConfig.model_validate(config)
+        evaluation_config = EvaluationConfig.model_validate(evaluator_config.evaluation)
+        pipeline: EvaluationPipeline = EvaluationPipeline.subclass_from_config(evaluation_config.pipeline)
+        dataloader: DataLoader = DataLoader.subclass_from_config(evaluation_config.dataloader)
+        metricset: MetricSet = MetricSet.from_config(evaluation_config.metrics)
+
+        evaluator = cls.from_config(evaluator_config.evaluator or {})
+        return await evaluator.compute(
+            pipeline=pipeline,
+            dataloader=dataloader,
+            metricset=metricset,
+        )
+
+    async def compute(
+        self,
+        pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
+        dataloader: DataLoader[EvaluationDataT],
+        metricset: MetricSet[EvaluationResultT],
+    ) -> EvaluatorResult[EvaluationResultT]:
+        """
+        Compute the evaluation results for the given pipeline and data.
+
+        Args:
+            pipeline: The pipeline to be evaluated.
+            dataloader: The dataloader to load the data.
+            metricset: The metrics to be computed.
+
+        Returns:
+            The evaluation results.
+        """
+        await pipeline.prepare()
+
+        dataset = await dataloader.load()
+        results, errors, time_perf = await self._call_pipeline(pipeline, dataset)
+        metrics = await metricset.compute(results)
+
+        return EvaluatorResult(
+            metrics=metrics,
+            results=results,
+            errors=errors,
+            time_perf=time_perf,
+        )
+
+    async def _call_pipeline(
+        self,
+        pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
+        dataset: Iterable[EvaluationDataT],
+    ) -> tuple[list[EvaluationResultT], list[Exception], EvaluationTimePerf]:
+        """
+        Call the pipeline with the given data.
+
+        Args:
+            pipeline: The pipeline to be called.
+            dataset: The dataset to be processed.
+
+        Returns:
+            The evaluation results and performance metrics.
+        """
+        start_time = time.perf_counter()
+        outputs = [
+            await self._call_with_error_handling(pipeline, data)
+            for data in tqdm(batched(dataset, self.batch_size), desc="Evaluation")
+        ]
+        end_time = time.perf_counter()
+
+        errors = [output for output in outputs if isinstance(output, Exception)]
+        results = [item for output in outputs if not isinstance(output, Exception) for item in output]
+
+        return results, errors, self._compute_time_perf(start_time, end_time, len(outputs))
+
+    async def _call_with_error_handling(
+        self,
+        executable: Callable[_CallP, Awaitable[_CallReturnT]],
+        *executable_args: _CallP.args,
+        **executable_kwargs: _CallP.kwargs,
+    ) -> _CallReturnT | Exception:
+        """
+        Call executable with a standarized error handling.
+        If an error occurs, the executable is retried `num_retries` times using randomized exponential backoff.
+
+        Args:
+            executable: The callable function to execute.
+            executable_args: Positional arguments to pass to the executable.
+            executable_kwargs: Keyword arguments to pass to the executable.
+
+        Returns:
+            The result of the executable if successful.
+
+        Raises:
+            Exception: The last encountered exception after all retries are exhausted.
+        """
+        for i in range(max(0, self.num_retries) + 1):
+            try:
+                return await executable(*executable_args, **executable_kwargs)
+            except Exception as exc:
+                if i == self.num_retries:
+                    return exc
+
+                delay = random.uniform(0, min(2**i * self.backoff_multiplier, self.backoff_max))  # noqa: S311
+                await asyncio.sleep(delay)
+
+        raise RuntimeError("Unreachable code reached")  # mypy quirk
+
+    @staticmethod
+    def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> EvaluationTimePerf:
+        """
+        Compute the performance metrics.
+
+        Args:
+            start_time: The start time.
+            end_time: The end time.
+            num_samples: The number of samples.
+
+        Returns:
+            The performance metrics.
+        """
+        latency = end_time - start_time
+        throughput = num_samples / latency
+        latency_sample = 1.0 / throughput if throughput > 0 else 0.0
+
+        return EvaluationTimePerf(
+            total_time_in_seconds=latency,
+            samples_per_second=throughput,
+            latency_in_seconds=latency_sample,
+        )
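For orientation, a hedged end-to-end sketch (not from the package) of wiring the new Evaluator to the document search components from the factories module just below. The evaluation_target keyword on DocumentSearchPipeline is assumed from its from_config shown later in this diff, and the factory requires embedder credentials to ingest its example corpus.

import asyncio

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.factories import basic_document_search_factory, precision_recall_f1, synthetic_rag_dataset
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline


def main() -> None:
    # basic_document_search_factory() runs its own event loop while ingesting the
    # example corpus, so it is called outside of asyncio.run() below.
    pipeline = DocumentSearchPipeline(evaluation_target=basic_document_search_factory())

    # Evaluator defaults made explicit: 10 samples per batch, up to 3 retries per
    # failed batch with randomized exponential backoff capped at 60 seconds.
    evaluator = Evaluator(batch_size=10, num_retries=3, backoff_multiplier=1, backoff_max=60)

    result = asyncio.run(
        evaluator.compute(
            pipeline=pipeline,
            dataloader=synthetic_rag_dataset(),
            metricset=precision_recall_f1(),
        )
    )

    # EvaluatorResult is a dataclass, not a dict, hence attribute access.
    print(result.metrics)                       # weighted metric values
    print(result.time_perf.samples_per_second)  # throughput over the processed batches
    print(len(result.errors))                   # batches that still failed after retries


main()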
src/ragbits/evaluate/factories/__init__.py
@@ -1,43 +1,21 @@
 import asyncio
 
+from continuous_eval.metrics.retrieval.matching_strategy import RougeChunkMatch
 from datasets import load_dataset
 
 from ragbits.core.embeddings.dense import LiteLLMEmbedder
 from ragbits.core.sources.hf import HuggingFaceSource
-from ragbits.core.utils.config_handling import ObjectConstructionConfig
 from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
 from ragbits.document_search import DocumentSearch
 from ragbits.document_search.documents.document import DocumentMeta
 from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader
 from ragbits.evaluate.metrics import MetricSet
-
-DS_PRECISION_RECALL_F1 = {
-    "precision_recall_f1": ObjectConstructionConfig.model_validate(
-        {
-            "type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
-            "config": {
-                "matching_strategy": {
-                    "type": "RougeChunkMatch",
-                    "config": {
-                        "threshold": 0.5,
-                    },
-                },
-            },
-        }
-    ),
-}
-
-
-def precision_recall_f1() -> MetricSet:
-    """
-    Factory of precision recall f1 metric set for retrival evaluation.
-    """
-    return MetricSet.from_config(config=DS_PRECISION_RECALL_F1)
+from ragbits.evaluate.metrics.document_search import DocumentSearchPrecisionRecallF1
 
 
 async def _add_example_documents(document_search: DocumentSearch) -> None:
     dataset = load_dataset(path="deepsense-ai/synthetic-rag-dataset_v1.0", split="train")
-    documents = [DocumentMeta.create_text_document_from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
+    documents = [DocumentMeta.from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
     await document_search.ingest(documents)
 
 
@@ -45,7 +23,7 @@ def basic_document_search_factory() -> DocumentSearch:
     """
     Factory for basic example document search instance.
     """
-    document_search = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
+    document_search: DocumentSearch = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
     asyncio.run(_add_example_documents(document_search))
     return document_search
 
@@ -55,3 +33,10 @@ def synthetic_rag_dataset() -> DocumentSearchDataLoader:
     Factory for synthetic RAG dataset.
     """
    return DocumentSearchDataLoader(source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"))
+
+
+def precision_recall_f1() -> MetricSet:
+    """
+    Factory of precision recall f1 metric set for retrival evaluation.
+    """
+    return MetricSet(DocumentSearchPrecisionRecallF1(matching_strategy=RougeChunkMatch()))
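The new precision_recall_f1 factory builds the MetricSet directly instead of going through ObjectConstructionConfig, and no longer pins the 0.5 ROUGE threshold that the removed DS_PRECISION_RECALL_F1 config set explicitly. A hedged sketch (not part of the diff) of reproducing the old default with the new construction style:

from continuous_eval.metrics.retrieval.matching_strategy import RougeChunkMatch

from ragbits.evaluate.metrics import MetricSet
from ragbits.evaluate.metrics.document_search import DocumentSearchPrecisionRecallF1

# Equivalent of the removed config-driven factory: an explicit 0.5 ROUGE threshold.
metricset = MetricSet(DocumentSearchPrecisionRecallF1(matching_strategy=RougeChunkMatch(threshold=0.5)))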
src/ragbits/evaluate/metrics/base.py
@@ -1,3 +1,4 @@
+import asyncio
 from abc import ABC, abstractmethod
 from types import ModuleType
 from typing import ClassVar, Generic
@@ -19,7 +20,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
 
     def __init__(self, weight: float = 1.0) -> None:
         """
-        Initializes the metric.
+        Initialize the metric.
 
         Args:
             weight: Metric value weight in the final score, used during optimization.
@@ -28,7 +29,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
         self.weight = weight
 
     @abstractmethod
-    def compute(self, results: list[EvaluationResultT]) -> dict:
+    async def compute(self, results: list[EvaluationResultT]) -> dict:
         """
         Compute the metric.
 
@@ -70,7 +71,7 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
         """
         return cls(*[Metric.subclass_from_config(metric_config) for metric_config in config.values()])
 
-    def compute(self, results: list[EvaluationResultT]) -> dict:
+    async def compute(self, results: list[EvaluationResultT]) -> dict:
         """
         Compute the metrics.
 
@@ -80,6 +81,9 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
         Returns:
             The computed metrics.
         """
+        metric_results = await asyncio.gather(*[metric.compute(results) for metric in self.metrics])
         return {
-            name: metric.weight * value for metric in self.metrics for name, value in metric.compute(results).items()
+            name: metric.weight * value
+            for metric, result in zip(self.metrics, metric_results, strict=False)
+            for name, value in result.items()
         }
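Because Metric.compute and MetricSet.compute are now coroutines (gathered concurrently with asyncio.gather), custom metrics have to declare an async compute and callers have to await the set. A minimal illustrative sketch, assuming a Metric subclass needs nothing beyond compute to be instantiated:

import asyncio

from ragbits.evaluate.metrics.base import Metric, MetricSet
from ragbits.evaluate.pipelines.document_search import DocumentSearchResult


class ResultCount(Metric[DocumentSearchResult]):
    """Toy metric that just counts evaluated results; illustrative only."""

    async def compute(self, results: list[DocumentSearchResult]) -> dict:
        # Subclasses now implement compute as a coroutine; MetricSet.compute
        # gathers all metric coroutines concurrently and applies their weights.
        return {"result_count": len(results)}


async def main() -> None:
    metricset = MetricSet(ResultCount(weight=1.0))
    print(await metricset.compute([]))  # {'result_count': 0.0} after weighting


asyncio.run(main())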
src/ragbits/evaluate/metrics/document_search.py
@@ -46,7 +46,7 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
         matching_strategy = matching_strategy_cls(**config["matching_strategy"]["config"])
         return cls(matching_strategy=matching_strategy, weight=config.get("weight", 1.0))
 
-    def compute(self, results: list[DocumentSearchResult]) -> dict:
+    async def compute(self, results: list[DocumentSearchResult]) -> dict:
         """
         Compute the metric.
 
@@ -57,7 +57,18 @@
             The computed metric.
         """
         return self.metric.aggregate(
-            [self.metric(result.predicted_passages, result.reference_passages) for result in results]
+            [
+                self.metric(
+                    [
+                        element.text_representation
+                        for element in result.predicted_elements
+                        if element.text_representation
+                    ],
+                    result.reference_passages,
+                )
+                for result in results
+                if result.reference_passages is not None
+            ]
         )
 
 
src/ragbits/evaluate/optimizer.py
@@ -61,7 +61,7 @@ class Optimizer(WithConstructionConfig):
         evaluator_config = EvaluatorConfig.model_validate(optimizer_config.evaluator)
 
         dataloader: DataLoader = DataLoader.subclass_from_config(evaluator_config.evaluation.dataloader)
-        metrics: MetricSet = MetricSet.from_config(evaluator_config.evaluation.metrics)
+        metricset: MetricSet = MetricSet.from_config(evaluator_config.evaluation.metrics)
 
         pipeline_class = import_by_path(evaluator_config.evaluation.pipeline.type)
         pipeline_config = dict(evaluator_config.evaluation.pipeline.config)
@@ -71,7 +71,7 @@ class Optimizer(WithConstructionConfig):
         return optimizer.optimize(
             pipeline_class=pipeline_class,
             pipeline_config=pipeline_config,
-            metrics=metrics,
+            metricset=metricset,
             dataloader=dataloader,
             callbacks=callbacks,
         )
@@ -81,7 +81,7 @@ class Optimizer(WithConstructionConfig):
         pipeline_class: type[EvaluationPipeline],
         pipeline_config: dict,
         dataloader: DataLoader,
-        metrics: MetricSet,
+        metricset: MetricSet,
         callbacks: list[Callable] | None = None,
     ) -> list[tuple[dict, float, dict[str, float]]]:
         """
@@ -91,7 +91,7 @@ class Optimizer(WithConstructionConfig):
             pipeline_class: Pipeline to be optimized.
             pipeline_config: Configuration defining the optimization process.
             dataloader: Data loader.
-            metrics: Metrics to be optimized.
+            metricset: Metrics to be optimized.
             callbacks: Experiment callbacks.
 
         Returns:
@@ -104,7 +104,7 @@ class Optimizer(WithConstructionConfig):
             pipeline_class=pipeline_class,
             pipeline_config=pipeline_config,
             dataloader=dataloader,
-            metrics=metrics,
+            metricset=metricset,
         )
 
         study = optuna.create_study(direction=self.direction)
@@ -131,7 +131,7 @@ class Optimizer(WithConstructionConfig):
         pipeline_class: type[EvaluationPipeline],
         pipeline_config: dict,
         dataloader: DataLoader,
-        metrics: MetricSet,
+        metricset: MetricSet,
     ) -> float:
         """
         Run a single experiment.
@@ -153,11 +153,11 @@ class Optimizer(WithConstructionConfig):
                     evaluator.compute(
                         pipeline=pipeline,
                         dataloader=dataloader,
-                        metrics=metrics,
+                        metricset=metricset,
                    )
                )
-                score = sum(results["metrics"].values())
-                metrics_values = results["metrics"]
+                score = sum(results.metrics.values())
+                metrics_values = results.metrics
                break
            except Exception as exc:
                message = (
src/ragbits/evaluate/pipelines/base.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from dataclasses import dataclass
 from types import ModuleType
 from typing import ClassVar, Generic, TypeVar
@@ -51,7 +52,7 @@ class EvaluationPipeline(WithConstructionConfig, Generic[EvaluationTargetT, Eval
         pass
 
     @abstractmethod
-    async def __call__(self, data: EvaluationDataT) -> EvaluationResultT:
+    async def __call__(self, data: Iterable[EvaluationDataT]) -> Iterable[EvaluationResultT]:
         """
         Run the evaluation pipeline.
 
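A hedged before/after sketch of what the batch-processing change (#555) means for custom pipelines built on this base class. GreetingPipeline and its data/result types are hypothetical, and the target type parameter is filled with None purely for illustration.

from collections.abc import Iterable
from dataclasses import dataclass

from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult


class GreetingData(EvaluationData):
    # Fields a data loader would provide for each evaluation sample.
    name: str


@dataclass
class GreetingResult(EvaluationResult):
    greeting: str


class GreetingPipeline(EvaluationPipeline[None, GreetingData, GreetingResult]):
    # 0.17.x signature: async def __call__(self, data: GreetingData) -> GreetingResult
    # 0.18.0 signature: the whole batch comes in and the whole batch goes out, so
    # implementations can fan out work concurrently (as DocumentSearchPipeline does below).
    async def __call__(self, data: Iterable[GreetingData]) -> Iterable[GreetingResult]:
        return [GreetingResult(greeting=f"Hello, {row.name}!") for row in data]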
src/ragbits/evaluate/pipelines/document_search.py
@@ -1,3 +1,5 @@
+import asyncio
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from uuid import uuid4
 
@@ -5,6 +7,7 @@ from typing_extensions import Self
 
 from ragbits.core.sources.hf import HuggingFaceSource
 from ragbits.document_search import DocumentSearch
+from ragbits.document_search.documents.element import Element
 from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
 
 
@@ -14,7 +17,9 @@ class DocumentSearchData(EvaluationData):
     """
 
     question: str
-    reference_passages: list[str]
+    reference_document_ids: list[str | int] | None = None
+    reference_passages: list[str] | None = None
+    reference_page_numbers: list[int] | None = None
 
 
 @dataclass
@@ -24,8 +29,10 @@ class DocumentSearchResult(EvaluationResult):
     """
 
     question: str
-    reference_passages: list[str]
-    predicted_passages: list[str]
+    predicted_elements: Sequence[Element]
+    reference_document_ids: list[str | int] | None = None
+    reference_passages: list[str] | None = None
+    reference_page_numbers: list[int] | None = None
 
 
 class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchData, DocumentSearchResult]):
@@ -60,7 +67,7 @@ class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchDa
         # TODO: optimize this for cases with duplicated document search configs between runs
         if config.get("source"):
             config["vector_store"]["config"]["index_name"] = str(uuid4())
-        evaluation_target = DocumentSearch.from_config(config)
+        evaluation_target: DocumentSearch = DocumentSearch.from_config(config)
         return cls(evaluation_target=evaluation_target, source=config.get("source"))
 
     async def prepare(self) -> None:
@@ -76,21 +83,24 @@ class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchDa
         )
         await self.evaluation_target.ingest(sources)
 
-    async def __call__(self, data: DocumentSearchData) -> DocumentSearchResult:
+    async def __call__(self, data: Iterable[DocumentSearchData]) -> Iterable[DocumentSearchResult]:
         """
         Run the document search evaluation pipeline.
 
         Args:
-            data: The evaluation data.
+            data: The evaluation data batch.
 
         Returns:
-            The evaluation result.
+            The evaluation result batch.
         """
-        elements = await self.evaluation_target.search(data.question)
-        predicted_passages = [element.text_representation for element in elements if element.text_representation]
-
-        return DocumentSearchResult(
-            question=data.question,
-            reference_passages=data.reference_passages,
-            predicted_passages=predicted_passages,
-        )
+        results = await asyncio.gather(*[self.evaluation_target.search(row.question) for row in data])
+        return [
+            DocumentSearchResult(
+                question=row.question,
+                predicted_elements=elements,
+                reference_document_ids=row.reference_document_ids,
+                reference_passages=row.reference_passages,
+                reference_page_numbers=row.reference_page_numbers,
+            )
+            for row, elements in zip(data, results, strict=False)
+        ]
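Downstream code that previously read predicted_passages from DocumentSearchResult now receives predicted_elements (a Sequence[Element]). A small sketch mirroring the extraction the updated DocumentSearchMetric performs above; the results argument is assumed to come from an Evaluator run.

from collections.abc import Sequence

from ragbits.evaluate.pipelines.document_search import DocumentSearchResult


def predicted_passages(results: Sequence[DocumentSearchResult]) -> list[list[str]]:
    # Same extraction as DocumentSearchMetric: keep only elements that expose
    # a text representation, one list of passages per evaluated question.
    return [
        [element.text_representation for element in result.predicted_elements if element.text_representation]
        for result in results
    ]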