ragbits-evaluate 0.17.1__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

ragbits/evaluate/cli.py CHANGED
@@ -140,10 +140,10 @@ def run() -> None:
         metric_results = await evaluator.compute(
             pipeline=state.pipeline,
             dataloader=state.dataloader,
-            metrics=state.metrics,
+            metricset=state.metrics,
         )
         evaluation_results = EvaluationResult(
-            metrics={"metrics": metric_results["metrics"], "time_perf": metric_results["time_perf"]}
+            metrics={"metrics": metric_results.metrics, "time_perf": metric_results.time_perf}
         )
         print_output(evaluation_results)
 
ragbits/evaluate/dataloaders/base.py CHANGED
@@ -3,12 +3,14 @@ from collections.abc import Iterable
 from types import ModuleType
 from typing import ClassVar, Generic
 
+from datasets import load_dataset
 from pydantic import BaseModel
 from typing_extensions import Self
 
 from ragbits.core.sources.base import Source
 from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
 from ragbits.evaluate import dataloaders
+from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
 from ragbits.evaluate.pipelines.base import EvaluationDataT
 
 
@@ -28,14 +30,19 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
     default_module: ClassVar[ModuleType | None] = dataloaders
     configuration_key: ClassVar[str] = "dataloader"
 
-    def __init__(self, source: Source) -> None:
+    def __init__(self, source: Source, *, split: str = "data", required_keys: set[str] | None = None) -> None:
         """
         Initialize the data loader.
 
         Args:
             source: The source to load the evaluation data from.
+            split: The split to load the data from. Split is fixed for data loaders to "data",
+                but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
+            required_keys: The required columns for the evaluation data.
         """
         self.source = source
+        self.split = split
+        self.required_keys = required_keys or set()
 
     @classmethod
     def from_config(cls, config: dict) -> Self:
@@ -52,11 +59,37 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
         config["source"] = Source.subclass_from_config(dataloader_config.source)
         return super().from_config(config)
 
-    @abstractmethod
     async def load(self) -> Iterable[EvaluationDataT]:
         """
         Load the data.
 
         Returns:
-            The loaded data.
+            The loaded evaluation data.
+
+        Raises:
+            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+        """
+        data_path = await self.source.fetch()
+        dataset = load_dataset(
+            path=str(data_path.parent),
+            data_files={"data": str(data_path.name)},
+            split=self.split,
+        )
+        if not self.required_keys.issubset(dataset.features):
+            raise DataLoaderIncorrectFormatDataError(
+                required_features=list(self.required_keys),
+                data_path=data_path,
+            )
+        return await self.map(dataset.to_list())
+
+    @abstractmethod
+    async def map(self, dataset: Iterable[dict]) -> Iterable[EvaluationDataT]:
+        """
+        Map the dataset to the evaluation data.
+
+        Args:
+            dataset: The dataset to map.
+
+        Returns:
+            The evaluation data.
         """
ragbits/evaluate/dataloaders/document_search.py CHANGED
@@ -1,9 +1,7 @@
 from collections.abc import Iterable
 
-from datasets import load_dataset
-
+from ragbits.core.sources.base import Source
 from ragbits.evaluate.dataloaders.base import DataLoader
-from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
 from ragbits.evaluate.pipelines.document_search import DocumentSearchData
 
 
@@ -11,35 +9,53 @@ class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
     """
     Document search evaluation data loader.
 
-    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files)
-    and contain the following features: "question, "passages".
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
     """
 
-    async def load(self) -> Iterable[DocumentSearchData]:
+    def __init__(
+        self,
+        source: Source,
+        *,
+        split: str = "data",
+        question_key: str = "question",
+        document_ids_key: str = "document_ids",
+        passages_key: str = "passages",
+        page_numbers_key: str = "page_numbers",
+    ) -> None:
         """
-        Load the data from source and format them.
-
-        Returns:
-            The document search evaluation data.
+        Initialize the document search data loader.
+
+        Args:
+            source: The source to load the data from.
+            split: The split to load the data from. Split is fixed for data loaders to "data",
+                but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
+            question_key: The dataset column name that contains the question.
+            document_ids_key: The dataset column name that contains the document ids. Document ids are optional.
+            passages_key: The dataset column name that contains the passages. Passages are optional.
+            page_numbers_key: The dataset column name that contains the page numbers. Page numbers are optional.
+        """
+        super().__init__(source=source, split=split, required_keys={question_key})
+        self.question_key = question_key
+        self.document_ids_key = document_ids_key
+        self.passages_key = passages_key
+        self.page_numbers_key = page_numbers_key
 
-        Raises:
-            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+    async def map(self, dataset: Iterable[dict]) -> Iterable[DocumentSearchData]:
         """
-        data_path = await self.source.fetch()
-        dataset = load_dataset(
-            path=str(data_path.parent),
-            split=data_path.stem,
-        )
-        if "question" not in dataset.features or "passages" not in dataset.features:
-            raise DataLoaderIncorrectFormatDataError(
-                required_features=["question", "passages"],
-                data_path=data_path,
-            )
+        Map the dataset to the document search data schema.
+
+        Args:
+            dataset: The dataset to map.
 
+        Returns:
+            The document search data.
+        """
         return [
             DocumentSearchData(
-                question=data["question"],
-                reference_passages=data["passages"],
+                question=data.get(self.question_key, ""),
+                reference_document_ids=data.get(self.document_ids_key),
+                reference_passages=data.get(self.passages_key),
+                reference_page_numbers=data.get(self.page_numbers_key),
             )
             for data in dataset
         ]
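Assuming a dataset with the default column names, the loader can now be configured per column; only the question column is required. A sketch (the split slice uses Hugging Face slicing syntax and is illustrative):

```python
from ragbits.core.sources.hf import HuggingFaceSource
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader

dataloader = DocumentSearchDataLoader(
    source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"),
    split="data[:32]",        # slice of the fixed "data" split
    question_key="question",  # required column
    passages_key="passages",  # optional reference passages
)
```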
ragbits/evaluate/dataloaders/question_answer.py ADDED
@@ -0,0 +1,57 @@
+from collections.abc import Iterable
+
+from ragbits.core.sources.base import Source
+from ragbits.evaluate.dataloaders.base import DataLoader
+from ragbits.evaluate.pipelines.question_answer import QuestionAnswerData
+
+
+class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
+    """
+    Question answer evaluation data loader.
+
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
+    """
+
+    def __init__(
+        self,
+        source: Source,
+        *,
+        split: str = "data",
+        question_key: str = "question",
+        answer_key: str = "answer",
+        context_key: str = "context",
+    ) -> None:
+        """
+        Initialize the question answer data loader.
+
+        Args:
+            source: The source to load the data from.
+            split: The split to load the data from.
+            required_keys: The required keys to load the data from.
+            question_key: The dataset column name that contains the question.
+            answer_key: The dataset column name that contains the answer.
+            context_key: The dataset column name that contains the context. Context is optional.
+        """
+        super().__init__(source=source, split=split, required_keys={question_key, answer_key})
+        self.question_key = question_key
+        self.answer_key = answer_key
+        self.context_key = context_key
+
+    async def map(self, dataset: Iterable[dict]) -> Iterable[QuestionAnswerData]:
+        """
+        Map the dataset to the question answer data schema.
+
+        Args:
+            dataset: The dataset to map.
+
+        Returns:
+            The question answer data.
+        """
+        return [
+            QuestionAnswerData(
+                question=data.get(self.question_key, ""),
+                reference_answer=data.get(self.answer_key, ""),
+                reference_context=data.get(self.context_key),
+            )
+            for data in dataset
+        ]
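A corresponding question-answer loader might be wired up like this (the dataset path is a placeholder; `question` and `answer` are the required columns, `context` is optional):

```python
from ragbits.core.sources.hf import HuggingFaceSource
from ragbits.evaluate.dataloaders.question_answer import QuestionAnswerDataLoader

qa_dataloader = QuestionAnswerDataLoader(
    source=HuggingFaceSource(path="org/qa-dataset"),  # placeholder dataset
    question_key="question",
    answer_key="answer",
    context_key="context",
)
```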
ragbits/evaluate/evaluator.py CHANGED
@@ -1,16 +1,45 @@
 import asyncio
+import random
 import time
-from collections.abc import Iterable
-from dataclasses import asdict
+from collections.abc import Awaitable, Callable, Iterable
+from dataclasses import dataclass
+from typing import Generic, ParamSpec, TypeVar
 
 from pydantic import BaseModel
-from tqdm.asyncio import tqdm
+from tqdm import tqdm
 
 from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
+from ragbits.core.utils.helpers import batched
 from ragbits.evaluate.dataloaders.base import DataLoader
 from ragbits.evaluate.metrics.base import MetricSet
 from ragbits.evaluate.pipelines.base import EvaluationDataT, EvaluationPipeline, EvaluationResultT, EvaluationTargetT
 
+_CallP = ParamSpec("_CallP")
+_CallReturnT = TypeVar("_CallReturnT")
+
+
+@dataclass
+class EvaluationTimePerf:
+    """
+    Container for evaluation time performance metrics.
+    """
+
+    total_time_in_seconds: float
+    samples_per_second: float
+    latency_in_seconds: float
+
+
+@dataclass
+class EvaluatorResult(Generic[EvaluationResultT]):
+    """
+    Container for evaluation results.
+    """
+
+    metrics: dict[str, int | float]
+    results: list[EvaluationResultT]
+    errors: list[Exception]
+    time_perf: EvaluationTimePerf
+
 
 class EvaluationConfig(BaseModel):
     """
@@ -36,17 +65,29 @@ class Evaluator(WithConstructionConfig):
     Evaluator class.
     """
 
-    def __init__(self, batch_size: int = 10) -> None:
+    def __init__(
+        self,
+        batch_size: int = 10,
+        num_retries: int = 3,
+        backoff_multiplier: int = 1,
+        backoff_max: int = 60,
+    ) -> None:
         """
-        Initialize the evaluator.
+        Initialize the Evaluator instance.
 
         Args:
             batch_size: batch size for the evaluation pipeline inference.
+            num_retries: The number of retries per evaluation pipeline inference error.
+            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
+            backoff_max: The maximum allowed delay (in seconds) between retries.
         """
         self.batch_size = batch_size
+        self.num_retries = num_retries
+        self.backoff_multiplier = backoff_multiplier
+        self.backoff_max = backoff_max
 
     @classmethod
-    async def run_from_config(cls, config: dict) -> dict:
+    async def run_from_config(cls, config: dict) -> EvaluatorResult:
         """
         Run the evaluation based on configuration.
 
@@ -60,50 +101,50 @@ class Evaluator(WithConstructionConfig):
         evaluation_config = EvaluationConfig.model_validate(evaluator_config.evaluation)
         pipeline: EvaluationPipeline = EvaluationPipeline.subclass_from_config(evaluation_config.pipeline)
         dataloader: DataLoader = DataLoader.subclass_from_config(evaluation_config.dataloader)
-        metrics: MetricSet = MetricSet.from_config(evaluation_config.metrics)
+        metricset: MetricSet = MetricSet.from_config(evaluation_config.metrics)
 
         evaluator = cls.from_config(evaluator_config.evaluator or {})
         return await evaluator.compute(
             pipeline=pipeline,
             dataloader=dataloader,
-            metrics=metrics,
+            metricset=metricset,
         )
 
     async def compute(
         self,
         pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
         dataloader: DataLoader[EvaluationDataT],
-        metrics: MetricSet[EvaluationResultT],
-    ) -> dict:
+        metricset: MetricSet[EvaluationResultT],
+    ) -> EvaluatorResult[EvaluationResultT]:
        """
         Compute the evaluation results for the given pipeline and data.
 
         Args:
             pipeline: The pipeline to be evaluated.
             dataloader: The dataloader to load the data.
-            metrics: The metrics to be computed.
+            metricset: The metrics to be computed.
 
         Returns:
             The evaluation results.
         """
-        dataset = await dataloader.load()
         await pipeline.prepare()
 
-        results, perf_results = await self._call_pipeline(pipeline, dataset)
-        computed_metrics = self._compute_metrics(metrics, results)
-        processed_results = self._results_processor(results)
+        dataset = await dataloader.load()
+        results, errors, time_perf = await self._call_pipeline(pipeline, dataset)
+        metrics = await metricset.compute(results)
 
-        return {
-            **perf_results,
-            **computed_metrics,
-            **processed_results,
-        }
+        return EvaluatorResult(
+            metrics=metrics,
+            results=results,
+            errors=errors,
+            time_perf=time_perf,
+        )
 
     async def _call_pipeline(
         self,
         pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
         dataset: Iterable[EvaluationDataT],
-    ) -> tuple[list[EvaluationResultT], dict]:
+    ) -> tuple[list[EvaluationResultT], list[Exception], EvaluationTimePerf]:
         """
         Call the pipeline with the given data.
 
@@ -114,47 +155,53 @@ class Evaluator(WithConstructionConfig):
         Returns:
             The evaluation results and performance metrics.
         """
-        semaphore = asyncio.Semaphore(self.batch_size)
-
-        async def _call_pipeline_with_semaphore(data: EvaluationDataT) -> EvaluationResultT:
-            async with semaphore:
-                return await pipeline(data)
-
         start_time = time.perf_counter()
-        pipe_outputs = await tqdm.gather(*[_call_pipeline_with_semaphore(data) for data in dataset], desc="Evaluation")
+        outputs = [
+            await self._call_with_error_handling(pipeline, data)
+            for data in tqdm(batched(dataset, self.batch_size), desc="Evaluation")
+        ]
         end_time = time.perf_counter()
 
-        return pipe_outputs, self._compute_time_perf(start_time, end_time, len(pipe_outputs))
+        errors = [output for output in outputs if isinstance(output, Exception)]
+        results = [item for output in outputs if not isinstance(output, Exception) for item in output]
 
-    @staticmethod
-    def _results_processor(results: list[EvaluationResultT]) -> dict:
+        return results, errors, self._compute_time_perf(start_time, end_time, len(outputs))
+
+    async def _call_with_error_handling(
+        self,
+        executable: Callable[_CallP, Awaitable[_CallReturnT]],
+        *executable_args: _CallP.args,
+        **executable_kwargs: _CallP.kwargs,
+    ) -> _CallReturnT | Exception:
         """
-        Process the results.
+        Call executable with a standarized error handling.
+        If an error occurs, the executable is retried `num_retries` times using randomized exponential backoff.
 
         Args:
-            results: The evaluation results.
+            executable: The callable function to execute.
+            executable_args: Positional arguments to pass to the executable.
+            executable_kwargs: Keyword arguments to pass to the executable.
 
         Returns:
-            The processed results.
-        """
-        return {"results": [asdict(result) for result in results]}
+            The result of the executable if successful.
 
-    @staticmethod
-    def _compute_metrics(metrics: MetricSet[EvaluationResultT], results: list[EvaluationResultT]) -> dict:
+        Raises:
+            Exception: The last encountered exception after all retries are exhausted.
         """
-        Compute a metric using the given inputs.
+        for i in range(max(0, self.num_retries) + 1):
+            try:
+                return await executable(*executable_args, **executable_kwargs)
+            except Exception as exc:
+                if i == self.num_retries:
+                    return exc
 
-        Args:
-            metrics: The metrics to be computed.
-            results: The evaluation results.
+                delay = random.uniform(0, min(2**i * self.backoff_multiplier, self.backoff_max))  # noqa: S311
+                await asyncio.sleep(delay)
 
-        Returns:
-            The computed metric.
-        """
-        return {"metrics": metrics.compute(results)}
+        raise RuntimeError("Unreachable code reached")  # mypy quirk
 
     @staticmethod
-    def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> dict:
+    def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> EvaluationTimePerf:
         """
         Compute the performance metrics.
 
@@ -170,10 +217,8 @@ class Evaluator(WithConstructionConfig):
         throughput = num_samples / latency
         latency_sample = 1.0 / throughput if throughput > 0 else 0.0
 
-        return {
-            "time_perf": {
-                "total_time_in_seconds": latency,
-                "samples_per_second": throughput,
-                "latency_in_seconds": latency_sample,
-            },
-        }
+        return EvaluationTimePerf(
+            total_time_in_seconds=latency,
+            samples_per_second=throughput,
+            latency_in_seconds=latency_sample,
+        )
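Callers that previously unpacked the returned dict now receive an `EvaluatorResult`. A sketch of the new call site, assuming the pipeline, dataloader, and metric set are built elsewhere (for example with the factories shown below):

```python
from ragbits.evaluate.dataloaders.base import DataLoader
from ragbits.evaluate.evaluator import Evaluator, EvaluatorResult
from ragbits.evaluate.metrics.base import MetricSet
from ragbits.evaluate.pipelines.base import EvaluationPipeline


async def run_evaluation(
    pipeline: EvaluationPipeline, dataloader: DataLoader, metricset: MetricSet
) -> EvaluatorResult:
    # Failed batches are retried with randomized exponential backoff and then
    # reported in result.errors instead of aborting the whole run.
    evaluator = Evaluator(batch_size=10, num_retries=3, backoff_multiplier=1, backoff_max=60)
    result = await evaluator.compute(pipeline=pipeline, dataloader=dataloader, metricset=metricset)

    print(result.metrics)                           # weighted metric values
    print(result.time_perf.total_time_in_seconds)   # EvaluationTimePerf dataclass
    print(f"{len(result.results)} results, {len(result.errors)} failed batches")
    return result
```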
ragbits/evaluate/factories/__init__.py CHANGED
@@ -1,43 +1,21 @@
 import asyncio
 
+from continuous_eval.metrics.retrieval.matching_strategy import RougeChunkMatch
 from datasets import load_dataset
 
 from ragbits.core.embeddings.dense import LiteLLMEmbedder
 from ragbits.core.sources.hf import HuggingFaceSource
-from ragbits.core.utils.config_handling import ObjectConstructionConfig
 from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
 from ragbits.document_search import DocumentSearch
 from ragbits.document_search.documents.document import DocumentMeta
 from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader
 from ragbits.evaluate.metrics import MetricSet
-
-DS_PRECISION_RECALL_F1 = {
-    "precision_recall_f1": ObjectConstructionConfig.model_validate(
-        {
-            "type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
-            "config": {
-                "matching_strategy": {
-                    "type": "RougeChunkMatch",
-                    "config": {
-                        "threshold": 0.5,
-                    },
-                },
-            },
-        }
-    ),
-}
-
-
-def precision_recall_f1() -> MetricSet:
-    """
-    Factory of precision recall f1 metric set for retrival evaluation.
-    """
-    return MetricSet.from_config(config=DS_PRECISION_RECALL_F1)
+from ragbits.evaluate.metrics.document_search import DocumentSearchPrecisionRecallF1
 
 
 async def _add_example_documents(document_search: DocumentSearch) -> None:
     dataset = load_dataset(path="deepsense-ai/synthetic-rag-dataset_v1.0", split="train")
-    documents = [DocumentMeta.create_text_document_from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
+    documents = [DocumentMeta.from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
     await document_search.ingest(documents)
 
 
@@ -45,7 +23,7 @@ def basic_document_search_factory() -> DocumentSearch:
     """
     Factory for basic example document search instance.
     """
-    document_search = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
+    document_search: DocumentSearch = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
    asyncio.run(_add_example_documents(document_search))
     return document_search
 
@@ -55,3 +33,10 @@ def synthetic_rag_dataset() -> DocumentSearchDataLoader:
     Factory for synthetic RAG dataset.
     """
     return DocumentSearchDataLoader(source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"))
+
+
+def precision_recall_f1() -> MetricSet:
+    """
+    Factory of precision recall f1 metric set for retrival evaluation.
+    """
+    return MetricSet(DocumentSearchPrecisionRecallF1(matching_strategy=RougeChunkMatch()))
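The factory now builds the metric set directly instead of going through `ObjectConstructionConfig`. The equivalent inline construction, using continuous-eval's default matching threshold, looks roughly like:

```python
from continuous_eval.metrics.retrieval.matching_strategy import RougeChunkMatch

from ragbits.evaluate.metrics import MetricSet
from ragbits.evaluate.metrics.document_search import DocumentSearchPrecisionRecallF1

# weight scales this metric's contribution to the aggregated score
metricset = MetricSet(DocumentSearchPrecisionRecallF1(matching_strategy=RougeChunkMatch(), weight=1.0))
```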
ragbits/evaluate/metrics/base.py CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 from abc import ABC, abstractmethod
 from types import ModuleType
 from typing import ClassVar, Generic
@@ -19,7 +20,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
 
     def __init__(self, weight: float = 1.0) -> None:
         """
-        Initializes the metric.
+        Initialize the metric.
 
         Args:
             weight: Metric value weight in the final score, used during optimization.
@@ -28,7 +29,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
         self.weight = weight
 
     @abstractmethod
-    def compute(self, results: list[EvaluationResultT]) -> dict:
+    async def compute(self, results: list[EvaluationResultT]) -> dict:
         """
         Compute the metric.
 
@@ -70,7 +71,7 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
         """
         return cls(*[Metric.subclass_from_config(metric_config) for metric_config in config.values()])
 
-    def compute(self, results: list[EvaluationResultT]) -> dict:
+    async def compute(self, results: list[EvaluationResultT]) -> dict:
         """
         Compute the metrics.
 
@@ -80,6 +81,9 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
         Returns:
             The computed metrics.
         """
+        metric_results = await asyncio.gather(*[metric.compute(results) for metric in self.metrics])
         return {
-            name: metric.weight * value for metric in self.metrics for name, value in metric.compute(results).items()
+            name: metric.weight * value
+            for metric, result in zip(self.metrics, metric_results, strict=False)
+            for name, value in result.items()
         }
ragbits/evaluate/metrics/document_search.py CHANGED
@@ -46,7 +46,7 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
         matching_strategy = matching_strategy_cls(**config["matching_strategy"]["config"])
         return cls(matching_strategy=matching_strategy, weight=config.get("weight", 1.0))
 
-    def compute(self, results: list[DocumentSearchResult]) -> dict:
+    async def compute(self, results: list[DocumentSearchResult]) -> dict:
         """
         Compute the metric.
 
@@ -57,7 +57,18 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
             The computed metric.
         """
         return self.metric.aggregate(
-            [self.metric(result.predicted_passages, result.reference_passages) for result in results]
+            [
+                self.metric(
+                    [
+                        element.text_representation
+                        for element in result.predicted_elements
+                        if element.text_representation
+                    ],
+                    result.reference_passages,
+                )
+                for result in results
+                if result.reference_passages is not None
+            ]
         )
 
 
ragbits/evaluate/metrics/question_answer.py ADDED
@@ -0,0 +1,182 @@
+import asyncio
+from abc import ABC, abstractmethod
+from itertools import chain
+from typing import Generic, TypeVar
+
+from continuous_eval.llm_factory import LLMInterface
+from continuous_eval.metrics.base import LLMBasedMetric
+from continuous_eval.metrics.generation.text import (
+    LLMBasedAnswerCorrectness,
+    LLMBasedAnswerRelevance,
+    LLMBasedFaithfulness,
+    LLMBasedStyleConsistency,
+)
+from typing_extensions import Self
+
+from ragbits.agents.types import QuestionAnswerPromptOutputT
+from ragbits.core.llms.base import LLM
+from ragbits.core.utils.helpers import batched
+from ragbits.evaluate.metrics.base import Metric
+from ragbits.evaluate.pipelines.question_answer import QuestionAnswerResult
+
+MetricT = TypeVar("MetricT", bound=LLMBasedMetric)
+
+
+class _MetricLMM(LLMInterface):
+    """
+    Implementation of required interface of Relari generative metrics based on LiteLMM.
+    """
+
+    def __init__(self, llm: LLM) -> None:
+        self._llm = llm
+
+    def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
+        formatted_prompt = [
+            {"role": "system", "content": prompt["system_prompt"]},
+            {"role": "user", "content": prompt["user_prompt"]},
+        ]
+        options = self._llm.options_cls(
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        return asyncio.run(self._llm.generate(formatted_prompt, options=options))
+
+
+class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
+    """
+    Metric for question answer evaluation based on Relari backend.
+    More details can be found [here](https://docs.relari.ai/category/text-generation).
+    """
+
+    metric_cls: type[MetricT]
+
+    def __init__(self, llm: LLM, batch_size: int = 15, weight: float = 1.0) -> None:
+        """
+        Initialize the agent metric.
+
+        Args:
+            llm: Judge LLM instance.
+            batch_size: Batch size for metric computation.
+            weight: Metric value weight in the final score, used during optimization.
+        """
+        super().__init__(weight=weight)
+        self.metric = self.metric_cls(_MetricLMM(llm))
+        self.batch_size = batch_size
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `QuestionAnswerMetric` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the metric.
+
+        Returns:
+            An instance of the metric class initialized with the provided configuration.
+        """
+        config["llm"] = LLM.from_config(config["llm"])
+        config["batch_size"] = config.get("batch_size", 15)
+        config["weight"] = config.get("weight", 1.0)
+        return super().from_config(config)
+
+    async def compute(self, results: list[QuestionAnswerResult[QuestionAnswerPromptOutputT]]) -> dict:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        metric_results = chain.from_iterable(
+            [
+                await asyncio.gather(*[asyncio.to_thread(self._call_metric, result) for result in batch])
+                for batch in batched(results, self.batch_size)
+            ]
+        )
+        return self.metric.aggregate(list(metric_results))
+
+    @abstractmethod
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        """
+        Call the metric with the proper arguments.
+        """
+
+
+class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):
+    """
+    Metric checking answer correctness based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_correctness).
+    """
+
+    metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            question=result.question,
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+            ground_truth_answers=result.reference_answer,
+        )
+
+
+class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):
+    """
+    Metric checking answer faithfulness based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_faithfulness).
+    """
+
+    metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            question=result.question,
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+            retrieved_context=result.reference_context,
+        )
+
+
+class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):
+    """
+    Metric checking answer relevance based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_relevance).
+    """
+
+    metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            question=result.question,
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+        )
+
+
+class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):
+    """
+    Metric checking answer relevance based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_style).
+    """
+
+    metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+            ground_truth_answers=result.reference_answer,
+        )
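These metrics take a judge LLM and run the blocking continuous-eval calls in threads. A sketch of assembling them into a `MetricSet` (the helper name is illustrative, and the judge model is whatever `LLM` implementation you already use):

```python
from ragbits.core.llms.base import LLM
from ragbits.evaluate.metrics import MetricSet
from ragbits.evaluate.metrics.question_answer import (
    QuestionAnswerAnswerCorrectness,
    QuestionAnswerAnswerRelevance,
)


def build_qa_metricset(judge_llm: LLM) -> MetricSet:
    # batch_size controls how many asyncio.to_thread() calls run concurrently per batch.
    return MetricSet(
        QuestionAnswerAnswerCorrectness(llm=judge_llm, weight=1.0),
        QuestionAnswerAnswerRelevance(llm=judge_llm, batch_size=8),
    )
```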
ragbits/evaluate/optimizer.py CHANGED
@@ -61,7 +61,7 @@ class Optimizer(WithConstructionConfig):
        evaluator_config = EvaluatorConfig.model_validate(optimizer_config.evaluator)
 
         dataloader: DataLoader = DataLoader.subclass_from_config(evaluator_config.evaluation.dataloader)
-        metrics: MetricSet = MetricSet.from_config(evaluator_config.evaluation.metrics)
+        metricset: MetricSet = MetricSet.from_config(evaluator_config.evaluation.metrics)
 
         pipeline_class = import_by_path(evaluator_config.evaluation.pipeline.type)
         pipeline_config = dict(evaluator_config.evaluation.pipeline.config)
@@ -71,7 +71,7 @@ class Optimizer(WithConstructionConfig):
         return optimizer.optimize(
             pipeline_class=pipeline_class,
             pipeline_config=pipeline_config,
-            metrics=metrics,
+            metricset=metricset,
             dataloader=dataloader,
             callbacks=callbacks,
         )
@@ -81,7 +81,7 @@ class Optimizer(WithConstructionConfig):
         pipeline_class: type[EvaluationPipeline],
         pipeline_config: dict,
         dataloader: DataLoader,
-        metrics: MetricSet,
+        metricset: MetricSet,
         callbacks: list[Callable] | None = None,
     ) -> list[tuple[dict, float, dict[str, float]]]:
         """
@@ -91,7 +91,7 @@ class Optimizer(WithConstructionConfig):
             pipeline_class: Pipeline to be optimized.
             pipeline_config: Configuration defining the optimization process.
             dataloader: Data loader.
-            metrics: Metrics to be optimized.
+            metricset: Metrics to be optimized.
             callbacks: Experiment callbacks.
 
         Returns:
@@ -104,7 +104,7 @@ class Optimizer(WithConstructionConfig):
                 pipeline_class=pipeline_class,
                 pipeline_config=pipeline_config,
                 dataloader=dataloader,
-                metrics=metrics,
+                metricset=metricset,
             )
 
         study = optuna.create_study(direction=self.direction)
@@ -131,7 +131,7 @@ class Optimizer(WithConstructionConfig):
         pipeline_class: type[EvaluationPipeline],
         pipeline_config: dict,
         dataloader: DataLoader,
-        metrics: MetricSet,
+        metricset: MetricSet,
     ) -> float:
         """
         Run a single experiment.
@@ -153,11 +153,11 @@
                     evaluator.compute(
                         pipeline=pipeline,
                         dataloader=dataloader,
-                        metrics=metrics,
+                        metricset=metricset,
                    )
                )
-                score = sum(results["metrics"].values())
-                metrics_values = results["metrics"]
+                score = sum(results.metrics.values())
+                metrics_values = results.metrics
                 break
             except Exception as exc:
                 message = (
ragbits/evaluate/pipelines/base.py CHANGED
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from dataclasses import dataclass
 from types import ModuleType
 from typing import ClassVar, Generic, TypeVar
@@ -51,7 +52,7 @@ class EvaluationPipeline(WithConstructionConfig, Generic[EvaluationTargetT, Eval
         pass
 
     @abstractmethod
-    async def __call__(self, data: EvaluationDataT) -> EvaluationResultT:
+    async def __call__(self, data: Iterable[EvaluationDataT]) -> Iterable[EvaluationResultT]:
         """
         Run the evaluation pipeline.
 
ragbits/evaluate/pipelines/document_search.py CHANGED
@@ -1,3 +1,5 @@
+import asyncio
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from uuid import uuid4
 
@@ -5,6 +7,7 @@ from typing_extensions import Self
 
 from ragbits.core.sources.hf import HuggingFaceSource
 from ragbits.document_search import DocumentSearch
+from ragbits.document_search.documents.element import Element
 from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
 
 
@@ -14,7 +17,9 @@ class DocumentSearchData(EvaluationData):
     """
 
     question: str
-    reference_passages: list[str]
+    reference_document_ids: list[str | int] | None = None
+    reference_passages: list[str] | None = None
+    reference_page_numbers: list[int] | None = None
 
 
 @dataclass
@@ -24,8 +29,10 @@ class DocumentSearchResult(EvaluationResult):
     """
 
     question: str
-    reference_passages: list[str]
-    predicted_passages: list[str]
+    predicted_elements: Sequence[Element]
+    reference_document_ids: list[str | int] | None = None
+    reference_passages: list[str] | None = None
+    reference_page_numbers: list[int] | None = None
 
 
 class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchData, DocumentSearchResult]):
@@ -60,7 +67,7 @@ class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchDa
         # TODO: optimize this for cases with duplicated document search configs between runs
         if config.get("source"):
             config["vector_store"]["config"]["index_name"] = str(uuid4())
-        evaluation_target = DocumentSearch.from_config(config)
+        evaluation_target: DocumentSearch = DocumentSearch.from_config(config)
         return cls(evaluation_target=evaluation_target, source=config.get("source"))
 
     async def prepare(self) -> None:
@@ -76,21 +83,24 @@ class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchDa
         )
         await self.evaluation_target.ingest(sources)
 
-    async def __call__(self, data: DocumentSearchData) -> DocumentSearchResult:
+    async def __call__(self, data: Iterable[DocumentSearchData]) -> Iterable[DocumentSearchResult]:
         """
         Run the document search evaluation pipeline.
 
         Args:
-            data: The evaluation data.
+            data: The evaluation data batch.
 
         Returns:
-            The evaluation result.
+            The evaluation result batch.
         """
-        elements = await self.evaluation_target.search(data.question)
-        predicted_passages = [element.text_representation for element in elements if element.text_representation]
-
-        return DocumentSearchResult(
-            question=data.question,
-            reference_passages=data.reference_passages,
-            predicted_passages=predicted_passages,
-        )
+        results = await asyncio.gather(*[self.evaluation_target.search(row.question) for row in data])
+        return [
+            DocumentSearchResult(
+                question=row.question,
+                predicted_elements=elements,
+                reference_document_ids=row.reference_document_ids,
+                reference_passages=row.reference_passages,
+                reference_page_numbers=row.reference_page_numbers,
+            )
+            for row, elements in zip(data, results, strict=False)
+        ]
ragbits/evaluate/pipelines/question_answer.py ADDED
@@ -0,0 +1,96 @@
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, Generic
+
+from typing_extensions import Self
+
+from ragbits.agents._main import AgentResult
+from ragbits.agents.types import (
+    QuestionAnswerAgent,
+    QuestionAnswerPromptInput,
+    QuestionAnswerPromptOutputT,
+)
+from ragbits.core.llms.base import LLMClientOptionsT
+from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+class QuestionAnswerData(EvaluationData):
+    """
+    Represents the evaluation data for question answer.
+    """
+
+    question: str
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+@dataclass
+class QuestionAnswerResult(EvaluationResult, Generic[QuestionAnswerPromptOutputT]):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    predicted_result: AgentResult[QuestionAnswerPromptOutputT]
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+class QuestionAnswerPipeline(
+    EvaluationPipeline[
+        QuestionAnswerAgent[LLMClientOptionsT, QuestionAnswerPromptInput, QuestionAnswerPromptOutputT],
+        QuestionAnswerData,
+        QuestionAnswerResult,
+    ]
+):
+    """
+    Question answer evaluation pipeline.
+    """
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `QuestionAnswerPipeline` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the pipeline.
+
+        Returns:
+            An instance of the pipeline class initialized with the provided configuration.
+        """
+        config["evaluation_target"] = QuestionAnswerAgent.from_config(config)
+        return super().from_config(config)
+
+    async def __call__(
+        self, data: Iterable[QuestionAnswerData]
+    ) -> Iterable[QuestionAnswerResult[QuestionAnswerPromptOutputT]]:
+        """
+        Run the question answer evaluation pipeline.
+
+        Args:
+            data: The evaluation data batch.
+
+        Returns:
+            The evaluation result batch.
+        """
+        results = await asyncio.gather(
+            *[
+                self.evaluation_target.run(
+                    QuestionAnswerPromptInput(
+                        question=row.question,
+                        context=row.reference_context,
+                    )
+                )
+                for row in data
+            ]
+        )
+        return [
+            QuestionAnswerResult(
+                question=row.question,
+                predicted_result=result,
+                reference_answer=row.reference_answer,
+                reference_context=row.reference_context,
+            )
+            for row, result in zip(data, results, strict=False)
+        ]
ragbits/evaluate/utils.py CHANGED
@@ -1,5 +1,7 @@
 import json
 import sys
+import traceback
+from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -12,13 +14,15 @@ from neptune.utils import stringify_unsupported
 from neptune_optuna import NeptuneCallback
 from omegaconf import DictConfig
 
+from ragbits.evaluate.evaluator import EvaluatorResult
 
-def log_evaluation_to_file(results: dict, output_dir: Path | None = None) -> Path:
+
+def log_evaluation_to_file(result: EvaluatorResult, output_dir: Path | None = None) -> Path:
     """
-    Log the evaluation results locally.
+    Log the evaluation result locally.
 
     Args:
-        results: The evaluation results.
+        result: The evaluation result.
         output_dir: The output directory.
 
     Returns:
@@ -27,27 +31,57 @@ def log_evaluation_to_file(results: dict, output_dir: Path | None = None) -> Pat
     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
     metrics_file = output_dir / "metrics.json"
     results_file = output_dir / "results.json"
-
-    _save_json(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
-    _save_json(results_file, results=results["results"])
+    errors_file = output_dir / "errors.json"
+
+    _save_json(metrics_file, metrics=result.metrics, time_perf=asdict(result.time_perf))
+    _save_json(results_file, results=[asdict(entry) for entry in result.results])
+    _save_json(
+        errors_file,
+        errors=[
+            {
+                "type": exc.__class__.__name__,
+                "message": str(exc),
+                "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+            }
+            for exc in result.errors
+        ],
+    )
 
     return output_dir
 
 
-def log_evaluation_to_neptune(results: dict, config: DictConfig, tags: str | list[str] | None = None) -> None:
+def log_evaluation_to_neptune(result: EvaluatorResult, config: DictConfig, tags: str | list[str] | None = None) -> None:
     """
-    Log the evaluation results to Neptune.
+    Log the evaluation result to Neptune.
 
     Args:
-        results: Evaluation results.
-        config: Evaluation configuration.
-        tags: Experiment tags.
+        result: The evaluation result.
+        config: The evaluation configuration.
+        tags: The experiment tags.
     """
     run = Run(tags=tags)
     run["config"] = stringify_unsupported(config)
-    run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-    run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-    run["evaluation/results"].upload(File.from_content(json.dumps(results["results"], indent=4), extension="json"))
+    run["evaluation/metrics"] = stringify_unsupported(result.metrics)
+    run["evaluation/time_perf"] = stringify_unsupported(asdict(result.time_perf))
+    run["evaluation/results"].upload(
+        File.from_content(json.dumps([asdict(entry) for entry in result.results], indent=4), extension="json")
+    )
+    run["evaluation/errors"].upload(
+        File.from_content(
+            json.dumps(
+                [
+                    {
+                        "type": exc.__class__.__name__,
+                        "message": str(exc),
+                        "stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
+                    }
+                    for exc in result.errors
+                ],
+                indent=4,
+            ),
            extension="json",
        )
    )
 
 
 def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path:
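Both loggers now take the `EvaluatorResult` object rather than a dict. A small sketch of persisting a finished run (the output directory is arbitrary):

```python
from pathlib import Path

from ragbits.evaluate.evaluator import EvaluatorResult
from ragbits.evaluate.utils import log_evaluation_to_file


def save_run(result: EvaluatorResult) -> Path:
    # Writes metrics.json, results.json, and the new errors.json into the chosen directory.
    return log_evaluation_to_file(result, output_dir=Path("./eval-output"))
```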
ragbits_evaluate-0.17.1.dist-info/METADATA → ragbits_evaluate-0.19.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 0.17.1
+Version: 0.19.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.4.1
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==0.17.1
+Requires-Dist: ragbits-core==0.19.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
ragbits_evaluate-0.17.1.dist-info/RECORD → ragbits_evaluate-0.19.0.dist-info/RECORD CHANGED
@@ -1,14 +1,15 @@
 ragbits/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ragbits/evaluate/cli.py,sha256=MEDo8ubk81TCNx-fq-liF0P5hjn2-kPpIfq54fReKIY,4509
+ragbits/evaluate/cli.py,sha256=vP8l2DyNXpR6jQP83wXKP_RRnGjEXjKnTVBg9RPbDKo,4505
 ragbits/evaluate/config.py,sha256=2WSmbVxyQi893L2FSjRFQoXkWZp1GetcNmR2GCDe0tA,339
-ragbits/evaluate/evaluator.py,sha256=Cif-QX2n5awOGm-AfFy2nRXkb_m4vGY_JZ_o4K4PhZI,5552
-ragbits/evaluate/optimizer.py,sha256=egcU54aADqKrN31NPqj7cNIQO4UISfG7VtkOAQyQUOY,8471
+ragbits/evaluate/evaluator.py,sha256=awRDaDTubHtM_1SANIeE5GhQ0v9fawF0q1Tj6FWZDvQ,7348
+ragbits/evaluate/optimizer.py,sha256=RqYgoiCIEhjXO0HEN6uwldblHyoPuT3qUdncuHPZgCg,8485
 ragbits/evaluate/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ragbits/evaluate/utils.py,sha256=rTTmrP4nv3D7174cMEfohxrDN5thPScH0BsXaptMHqQ,3757
+ragbits/evaluate/utils.py,sha256=w-hbvKRHI9tEva9wKDTVla0Wm2eCHT2MxVkof27Sqfw,4831
 ragbits/evaluate/dataloaders/__init__.py,sha256=UFJFjmvi3GUQFsx6A5sYD01HH2f7TXcHRW2VNM1pmIA,83
-ragbits/evaluate/dataloaders/base.py,sha256=ovL38_tH12q9wd3yeflIlovGuSD8S1X9HUUtwv17QrM,1774
-ragbits/evaluate/dataloaders/document_search.py,sha256=sqNPQf1ZYAqM_xMjuwh63ET00zEmKtAzqXX04cazuB8,1579
+ragbits/evaluate/dataloaders/base.py,sha256=x8rEl5utNOziF_9urL0grkqoXwMgaDWYSM5akw3Kt9Y,3213
+ragbits/evaluate/dataloaders/document_search.py,sha256=c9Bc4ZtFEKAiG9B70JFiBZlZDkBSGNWFRKabF7PMTU0,2495
 ragbits/evaluate/dataloaders/exceptions.py,sha256=xUOBLj1JuCkcqzRVnu0A0I_i1THxbDt2MEDVdDGjDyY,735
+ragbits/evaluate/dataloaders/question_answer.py,sha256=naXFDtla0otOTWSyHVvWvgDYEq-Wry4irnAJR2tHMNg,2032
 ragbits/evaluate/dataset_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ragbits/evaluate/dataset_generator/pipeline.py,sha256=dgnV-Qm0Z7S1Y6ga9-9RscXxxr3krOKsIj7E9WS4ANk,4940
 ragbits/evaluate/dataset_generator/utils.py,sha256=zD-ksXlX62kkIgzBefE4ILsP7He9bHimnZ63LLsMKCA,1325
@@ -23,13 +24,15 @@ ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py,sha256=ydMHyI0JrWZf
 ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ragbits/evaluate/dataset_generator/tasks/text_generation/base.py,sha256=2h-Y14H3fRHKbTNvXWKRus8t0hdTITd9LMoIFVwfKfA,2138
 ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py,sha256=QAClPbTVNCe4QzVOGuepRnsmkt9ZF6bXBAuJI2elRuE,3851
-ragbits/evaluate/factories/__init__.py,sha256=De2ZgQ4YXgvpMOvm81fSDPSMvKpIBjS-aqeE0dxEU1s,2074
+ragbits/evaluate/factories/__init__.py,sha256=7nh0J80EfqMWRGtHx4hkfHNMztfC6FMhH8gHumwcH9w,1727
 ragbits/evaluate/metrics/__init__.py,sha256=Mr83ytGyvdXtBlr7Bbo0-5auE0530xsd3wffKSIf8cE,95
-ragbits/evaluate/metrics/base.py,sha256=axkGuKJU5u94SnRjpWsdG4jFWjy8rmkSHVRcgz1JLTo,2342
-ragbits/evaluate/metrics/document_search.py,sha256=WeC0xuLYci_Vbdw-E4OjawTqmLkcFKjDWSJGITC9-AQ,2851
+ragbits/evaluate/metrics/base.py,sha256=bOscQ_nJXLGWmP2ls9jncrUoeghNBnKDJsab71pFEjo,2519
+ragbits/evaluate/metrics/document_search.py,sha256=MfvMwEPenqiJdKYuW6WLvmtMch9ZVYb0T6ibpOF3vGI,3189
+ragbits/evaluate/metrics/question_answer.py,sha256=_XMFjkJcG-xdOO2fCfoKIhJb5VVM_GK_yKhFGXO8FRM,6566
 ragbits/evaluate/pipelines/__init__.py,sha256=Bqp_L7aRq12Ua19ELZDsdYvra6-GlLrQ9cIG2IWArko,1294
-ragbits/evaluate/pipelines/base.py,sha256=1GPu3MV-2o0PdUuFM4IcLeg1baYv9acqCcGrQykmRSs,1682
-ragbits/evaluate/pipelines/document_search.py,sha256=xMcSnahy7fifk2bJoolX9OWCXz4FjSJQfBDHIB1d2mQ,3266
-ragbits_evaluate-0.17.1.dist-info/METADATA,sha256=Kcrrm9-4VOIfvHk4q0J9jTxqTZYGCG4nIQdFUY7NJGU,2300
-ragbits_evaluate-0.17.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ragbits_evaluate-0.17.1.dist-info/RECORD,,
+ragbits/evaluate/pipelines/base.py,sha256=QV3fjPnbJjeCgcbt8yV1Ho3BamEUc3wSca3MAzaBlV0,1739
+ragbits/evaluate/pipelines/document_search.py,sha256=tgk-I21eshdBbWVsuNa1zWK_fWuDNXhhMCn1_Fdu_Ko,3840
+ragbits/evaluate/pipelines/question_answer.py,sha256=3CYVHDLnOy4z7kgYPMluiJ8POulHo-w3PEiqvqsF4Dc,2797
+ragbits_evaluate-0.19.0.dist-info/METADATA,sha256=h-R6_pE37yh9C7RgpMyb2NS6F_l-wrNEdou2MrhZoOk,2300
+ragbits_evaluate-0.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ragbits_evaluate-0.19.0.dist-info/RECORD,,