ragbits-evaluate 0.17.1__tar.gz → 0.19.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ragbits-evaluate might be problematic. Click here for more details.
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/CHANGELOG.md +22 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/PKG-INFO +2 -2
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/pyproject.toml +2 -2
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/cli.py +2 -2
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataloaders/base.py +36 -3
- ragbits_evaluate-0.19.0/src/ragbits/evaluate/dataloaders/document_search.py +61 -0
- ragbits_evaluate-0.19.0/src/ragbits/evaluate/dataloaders/question_answer.py +57 -0
- ragbits_evaluate-0.19.0/src/ragbits/evaluate/evaluator.py +224 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/factories/__init__.py +11 -26
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/metrics/base.py +8 -4
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/metrics/document_search.py +13 -2
- ragbits_evaluate-0.19.0/src/ragbits/evaluate/metrics/question_answer.py +182 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/optimizer.py +9 -9
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/pipelines/base.py +2 -1
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/pipelines/document_search.py +25 -15
- ragbits_evaluate-0.19.0/src/ragbits/evaluate/pipelines/question_answer.py +96 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/utils.py +48 -14
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/tests/cli/test_run_evaluation.py +4 -4
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/tests/unit/test_evaluator.py +41 -18
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/tests/unit/test_metrics.py +59 -25
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/tests/unit/test_optimizer.py +14 -7
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/document_search.py +0 -45
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/evaluator.py +0 -179
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/.gitignore +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/README.md +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/config.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
- {ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/py.typed +0 -0
|
@@ -2,6 +2,28 @@
|
|
|
2
2
|
|
|
3
3
|
## Unreleased
|
|
4
4
|
|
|
5
|
+
## 0.19.0 (2025-05-27)
|
|
6
|
+
|
|
7
|
+
### Changed
|
|
8
|
+
|
|
9
|
+
- ragbits-core updated to version v0.19.0
|
|
10
|
+
|
|
11
|
+
- Add evals for question answering (#577)
|
|
12
|
+
- Add support for slicing dataset (#576)
|
|
13
|
+
- Separate load and map ops in data loaders (#576)
|
|
14
|
+
|
|
15
|
+
## 0.18.0 (2025-05-22)
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- ragbits-core updated to version v0.18.0
|
|
20
|
+
|
|
21
|
+
- Add support for custom column names in evaluation dataset (#566)
|
|
22
|
+
- Add support for reference document ids and page numbers in evaluation dataset (#566)
|
|
23
|
+
- BREAKING CHANGE: Adjust eval pipeline interface to batch processing (#555)
|
|
24
|
+
- Rename DocumentMeta create_text_document_from_literal to from_literal (#561)
|
|
25
|
+
- Adjust typing for DocumentSearch (#554)
|
|
26
|
+
|
|
5
27
|
## 0.17.1 (2025-05-09)
|
|
6
28
|
|
|
7
29
|
### Changed
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragbits-evaluate
|
|
3
|
-
Version: 0.17.1
|
|
3
|
+
Version: 0.19.0
|
|
4
4
|
Summary: Evaluation module for Ragbits components
|
|
5
5
|
Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
|
|
6
6
|
Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
|
|
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.4.1
|
|
|
27
27
|
Requires-Dist: hydra-core<2.0.0,>=1.3.2
|
|
28
28
|
Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
|
|
29
29
|
Requires-Dist: optuna<5.0.0,>=4.0.0
|
|
30
|
-
Requires-Dist: ragbits-core==0.17.1
|
|
30
|
+
Requires-Dist: ragbits-core==0.19.0
|
|
31
31
|
Provides-Extra: relari
|
|
32
32
|
Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
|
|
33
33
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "ragbits-evaluate"
|
|
3
|
-
version = "0.17.1"
|
|
3
|
+
version = "0.19.0"
|
|
4
4
|
description = "Evaluation module for Ragbits components"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -32,7 +32,7 @@ classifiers = [
|
|
|
32
32
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
33
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
34
34
|
]
|
|
35
|
-
dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.4.1,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==0.17.1"]
|
|
35
|
+
dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.4.1,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==0.19.0"]
|
|
36
36
|
|
|
37
37
|
[project.urls]
|
|
38
38
|
"Homepage" = "https://github.com/deepsense-ai/ragbits"
|
|
@@ -140,10 +140,10 @@ def run() -> None:
|
|
|
140
140
|
metric_results = await evaluator.compute(
|
|
141
141
|
pipeline=state.pipeline,
|
|
142
142
|
dataloader=state.dataloader,
|
|
143
|
-
|
|
143
|
+
metricset=state.metrics,
|
|
144
144
|
)
|
|
145
145
|
evaluation_results = EvaluationResult(
|
|
146
|
-
metrics={"metrics": metric_results
|
|
146
|
+
metrics={"metrics": metric_results.metrics, "time_perf": metric_results.time_perf}
|
|
147
147
|
)
|
|
148
148
|
print_output(evaluation_results)
|
|
149
149
|
|
{ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/dataloaders/base.py
RENAMED
|
@@ -3,12 +3,14 @@ from collections.abc import Iterable
|
|
|
3
3
|
from types import ModuleType
|
|
4
4
|
from typing import ClassVar, Generic
|
|
5
5
|
|
|
6
|
+
from datasets import load_dataset
|
|
6
7
|
from pydantic import BaseModel
|
|
7
8
|
from typing_extensions import Self
|
|
8
9
|
|
|
9
10
|
from ragbits.core.sources.base import Source
|
|
10
11
|
from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
|
|
11
12
|
from ragbits.evaluate import dataloaders
|
|
13
|
+
from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
|
|
12
14
|
from ragbits.evaluate.pipelines.base import EvaluationDataT
|
|
13
15
|
|
|
14
16
|
|
|
@@ -28,14 +30,19 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
|
|
|
28
30
|
default_module: ClassVar[ModuleType | None] = dataloaders
|
|
29
31
|
configuration_key: ClassVar[str] = "dataloader"
|
|
30
32
|
|
|
31
|
-
def __init__(self, source: Source) -> None:
|
|
33
|
+
def __init__(self, source: Source, *, split: str = "data", required_keys: set[str] | None = None) -> None:
|
|
32
34
|
"""
|
|
33
35
|
Initialize the data loader.
|
|
34
36
|
|
|
35
37
|
Args:
|
|
36
38
|
source: The source to load the evaluation data from.
|
|
39
|
+
split: The split to load the data from. Split is fixed for data loaders to "data",
|
|
40
|
+
but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
|
|
41
|
+
required_keys: The required columns for the evaluation data.
|
|
37
42
|
"""
|
|
38
43
|
self.source = source
|
|
44
|
+
self.split = split
|
|
45
|
+
self.required_keys = required_keys or set()
|
|
39
46
|
|
|
40
47
|
@classmethod
|
|
41
48
|
def from_config(cls, config: dict) -> Self:
|
|
@@ -52,11 +59,37 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
|
|
|
52
59
|
config["source"] = Source.subclass_from_config(dataloader_config.source)
|
|
53
60
|
return super().from_config(config)
|
|
54
61
|
|
|
55
|
-
@abstractmethod
|
|
56
62
|
async def load(self) -> Iterable[EvaluationDataT]:
|
|
57
63
|
"""
|
|
58
64
|
Load the data.
|
|
59
65
|
|
|
60
66
|
Returns:
|
|
61
|
-
The loaded data.
|
|
67
|
+
The loaded evaluation data.
|
|
68
|
+
|
|
69
|
+
Raises:
|
|
70
|
+
DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
|
|
71
|
+
"""
|
|
72
|
+
data_path = await self.source.fetch()
|
|
73
|
+
dataset = load_dataset(
|
|
74
|
+
path=str(data_path.parent),
|
|
75
|
+
data_files={"data": str(data_path.name)},
|
|
76
|
+
split=self.split,
|
|
77
|
+
)
|
|
78
|
+
if not self.required_keys.issubset(dataset.features):
|
|
79
|
+
raise DataLoaderIncorrectFormatDataError(
|
|
80
|
+
required_features=list(self.required_keys),
|
|
81
|
+
data_path=data_path,
|
|
82
|
+
)
|
|
83
|
+
return await self.map(dataset.to_list())
|
|
84
|
+
|
|
85
|
+
@abstractmethod
|
|
86
|
+
async def map(self, dataset: Iterable[dict]) -> Iterable[EvaluationDataT]:
|
|
87
|
+
"""
|
|
88
|
+
Map the dataset to the evaluation data.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
dataset: The dataset to map.
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
The evaluation data.
|
|
62
95
|
"""
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from ragbits.core.sources.base import Source
|
|
4
|
+
from ragbits.evaluate.dataloaders.base import DataLoader
|
|
5
|
+
from ragbits.evaluate.pipelines.document_search import DocumentSearchData
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
|
|
9
|
+
"""
|
|
10
|
+
Document search evaluation data loader.
|
|
11
|
+
|
|
12
|
+
The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
source: Source,
|
|
18
|
+
*,
|
|
19
|
+
split: str = "data",
|
|
20
|
+
question_key: str = "question",
|
|
21
|
+
document_ids_key: str = "document_ids",
|
|
22
|
+
passages_key: str = "passages",
|
|
23
|
+
page_numbers_key: str = "page_numbers",
|
|
24
|
+
) -> None:
|
|
25
|
+
"""
|
|
26
|
+
Initialize the document search data loader.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
source: The source to load the data from.
|
|
30
|
+
split: The split to load the data from. Split is fixed for data loaders to "data",
|
|
31
|
+
but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
|
|
32
|
+
question_key: The dataset column name that contains the question.
|
|
33
|
+
document_ids_key: The dataset column name that contains the document ids. Document ids are optional.
|
|
34
|
+
passages_key: The dataset column name that contains the passages. Passages are optional.
|
|
35
|
+
page_numbers_key: The dataset column name that contains the page numbers. Page numbers are optional.
|
|
36
|
+
"""
|
|
37
|
+
super().__init__(source=source, split=split, required_keys={question_key})
|
|
38
|
+
self.question_key = question_key
|
|
39
|
+
self.document_ids_key = document_ids_key
|
|
40
|
+
self.passages_key = passages_key
|
|
41
|
+
self.page_numbers_key = page_numbers_key
|
|
42
|
+
|
|
43
|
+
async def map(self, dataset: Iterable[dict]) -> Iterable[DocumentSearchData]:
|
|
44
|
+
"""
|
|
45
|
+
Map the dataset to the document search data schema.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
dataset: The dataset to map.
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
The document search data.
|
|
52
|
+
"""
|
|
53
|
+
return [
|
|
54
|
+
DocumentSearchData(
|
|
55
|
+
question=data.get(self.question_key, ""),
|
|
56
|
+
reference_document_ids=data.get(self.document_ids_key),
|
|
57
|
+
reference_passages=data.get(self.passages_key),
|
|
58
|
+
reference_page_numbers=data.get(self.page_numbers_key),
|
|
59
|
+
)
|
|
60
|
+
for data in dataset
|
|
61
|
+
]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
|
|
3
|
+
from ragbits.core.sources.base import Source
|
|
4
|
+
from ragbits.evaluate.dataloaders.base import DataLoader
|
|
5
|
+
from ragbits.evaluate.pipelines.question_answer import QuestionAnswerData
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
|
|
9
|
+
"""
|
|
10
|
+
Question answer evaluation data loader.
|
|
11
|
+
|
|
12
|
+
The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
source: Source,
|
|
18
|
+
*,
|
|
19
|
+
split: str = "data",
|
|
20
|
+
question_key: str = "question",
|
|
21
|
+
answer_key: str = "answer",
|
|
22
|
+
context_key: str = "context",
|
|
23
|
+
) -> None:
|
|
24
|
+
"""
|
|
25
|
+
Initialize the question answer data loader.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
source: The source to load the data from.
|
|
29
|
+
split: The split to load the data from.
|
|
30
|
+
required_keys: The required keys to load the data from.
|
|
31
|
+
question_key: The dataset column name that contains the question.
|
|
32
|
+
answer_key: The dataset column name that contains the answer.
|
|
33
|
+
context_key: The dataset column name that contains the context. Context is optional.
|
|
34
|
+
"""
|
|
35
|
+
super().__init__(source=source, split=split, required_keys={question_key, answer_key})
|
|
36
|
+
self.question_key = question_key
|
|
37
|
+
self.answer_key = answer_key
|
|
38
|
+
self.context_key = context_key
|
|
39
|
+
|
|
40
|
+
async def map(self, dataset: Iterable[dict]) -> Iterable[QuestionAnswerData]:
|
|
41
|
+
"""
|
|
42
|
+
Map the dataset to the question answer data schema.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
dataset: The dataset to map.
|
|
46
|
+
|
|
47
|
+
Returns:
|
|
48
|
+
The question answer data.
|
|
49
|
+
"""
|
|
50
|
+
return [
|
|
51
|
+
QuestionAnswerData(
|
|
52
|
+
question=data.get(self.question_key, ""),
|
|
53
|
+
reference_answer=data.get(self.answer_key, ""),
|
|
54
|
+
reference_context=data.get(self.context_key),
|
|
55
|
+
)
|
|
56
|
+
for data in dataset
|
|
57
|
+
]
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import random
|
|
3
|
+
import time
|
|
4
|
+
from collections.abc import Awaitable, Callable, Iterable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Generic, ParamSpec, TypeVar
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
|
|
12
|
+
from ragbits.core.utils.helpers import batched
|
|
13
|
+
from ragbits.evaluate.dataloaders.base import DataLoader
|
|
14
|
+
from ragbits.evaluate.metrics.base import MetricSet
|
|
15
|
+
from ragbits.evaluate.pipelines.base import EvaluationDataT, EvaluationPipeline, EvaluationResultT, EvaluationTargetT
|
|
16
|
+
|
|
17
|
+
_CallP = ParamSpec("_CallP")
|
|
18
|
+
_CallReturnT = TypeVar("_CallReturnT")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class EvaluationTimePerf:
|
|
23
|
+
"""
|
|
24
|
+
Container for evaluation time performance metrics.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
total_time_in_seconds: float
|
|
28
|
+
samples_per_second: float
|
|
29
|
+
latency_in_seconds: float
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
|
|
33
|
+
class EvaluatorResult(Generic[EvaluationResultT]):
|
|
34
|
+
"""
|
|
35
|
+
Container for evaluation results.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
metrics: dict[str, int | float]
|
|
39
|
+
results: list[EvaluationResultT]
|
|
40
|
+
errors: list[Exception]
|
|
41
|
+
time_perf: EvaluationTimePerf
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class EvaluationConfig(BaseModel):
|
|
45
|
+
"""
|
|
46
|
+
Schema for the evaluation run config.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
pipeline: ObjectConstructionConfig
|
|
50
|
+
dataloader: ObjectConstructionConfig
|
|
51
|
+
metrics: dict[str, ObjectConstructionConfig]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class EvaluatorConfig(BaseModel):
|
|
55
|
+
"""
|
|
56
|
+
Schema for the evaluator config.
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
evaluation: EvaluationConfig
|
|
60
|
+
evaluator: dict | None = None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Evaluator(WithConstructionConfig):
|
|
64
|
+
"""
|
|
65
|
+
Evaluator class.
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
batch_size: int = 10,
|
|
71
|
+
num_retries: int = 3,
|
|
72
|
+
backoff_multiplier: int = 1,
|
|
73
|
+
backoff_max: int = 60,
|
|
74
|
+
) -> None:
|
|
75
|
+
"""
|
|
76
|
+
Initialize the Evaluator instance.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
batch_size: batch size for the evaluation pipeline inference.
|
|
80
|
+
num_retries: The number of retries per evaluation pipeline inference error.
|
|
81
|
+
backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
|
|
82
|
+
backoff_max: The maximum allowed delay (in seconds) between retries.
|
|
83
|
+
"""
|
|
84
|
+
self.batch_size = batch_size
|
|
85
|
+
self.num_retries = num_retries
|
|
86
|
+
self.backoff_multiplier = backoff_multiplier
|
|
87
|
+
self.backoff_max = backoff_max
|
|
88
|
+
|
|
89
|
+
@classmethod
|
|
90
|
+
async def run_from_config(cls, config: dict) -> EvaluatorResult:
|
|
91
|
+
"""
|
|
92
|
+
Run the evaluation based on configuration.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
config: Evaluation config.
|
|
96
|
+
|
|
97
|
+
Returns:
|
|
98
|
+
The evaluation results.
|
|
99
|
+
"""
|
|
100
|
+
evaluator_config = EvaluatorConfig.model_validate(config)
|
|
101
|
+
evaluation_config = EvaluationConfig.model_validate(evaluator_config.evaluation)
|
|
102
|
+
pipeline: EvaluationPipeline = EvaluationPipeline.subclass_from_config(evaluation_config.pipeline)
|
|
103
|
+
dataloader: DataLoader = DataLoader.subclass_from_config(evaluation_config.dataloader)
|
|
104
|
+
metricset: MetricSet = MetricSet.from_config(evaluation_config.metrics)
|
|
105
|
+
|
|
106
|
+
evaluator = cls.from_config(evaluator_config.evaluator or {})
|
|
107
|
+
return await evaluator.compute(
|
|
108
|
+
pipeline=pipeline,
|
|
109
|
+
dataloader=dataloader,
|
|
110
|
+
metricset=metricset,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
async def compute(
|
|
114
|
+
self,
|
|
115
|
+
pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
|
|
116
|
+
dataloader: DataLoader[EvaluationDataT],
|
|
117
|
+
metricset: MetricSet[EvaluationResultT],
|
|
118
|
+
) -> EvaluatorResult[EvaluationResultT]:
|
|
119
|
+
"""
|
|
120
|
+
Compute the evaluation results for the given pipeline and data.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
pipeline: The pipeline to be evaluated.
|
|
124
|
+
dataloader: The dataloader to load the data.
|
|
125
|
+
metricset: The metrics to be computed.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
The evaluation results.
|
|
129
|
+
"""
|
|
130
|
+
await pipeline.prepare()
|
|
131
|
+
|
|
132
|
+
dataset = await dataloader.load()
|
|
133
|
+
results, errors, time_perf = await self._call_pipeline(pipeline, dataset)
|
|
134
|
+
metrics = await metricset.compute(results)
|
|
135
|
+
|
|
136
|
+
return EvaluatorResult(
|
|
137
|
+
metrics=metrics,
|
|
138
|
+
results=results,
|
|
139
|
+
errors=errors,
|
|
140
|
+
time_perf=time_perf,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
async def _call_pipeline(
|
|
144
|
+
self,
|
|
145
|
+
pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
|
|
146
|
+
dataset: Iterable[EvaluationDataT],
|
|
147
|
+
) -> tuple[list[EvaluationResultT], list[Exception], EvaluationTimePerf]:
|
|
148
|
+
"""
|
|
149
|
+
Call the pipeline with the given data.
|
|
150
|
+
|
|
151
|
+
Args:
|
|
152
|
+
pipeline: The pipeline to be called.
|
|
153
|
+
dataset: The dataset to be processed.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
The evaluation results and performance metrics.
|
|
157
|
+
"""
|
|
158
|
+
start_time = time.perf_counter()
|
|
159
|
+
outputs = [
|
|
160
|
+
await self._call_with_error_handling(pipeline, data)
|
|
161
|
+
for data in tqdm(batched(dataset, self.batch_size), desc="Evaluation")
|
|
162
|
+
]
|
|
163
|
+
end_time = time.perf_counter()
|
|
164
|
+
|
|
165
|
+
errors = [output for output in outputs if isinstance(output, Exception)]
|
|
166
|
+
results = [item for output in outputs if not isinstance(output, Exception) for item in output]
|
|
167
|
+
|
|
168
|
+
return results, errors, self._compute_time_perf(start_time, end_time, len(outputs))
|
|
169
|
+
|
|
170
|
+
async def _call_with_error_handling(
|
|
171
|
+
self,
|
|
172
|
+
executable: Callable[_CallP, Awaitable[_CallReturnT]],
|
|
173
|
+
*executable_args: _CallP.args,
|
|
174
|
+
**executable_kwargs: _CallP.kwargs,
|
|
175
|
+
) -> _CallReturnT | Exception:
|
|
176
|
+
"""
|
|
177
|
+
Call executable with standardized error handling.
|
|
178
|
+
If an error occurs, the executable is retried `num_retries` times using randomized exponential backoff.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
executable: The callable function to execute.
|
|
182
|
+
executable_args: Positional arguments to pass to the executable.
|
|
183
|
+
executable_kwargs: Keyword arguments to pass to the executable.
|
|
184
|
+
|
|
185
|
+
Returns:
|
|
186
|
+
The result of the executable if successful.
|
|
187
|
+
|
|
188
|
+
Raises:
|
|
189
|
+
Exception: The last encountered exception after all retries are exhausted.
|
|
190
|
+
"""
|
|
191
|
+
for i in range(max(0, self.num_retries) + 1):
|
|
192
|
+
try:
|
|
193
|
+
return await executable(*executable_args, **executable_kwargs)
|
|
194
|
+
except Exception as exc:
|
|
195
|
+
if i == self.num_retries:
|
|
196
|
+
return exc
|
|
197
|
+
|
|
198
|
+
delay = random.uniform(0, min(2**i * self.backoff_multiplier, self.backoff_max)) # noqa: S311
|
|
199
|
+
await asyncio.sleep(delay)
|
|
200
|
+
|
|
201
|
+
raise RuntimeError("Unreachable code reached") # mypy quirk
|
|
202
|
+
|
|
203
|
+
@staticmethod
|
|
204
|
+
def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> EvaluationTimePerf:
|
|
205
|
+
"""
|
|
206
|
+
Compute the performance metrics.
|
|
207
|
+
|
|
208
|
+
Args:
|
|
209
|
+
start_time: The start time.
|
|
210
|
+
end_time: The end time.
|
|
211
|
+
num_samples: The number of samples.
|
|
212
|
+
|
|
213
|
+
Returns:
|
|
214
|
+
The performance metrics.
|
|
215
|
+
"""
|
|
216
|
+
latency = end_time - start_time
|
|
217
|
+
throughput = num_samples / latency
|
|
218
|
+
latency_sample = 1.0 / throughput if throughput > 0 else 0.0
|
|
219
|
+
|
|
220
|
+
return EvaluationTimePerf(
|
|
221
|
+
total_time_in_seconds=latency,
|
|
222
|
+
samples_per_second=throughput,
|
|
223
|
+
latency_in_seconds=latency_sample,
|
|
224
|
+
)
|
{ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/factories/__init__.py
RENAMED
|
@@ -1,43 +1,21 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
|
|
3
|
+
from continuous_eval.metrics.retrieval.matching_strategy import RougeChunkMatch
|
|
3
4
|
from datasets import load_dataset
|
|
4
5
|
|
|
5
6
|
from ragbits.core.embeddings.dense import LiteLLMEmbedder
|
|
6
7
|
from ragbits.core.sources.hf import HuggingFaceSource
|
|
7
|
-
from ragbits.core.utils.config_handling import ObjectConstructionConfig
|
|
8
8
|
from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
|
|
9
9
|
from ragbits.document_search import DocumentSearch
|
|
10
10
|
from ragbits.document_search.documents.document import DocumentMeta
|
|
11
11
|
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader
|
|
12
12
|
from ragbits.evaluate.metrics import MetricSet
|
|
13
|
-
|
|
14
|
-
DS_PRECISION_RECALL_F1 = {
|
|
15
|
-
"precision_recall_f1": ObjectConstructionConfig.model_validate(
|
|
16
|
-
{
|
|
17
|
-
"type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
|
|
18
|
-
"config": {
|
|
19
|
-
"matching_strategy": {
|
|
20
|
-
"type": "RougeChunkMatch",
|
|
21
|
-
"config": {
|
|
22
|
-
"threshold": 0.5,
|
|
23
|
-
},
|
|
24
|
-
},
|
|
25
|
-
},
|
|
26
|
-
}
|
|
27
|
-
),
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def precision_recall_f1() -> MetricSet:
|
|
32
|
-
"""
|
|
33
|
-
Factory of precision recall f1 metric set for retrieval evaluation.
|
|
34
|
-
"""
|
|
35
|
-
return MetricSet.from_config(config=DS_PRECISION_RECALL_F1)
|
|
13
|
+
from ragbits.evaluate.metrics.document_search import DocumentSearchPrecisionRecallF1
|
|
36
14
|
|
|
37
15
|
|
|
38
16
|
async def _add_example_documents(document_search: DocumentSearch) -> None:
|
|
39
17
|
dataset = load_dataset(path="deepsense-ai/synthetic-rag-dataset_v1.0", split="train")
|
|
40
|
-
documents = [DocumentMeta.create_text_document_from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
|
|
18
|
+
documents = [DocumentMeta.from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
|
|
41
19
|
await document_search.ingest(documents)
|
|
42
20
|
|
|
43
21
|
|
|
@@ -45,7 +23,7 @@ def basic_document_search_factory() -> DocumentSearch:
|
|
|
45
23
|
"""
|
|
46
24
|
Factory for basic example document search instance.
|
|
47
25
|
"""
|
|
48
|
-
document_search = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
|
|
26
|
+
document_search: DocumentSearch = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
|
|
49
27
|
asyncio.run(_add_example_documents(document_search))
|
|
50
28
|
return document_search
|
|
51
29
|
|
|
@@ -55,3 +33,10 @@ def synthetic_rag_dataset() -> DocumentSearchDataLoader:
|
|
|
55
33
|
Factory for synthetic RAG dataset.
|
|
56
34
|
"""
|
|
57
35
|
return DocumentSearchDataLoader(source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def precision_recall_f1() -> MetricSet:
|
|
39
|
+
"""
|
|
40
|
+
Factory of precision recall f1 metric set for retrieval evaluation.
|
|
41
|
+
"""
|
|
42
|
+
return MetricSet(DocumentSearchPrecisionRecallF1(matching_strategy=RougeChunkMatch()))
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
2
3
|
from types import ModuleType
|
|
3
4
|
from typing import ClassVar, Generic
|
|
@@ -19,7 +20,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
|
|
|
19
20
|
|
|
20
21
|
def __init__(self, weight: float = 1.0) -> None:
|
|
21
22
|
"""
|
|
22
|
-
|
|
23
|
+
Initialize the metric.
|
|
23
24
|
|
|
24
25
|
Args:
|
|
25
26
|
weight: Metric value weight in the final score, used during optimization.
|
|
@@ -28,7 +29,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
|
|
|
28
29
|
self.weight = weight
|
|
29
30
|
|
|
30
31
|
@abstractmethod
|
|
31
|
-
def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
32
|
+
async def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
32
33
|
"""
|
|
33
34
|
Compute the metric.
|
|
34
35
|
|
|
@@ -70,7 +71,7 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
|
|
|
70
71
|
"""
|
|
71
72
|
return cls(*[Metric.subclass_from_config(metric_config) for metric_config in config.values()])
|
|
72
73
|
|
|
73
|
-
def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
74
|
+
async def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
74
75
|
"""
|
|
75
76
|
Compute the metrics.
|
|
76
77
|
|
|
@@ -80,6 +81,9 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
|
|
|
80
81
|
Returns:
|
|
81
82
|
The computed metrics.
|
|
82
83
|
"""
|
|
84
|
+
metric_results = await asyncio.gather(*[metric.compute(results) for metric in self.metrics])
|
|
83
85
|
return {
|
|
84
|
-
name: metric.weight * value
|
|
86
|
+
name: metric.weight * value
|
|
87
|
+
for metric, result in zip(self.metrics, metric_results, strict=False)
|
|
88
|
+
for name, value in result.items()
|
|
85
89
|
}
|
{ragbits_evaluate-0.17.1 → ragbits_evaluate-0.19.0}/src/ragbits/evaluate/metrics/document_search.py
RENAMED
|
@@ -46,7 +46,7 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
|
|
|
46
46
|
matching_strategy = matching_strategy_cls(**config["matching_strategy"]["config"])
|
|
47
47
|
return cls(matching_strategy=matching_strategy, weight=config.get("weight", 1.0))
|
|
48
48
|
|
|
49
|
-
def compute(self, results: list[DocumentSearchResult]) -> dict:
|
|
49
|
+
async def compute(self, results: list[DocumentSearchResult]) -> dict:
|
|
50
50
|
"""
|
|
51
51
|
Compute the metric.
|
|
52
52
|
|
|
@@ -57,7 +57,18 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
|
|
|
57
57
|
The computed metric.
|
|
58
58
|
"""
|
|
59
59
|
return self.metric.aggregate(
|
|
60
|
-
[
|
|
60
|
+
[
|
|
61
|
+
self.metric(
|
|
62
|
+
[
|
|
63
|
+
element.text_representation
|
|
64
|
+
for element in result.predicted_elements
|
|
65
|
+
if element.text_representation
|
|
66
|
+
],
|
|
67
|
+
result.reference_passages,
|
|
68
|
+
)
|
|
69
|
+
for result in results
|
|
70
|
+
if result.reference_passages is not None
|
|
71
|
+
]
|
|
61
72
|
)
|
|
62
73
|
|
|
63
74
|
|