ragbits-evaluate 0.18.0__py3-none-any.whl → 0.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ragbits-evaluate might be problematic.
- ragbits/evaluate/dataloaders/base.py +36 -3
- ragbits/evaluate/dataloaders/document_search.py +13 -25
- ragbits/evaluate/dataloaders/question_answer.py +57 -0
- ragbits/evaluate/metrics/question_answer.py +182 -0
- ragbits/evaluate/pipelines/question_answer.py +96 -0
- {ragbits_evaluate-0.18.0.dist-info → ragbits_evaluate-0.19.1.dist-info}/METADATA +2 -2
- {ragbits_evaluate-0.18.0.dist-info → ragbits_evaluate-0.19.1.dist-info}/RECORD +8 -5
- {ragbits_evaluate-0.18.0.dist-info → ragbits_evaluate-0.19.1.dist-info}/WHEEL +0 -0
ragbits/evaluate/dataloaders/base.py

```diff
@@ -3,12 +3,14 @@ from collections.abc import Iterable
 from types import ModuleType
 from typing import ClassVar, Generic
 
+from datasets import load_dataset
 from pydantic import BaseModel
 from typing_extensions import Self
 
 from ragbits.core.sources.base import Source
 from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
 from ragbits.evaluate import dataloaders
+from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
 from ragbits.evaluate.pipelines.base import EvaluationDataT
 
 
@@ -28,14 +30,19 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
     default_module: ClassVar[ModuleType | None] = dataloaders
     configuration_key: ClassVar[str] = "dataloader"
 
-    def __init__(self, source: Source) -> None:
+    def __init__(self, source: Source, *, split: str = "data", required_keys: set[str] | None = None) -> None:
         """
         Initialize the data loader.
 
         Args:
             source: The source to load the evaluation data from.
+            split: The split to load the data from. Split is fixed for data loaders to "data",
+                but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
+            required_keys: The required columns for the evaluation data.
         """
         self.source = source
+        self.split = split
+        self.required_keys = required_keys or set()
 
     @classmethod
     def from_config(cls, config: dict) -> Self:
@@ -52,11 +59,37 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
         config["source"] = Source.subclass_from_config(dataloader_config.source)
         return super().from_config(config)
 
-    @abstractmethod
     async def load(self) -> Iterable[EvaluationDataT]:
         """
         Load the data.
 
         Returns:
-            The loaded data.
+            The loaded evaluation data.
+
+        Raises:
+            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+        """
+        data_path = await self.source.fetch()
+        dataset = load_dataset(
+            path=str(data_path.parent),
+            data_files={"data": str(data_path.name)},
+            split=self.split,
+        )
+        if not self.required_keys.issubset(dataset.features):
+            raise DataLoaderIncorrectFormatDataError(
+                required_features=list(self.required_keys),
+                data_path=data_path,
+            )
+        return await self.map(dataset.to_list())
+
+    @abstractmethod
+    async def map(self, dataset: Iterable[dict]) -> Iterable[EvaluationDataT]:
+        """
+        Map the dataset to the evaluation data.
+
+        Args:
+            dataset: The dataset to map.
+
+        Returns:
+            The evaluation data.
         """
```
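The refactor above moves source fetching, `load_dataset` handling, split selection, and required-column validation into the base `load()`, so concrete loaders now only implement `map()`. A minimal sketch of a custom loader written against this new API — the `TranscriptData` model and its column names are hypothetical, not part of ragbits:

```python
from collections.abc import Iterable

from ragbits.core.sources.base import Source
from ragbits.evaluate.dataloaders.base import DataLoader
from ragbits.evaluate.pipelines.base import EvaluationData


class TranscriptData(EvaluationData):
    """Hypothetical evaluation-data model used only for this sketch."""

    text: str
    summary: str | None = None


class TranscriptDataLoader(DataLoader[TranscriptData]):
    """Loads transcript rows; fetching and validation are inherited from DataLoader.load()."""

    def __init__(self, source: Source, *, split: str = "data") -> None:
        # required_keys is checked against the dataset columns by the inherited load()
        super().__init__(source=source, split=split, required_keys={"text"})

    async def map(self, dataset: Iterable[dict]) -> Iterable[TranscriptData]:
        # Only the dict -> model mapping is loader-specific now
        return [TranscriptData(text=row["text"], summary=row.get("summary")) for row in dataset]
```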
ragbits/evaluate/dataloaders/document_search.py

```diff
@@ -1,10 +1,7 @@
 from collections.abc import Iterable
 
-from datasets import load_dataset
-
 from ragbits.core.sources.base import Source
 from ragbits.evaluate.dataloaders.base import DataLoader
-from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
 from ragbits.evaluate.pipelines.document_search import DocumentSearchData
 
 
@@ -12,13 +9,14 @@ class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
     """
     Document search evaluation data loader.
 
-    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files)
-    and contain the following features: "question, "passages".
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
     """
 
     def __init__(
         self,
         source: Source,
+        *,
+        split: str = "data",
         question_key: str = "question",
         document_ids_key: str = "document_ids",
         passages_key: str = "passages",
@@ -29,42 +27,32 @@ class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
 
         Args:
             source: The source to load the data from.
+            split: The split to load the data from. Split is fixed for data loaders to "data",
+                but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
             question_key: The dataset column name that contains the question.
             document_ids_key: The dataset column name that contains the document ids. Document ids are optional.
             passages_key: The dataset column name that contains the passages. Passages are optional.
             page_numbers_key: The dataset column name that contains the page numbers. Page numbers are optional.
         """
-        super().__init__(source)
+        super().__init__(source=source, split=split, required_keys={question_key})
         self.question_key = question_key
         self.document_ids_key = document_ids_key
         self.passages_key = passages_key
         self.page_numbers_key = page_numbers_key
 
-    async def
+    async def map(self, dataset: Iterable[dict]) -> Iterable[DocumentSearchData]:
         """
-
+        Map the dataset to the document search data schema.
 
-
-        The
+        Args:
+            dataset: The dataset to map.
 
-
-
+        Returns:
+            The document search data.
         """
-        data_path = await self.source.fetch()
-        dataset = load_dataset(
-            path=str(data_path.parent),
-            split="train",
-            data_files={"train": str(data_path.name)},
-        )
-        if self.question_key not in dataset.features:
-            raise DataLoaderIncorrectFormatDataError(
-                required_features=[self.question_key],
-                data_path=data_path,
-            )
-
         return [
             DocumentSearchData(
-                question=data.get(self.question_key),
+                question=data.get(self.question_key, ""),
                 reference_document_ids=data.get(self.document_ids_key),
                 reference_passages=data.get(self.passages_key),
                 reference_page_numbers=data.get(self.page_numbers_key),
```
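With loading handled by the base class, the loader above reduces to column mapping. A usage sketch — `LocalFileSource` is assumed here as a concrete `Source` implementation from ragbits-core, and the file path and remapped column names are illustrative:

```python
import asyncio

from ragbits.core.sources.local import LocalFileSource  # assumed Source implementation
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader


async def main() -> None:
    loader = DocumentSearchDataLoader(
        source=LocalFileSource(path="eval/documents.jsonl"),  # hypothetical dataset file
        split="data[:100]",    # the fixed "data" split, sliced via the HF slicing API
        question_key="query",  # remap column names to the dataset at hand
        passages_key="contexts",
    )
    rows = await loader.load()  # validates that "query" exists, then maps each row
    print(sum(1 for _ in rows))


if __name__ == "__main__":
    asyncio.run(main())
```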
ragbits/evaluate/dataloaders/question_answer.py

```diff
@@ -0,0 +1,57 @@
+from collections.abc import Iterable
+
+from ragbits.core.sources.base import Source
+from ragbits.evaluate.dataloaders.base import DataLoader
+from ragbits.evaluate.pipelines.question_answer import QuestionAnswerData
+
+
+class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
+    """
+    Question answer evaluation data loader.
+
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
+    """
+
+    def __init__(
+        self,
+        source: Source,
+        *,
+        split: str = "data",
+        question_key: str = "question",
+        answer_key: str = "answer",
+        context_key: str = "context",
+    ) -> None:
+        """
+        Initialize the question answer data loader.
+
+        Args:
+            source: The source to load the data from.
+            split: The split to load the data from.
+            required_keys: The required keys to load the data from.
+            question_key: The dataset column name that contains the question.
+            answer_key: The dataset column name that contains the answer.
+            context_key: The dataset column name that contains the context. Context is optional.
+        """
+        super().__init__(source=source, split=split, required_keys={question_key, answer_key})
+        self.question_key = question_key
+        self.answer_key = answer_key
+        self.context_key = context_key
+
+    async def map(self, dataset: Iterable[dict]) -> Iterable[QuestionAnswerData]:
+        """
+        Map the dataset to the question answer data schema.
+
+        Args:
+            dataset: The dataset to map.
+
+        Returns:
+            The question answer data.
+        """
+        return [
+            QuestionAnswerData(
+                question=data.get(self.question_key, ""),
+                reference_answer=data.get(self.answer_key, ""),
+                reference_context=data.get(self.context_key),
+            )
+            for data in dataset
+        ]
```
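The new loader requires at least the question and answer columns (context is optional), with validation done by the inherited `load()`. A sketch of the expected file shape and loader construction — the file path and `LocalFileSource` are assumptions, as above:

```python
# eval/qa.jsonl (illustrative rows; one JSON object per line):
#   {"question": "What does the loader validate?", "answer": "Required columns.", "context": "..."}
#   {"question": "Is context required?", "answer": "No."}
import asyncio

from ragbits.core.sources.local import LocalFileSource  # assumed Source implementation
from ragbits.evaluate.dataloaders.question_answer import QuestionAnswerDataLoader


async def main() -> None:
    loader = QuestionAnswerDataLoader(source=LocalFileSource(path="eval/qa.jsonl"))
    # load() raises DataLoaderIncorrectFormatDataError if "question" or "answer" is missing
    for row in await loader.load():
        print(row.question, "->", row.reference_answer)


if __name__ == "__main__":
    asyncio.run(main())
```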
ragbits/evaluate/metrics/question_answer.py

```diff
@@ -0,0 +1,182 @@
+import asyncio
+from abc import ABC, abstractmethod
+from itertools import chain
+from typing import Generic, TypeVar
+
+from continuous_eval.llm_factory import LLMInterface
+from continuous_eval.metrics.base import LLMBasedMetric
+from continuous_eval.metrics.generation.text import (
+    LLMBasedAnswerCorrectness,
+    LLMBasedAnswerRelevance,
+    LLMBasedFaithfulness,
+    LLMBasedStyleConsistency,
+)
+from typing_extensions import Self
+
+from ragbits.agents.types import QuestionAnswerPromptOutputT
+from ragbits.core.llms.base import LLM
+from ragbits.core.utils.helpers import batched
+from ragbits.evaluate.metrics.base import Metric
+from ragbits.evaluate.pipelines.question_answer import QuestionAnswerResult
+
+MetricT = TypeVar("MetricT", bound=LLMBasedMetric)
+
+
+class _MetricLMM(LLMInterface):
+    """
+    Implementation of required interface of Relari generative metrics based on LiteLMM.
+    """
+
+    def __init__(self, llm: LLM) -> None:
+        self._llm = llm
+
+    def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
+        formatted_prompt = [
+            {"role": "system", "content": prompt["system_prompt"]},
+            {"role": "user", "content": prompt["user_prompt"]},
+        ]
+        options = self._llm.options_cls(
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        return asyncio.run(self._llm.generate(formatted_prompt, options=options))
+
+
+class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
+    """
+    Metric for question answer evaluation based on Relari backend.
+    More details can be found [here](https://docs.relari.ai/category/text-generation).
+    """
+
+    metric_cls: type[MetricT]
+
+    def __init__(self, llm: LLM, batch_size: int = 15, weight: float = 1.0) -> None:
+        """
+        Initialize the agent metric.
+
+        Args:
+            llm: Judge LLM instance.
+            batch_size: Batch size for metric computation.
+            weight: Metric value weight in the final score, used during optimization.
+        """
+        super().__init__(weight=weight)
+        self.metric = self.metric_cls(_MetricLMM(llm))
+        self.batch_size = batch_size
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `QuestionAnswerMetric` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the metric.
+
+        Returns:
+            An instance of the metric class initialized with the provided configuration.
+        """
+        config["llm"] = LLM.from_config(config["llm"])
+        config["batch_size"] = config.get("batch_size", 15)
+        config["weight"] = config.get("weight", 1.0)
+        return super().from_config(config)
+
+    async def compute(self, results: list[QuestionAnswerResult[QuestionAnswerPromptOutputT]]) -> dict:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        metric_results = chain.from_iterable(
+            [
+                await asyncio.gather(*[asyncio.to_thread(self._call_metric, result) for result in batch])
+                for batch in batched(results, self.batch_size)
+            ]
+        )
+        return self.metric.aggregate(list(metric_results))
+
+    @abstractmethod
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        """
+        Call the metric with the proper arguments.
+        """
+
+
+class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):
+    """
+    Metric checking answer correctness based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_correctness).
+    """
+
+    metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            question=result.question,
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+            ground_truth_answers=result.reference_answer,
+        )
+
+
+class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):
+    """
+    Metric checking answer faithfulness based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_faithfulness).
+    """
+
+    metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            question=result.question,
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+            retrieved_context=result.reference_context,
+        )
+
+
+class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):
+    """
+    Metric checking answer relevance based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_relevance).
+    """
+
+    metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            question=result.question,
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+        )
+
+
+class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):
+    """
+    Metric checking answer relevance based on LLM.
+    More details can be found [here](https://docs.relari.ai/metrics/Generation/LLM-Based/llm_style).
+    """
+
+    metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency
+
+    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+        return self.metric(
+            answer=(
+                result.predicted_result.content
+                if isinstance(result.predicted_result.content, str)
+                else result.predicted_result.content.answer
+            ),
+            ground_truth_answers=result.reference_answer,
+        )
```
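The new metrics wrap Relari's LLM-based generation metrics: each subclass pins a `metric_cls` and adapts `QuestionAnswerResult` fields in `_call_metric`, while the shared `compute()` runs the blocking metric calls in batches through `asyncio.to_thread`. A minimal scoring sketch, assuming `LiteLLM` is the ragbits-core LLM implementation and the model name is illustrative:

```python
from ragbits.core.llms.litellm import LiteLLM  # assumed import path for the judge LLM
from ragbits.evaluate.metrics.question_answer import QuestionAnswerAnswerCorrectness
from ragbits.evaluate.pipelines.question_answer import QuestionAnswerResult


async def score(results: list[QuestionAnswerResult]) -> dict:
    judge = LiteLLM(model_name="gpt-4o-mini")  # hypothetical judge model choice
    metric = QuestionAnswerAnswerCorrectness(llm=judge, batch_size=8, weight=1.0)
    # compute() batches results, calls the Relari metric in worker threads,
    # then aggregates the per-result scores into a single dict
    return await metric.compute(results)
```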
ragbits/evaluate/pipelines/question_answer.py

```diff
@@ -0,0 +1,96 @@
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, Generic
+
+from typing_extensions import Self
+
+from ragbits.agents._main import AgentResult
+from ragbits.agents.types import (
+    QuestionAnswerAgent,
+    QuestionAnswerPromptInput,
+    QuestionAnswerPromptOutputT,
+)
+from ragbits.core.llms.base import LLMClientOptionsT
+from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
+
+
+class QuestionAnswerData(EvaluationData):
+    """
+    Represents the evaluation data for question answer.
+    """
+
+    question: str
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+@dataclass
+class QuestionAnswerResult(EvaluationResult, Generic[QuestionAnswerPromptOutputT]):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    predicted_result: AgentResult[QuestionAnswerPromptOutputT]
+    reference_answer: str
+    reference_context: Any | None = None
+
+
+class QuestionAnswerPipeline(
+    EvaluationPipeline[
+        QuestionAnswerAgent[LLMClientOptionsT, QuestionAnswerPromptInput, QuestionAnswerPromptOutputT],
+        QuestionAnswerData,
+        QuestionAnswerResult,
+    ]
+):
+    """
+    Question answer evaluation pipeline.
+    """
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `QuestionAnswerPipeline` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the pipeline.
+
+        Returns:
+            An instance of the pipeline class initialized with the provided configuration.
+        """
+        config["evaluation_target"] = QuestionAnswerAgent.from_config(config)
+        return super().from_config(config)
+
+    async def __call__(
+        self, data: Iterable[QuestionAnswerData]
+    ) -> Iterable[QuestionAnswerResult[QuestionAnswerPromptOutputT]]:
+        """
+        Run the question answer evaluation pipeline.
+
+        Args:
+            data: The evaluation data batch.
+
+        Returns:
+            The evaluation result batch.
+        """
+        results = await asyncio.gather(
+            *[
+                self.evaluation_target.run(
+                    QuestionAnswerPromptInput(
+                        question=row.question,
+                        context=row.reference_context,
+                    )
+                )
+                for row in data
+            ]
+        )
+        return [
+            QuestionAnswerResult(
+                question=row.question,
+                predicted_result=result,
+                reference_answer=row.reference_answer,
+                reference_context=row.reference_context,
+            )
+            for row, result in zip(data, results, strict=False)
+        ]
```
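The pipeline wraps a `QuestionAnswerAgent`: `from_config` builds the agent from the pipeline config, and `__call__` fans out `agent.run()` over the batch with `asyncio.gather`. An end-to-end sketch tying loader, pipeline, and metric together — agent construction is out of scope here, and passing `evaluation_target` directly to the constructor is an assumption about the `EvaluationPipeline` base class:

```python
from ragbits.evaluate.dataloaders.question_answer import QuestionAnswerDataLoader
from ragbits.evaluate.metrics.question_answer import QuestionAnswerMetric
from ragbits.evaluate.pipelines.question_answer import QuestionAnswerPipeline


async def evaluate(agent, loader: QuestionAnswerDataLoader, metric: QuestionAnswerMetric) -> dict:
    # agent: an already-configured QuestionAnswerAgent (construction not shown here)
    pipeline = QuestionAnswerPipeline(evaluation_target=agent)  # assumed base-class constructor
    data = list(await loader.load())      # QuestionAnswerData rows
    results = list(await pipeline(data))  # one QuestionAnswerResult per row, generated concurrently
    return await metric.compute(results)
```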
{ragbits_evaluate-0.18.0.dist-info → ragbits_evaluate-0.19.1.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 0.18.0
+Version: 0.19.1
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.4.1
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==0.
+Requires-Dist: ragbits-core==0.19.1
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
```

{ragbits_evaluate-0.18.0.dist-info → ragbits_evaluate-0.19.1.dist-info}/RECORD

```diff
@@ -6,9 +6,10 @@ ragbits/evaluate/optimizer.py,sha256=RqYgoiCIEhjXO0HEN6uwldblHyoPuT3qUdncuHPZgCg
 ragbits/evaluate/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ragbits/evaluate/utils.py,sha256=w-hbvKRHI9tEva9wKDTVla0Wm2eCHT2MxVkof27Sqfw,4831
 ragbits/evaluate/dataloaders/__init__.py,sha256=UFJFjmvi3GUQFsx6A5sYD01HH2f7TXcHRW2VNM1pmIA,83
-ragbits/evaluate/dataloaders/base.py,sha256=
-ragbits/evaluate/dataloaders/document_search.py,sha256=
+ragbits/evaluate/dataloaders/base.py,sha256=x8rEl5utNOziF_9urL0grkqoXwMgaDWYSM5akw3Kt9Y,3213
+ragbits/evaluate/dataloaders/document_search.py,sha256=c9Bc4ZtFEKAiG9B70JFiBZlZDkBSGNWFRKabF7PMTU0,2495
 ragbits/evaluate/dataloaders/exceptions.py,sha256=xUOBLj1JuCkcqzRVnu0A0I_i1THxbDt2MEDVdDGjDyY,735
+ragbits/evaluate/dataloaders/question_answer.py,sha256=naXFDtla0otOTWSyHVvWvgDYEq-Wry4irnAJR2tHMNg,2032
 ragbits/evaluate/dataset_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ragbits/evaluate/dataset_generator/pipeline.py,sha256=dgnV-Qm0Z7S1Y6ga9-9RscXxxr3krOKsIj7E9WS4ANk,4940
 ragbits/evaluate/dataset_generator/utils.py,sha256=zD-ksXlX62kkIgzBefE4ILsP7He9bHimnZ63LLsMKCA,1325
@@ -27,9 +28,11 @@ ragbits/evaluate/factories/__init__.py,sha256=7nh0J80EfqMWRGtHx4hkfHNMztfC6FMhH8
 ragbits/evaluate/metrics/__init__.py,sha256=Mr83ytGyvdXtBlr7Bbo0-5auE0530xsd3wffKSIf8cE,95
 ragbits/evaluate/metrics/base.py,sha256=bOscQ_nJXLGWmP2ls9jncrUoeghNBnKDJsab71pFEjo,2519
 ragbits/evaluate/metrics/document_search.py,sha256=MfvMwEPenqiJdKYuW6WLvmtMch9ZVYb0T6ibpOF3vGI,3189
+ragbits/evaluate/metrics/question_answer.py,sha256=_XMFjkJcG-xdOO2fCfoKIhJb5VVM_GK_yKhFGXO8FRM,6566
 ragbits/evaluate/pipelines/__init__.py,sha256=Bqp_L7aRq12Ua19ELZDsdYvra6-GlLrQ9cIG2IWArko,1294
 ragbits/evaluate/pipelines/base.py,sha256=QV3fjPnbJjeCgcbt8yV1Ho3BamEUc3wSca3MAzaBlV0,1739
 ragbits/evaluate/pipelines/document_search.py,sha256=tgk-I21eshdBbWVsuNa1zWK_fWuDNXhhMCn1_Fdu_Ko,3840
-
-ragbits_evaluate-0.
-ragbits_evaluate-0.
+ragbits/evaluate/pipelines/question_answer.py,sha256=3CYVHDLnOy4z7kgYPMluiJ8POulHo-w3PEiqvqsF4Dc,2797
+ragbits_evaluate-0.19.1.dist-info/METADATA,sha256=xO13dMZHaA1yUZ-kppa6R8KzVSsn_G4VjHKqXMew3ak,2300
+ragbits_evaluate-0.19.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ragbits_evaluate-0.19.1.dist-info/RECORD,,
```

{ragbits_evaluate-0.18.0.dist-info → ragbits_evaluate-0.19.1.dist-info}/WHEEL

File without changes