ragbits-evaluate 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ragbits-evaluate might be problematic. Click here for more details.
- ragbits/evaluate/cli.py +2 -2
- ragbits/evaluate/dataloaders/document_search.py +33 -5
- ragbits/evaluate/evaluator.py +99 -54
- ragbits/evaluate/factories/__init__.py +11 -26
- ragbits/evaluate/metrics/base.py +8 -4
- ragbits/evaluate/metrics/document_search.py +13 -2
- ragbits/evaluate/optimizer.py +9 -9
- ragbits/evaluate/pipelines/base.py +2 -1
- ragbits/evaluate/pipelines/document_search.py +25 -15
- ragbits/evaluate/utils.py +48 -14
- {ragbits_evaluate-0.17.0.dist-info → ragbits_evaluate-0.18.0.dist-info}/METADATA +2 -2
- {ragbits_evaluate-0.17.0.dist-info → ragbits_evaluate-0.18.0.dist-info}/RECORD +13 -13
- {ragbits_evaluate-0.17.0.dist-info → ragbits_evaluate-0.18.0.dist-info}/WHEEL +0 -0
ragbits/evaluate/cli.py
CHANGED
|
@@ -140,10 +140,10 @@ def run() -> None:
|
|
|
140
140
|
metric_results = await evaluator.compute(
|
|
141
141
|
pipeline=state.pipeline,
|
|
142
142
|
dataloader=state.dataloader,
|
|
143
|
-
|
|
143
|
+
metricset=state.metrics,
|
|
144
144
|
)
|
|
145
145
|
evaluation_results = EvaluationResult(
|
|
146
|
-
metrics={"metrics": metric_results
|
|
146
|
+
metrics={"metrics": metric_results.metrics, "time_perf": metric_results.time_perf}
|
|
147
147
|
)
|
|
148
148
|
print_output(evaluation_results)
|
|
149
149
|
|
|
@@ -2,6 +2,7 @@ from collections.abc import Iterable
|
|
|
2
2
|
|
|
3
3
|
from datasets import load_dataset
|
|
4
4
|
|
|
5
|
+
from ragbits.core.sources.base import Source
|
|
5
6
|
from ragbits.evaluate.dataloaders.base import DataLoader
|
|
6
7
|
from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
|
|
7
8
|
from ragbits.evaluate.pipelines.document_search import DocumentSearchData
|
|
@@ -15,6 +16,30 @@ class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
|
|
|
15
16
|
and contain the following features: "question", "passages".
|
|
16
17
|
"""
|
|
17
18
|
|
|
19
|
+
def __init__(
|
|
20
|
+
self,
|
|
21
|
+
source: Source,
|
|
22
|
+
question_key: str = "question",
|
|
23
|
+
document_ids_key: str = "document_ids",
|
|
24
|
+
passages_key: str = "passages",
|
|
25
|
+
page_numbers_key: str = "page_numbers",
|
|
26
|
+
) -> None:
|
|
27
|
+
"""
|
|
28
|
+
Initialize the document search data loader.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
source: The source to load the data from.
|
|
32
|
+
question_key: The dataset column name that contains the question.
|
|
33
|
+
document_ids_key: The dataset column name that contains the document ids. Document ids are optional.
|
|
34
|
+
passages_key: The dataset column name that contains the passages. Passages are optional.
|
|
35
|
+
page_numbers_key: The dataset column name that contains the page numbers. Page numbers are optional.
|
|
36
|
+
"""
|
|
37
|
+
super().__init__(source)
|
|
38
|
+
self.question_key = question_key
|
|
39
|
+
self.document_ids_key = document_ids_key
|
|
40
|
+
self.passages_key = passages_key
|
|
41
|
+
self.page_numbers_key = page_numbers_key
|
|
42
|
+
|
|
18
43
|
async def load(self) -> Iterable[DocumentSearchData]:
|
|
19
44
|
"""
|
|
20
45
|
Load the data from source and format them.
|
|
@@ -28,18 +53,21 @@ class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
|
|
|
28
53
|
data_path = await self.source.fetch()
|
|
29
54
|
dataset = load_dataset(
|
|
30
55
|
path=str(data_path.parent),
|
|
31
|
-
split=
|
|
56
|
+
split="train",
|
|
57
|
+
data_files={"train": str(data_path.name)},
|
|
32
58
|
)
|
|
33
|
-
if
|
|
59
|
+
if self.question_key not in dataset.features:
|
|
34
60
|
raise DataLoaderIncorrectFormatDataError(
|
|
35
|
-
required_features=[
|
|
61
|
+
required_features=[self.question_key],
|
|
36
62
|
data_path=data_path,
|
|
37
63
|
)
|
|
38
64
|
|
|
39
65
|
return [
|
|
40
66
|
DocumentSearchData(
|
|
41
|
-
question=data
|
|
42
|
-
|
|
67
|
+
question=data.get(self.question_key),
|
|
68
|
+
reference_document_ids=data.get(self.document_ids_key),
|
|
69
|
+
reference_passages=data.get(self.passages_key),
|
|
70
|
+
reference_page_numbers=data.get(self.page_numbers_key),
|
|
43
71
|
)
|
|
44
72
|
for data in dataset
|
|
45
73
|
]
|
ragbits/evaluate/evaluator.py
CHANGED
|
@@ -1,16 +1,45 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import random
|
|
2
3
|
import time
|
|
3
|
-
from collections.abc import Iterable
|
|
4
|
-
from dataclasses import
|
|
4
|
+
from collections.abc import Awaitable, Callable, Iterable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Generic, ParamSpec, TypeVar
|
|
5
7
|
|
|
6
8
|
from pydantic import BaseModel
|
|
7
|
-
from tqdm
|
|
9
|
+
from tqdm import tqdm
|
|
8
10
|
|
|
9
11
|
from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
|
|
12
|
+
from ragbits.core.utils.helpers import batched
|
|
10
13
|
from ragbits.evaluate.dataloaders.base import DataLoader
|
|
11
14
|
from ragbits.evaluate.metrics.base import MetricSet
|
|
12
15
|
from ragbits.evaluate.pipelines.base import EvaluationDataT, EvaluationPipeline, EvaluationResultT, EvaluationTargetT
|
|
13
16
|
|
|
17
|
+
_CallP = ParamSpec("_CallP")
|
|
18
|
+
_CallReturnT = TypeVar("_CallReturnT")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class EvaluationTimePerf:
|
|
23
|
+
"""
|
|
24
|
+
Container for evaluation time performance metrics.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
total_time_in_seconds: float
|
|
28
|
+
samples_per_second: float
|
|
29
|
+
latency_in_seconds: float
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
|
|
33
|
+
class EvaluatorResult(Generic[EvaluationResultT]):
|
|
34
|
+
"""
|
|
35
|
+
Container for evaluation results.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
metrics: dict[str, int | float]
|
|
39
|
+
results: list[EvaluationResultT]
|
|
40
|
+
errors: list[Exception]
|
|
41
|
+
time_perf: EvaluationTimePerf
|
|
42
|
+
|
|
14
43
|
|
|
15
44
|
class EvaluationConfig(BaseModel):
|
|
16
45
|
"""
|
|
@@ -36,17 +65,29 @@ class Evaluator(WithConstructionConfig):
|
|
|
36
65
|
Evaluator class.
|
|
37
66
|
"""
|
|
38
67
|
|
|
39
|
-
def __init__(
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
batch_size: int = 10,
|
|
71
|
+
num_retries: int = 3,
|
|
72
|
+
backoff_multiplier: int = 1,
|
|
73
|
+
backoff_max: int = 60,
|
|
74
|
+
) -> None:
|
|
40
75
|
"""
|
|
41
|
-
Initialize the
|
|
76
|
+
Initialize the Evaluator instance.
|
|
42
77
|
|
|
43
78
|
Args:
|
|
44
79
|
batch_size: batch size for the evaluation pipeline inference.
|
|
80
|
+
num_retries: The number of retries per evaluation pipeline inference error.
|
|
81
|
+
backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
|
|
82
|
+
backoff_max: The maximum allowed delay (in seconds) between retries.
|
|
45
83
|
"""
|
|
46
84
|
self.batch_size = batch_size
|
|
85
|
+
self.num_retries = num_retries
|
|
86
|
+
self.backoff_multiplier = backoff_multiplier
|
|
87
|
+
self.backoff_max = backoff_max
|
|
47
88
|
|
|
48
89
|
@classmethod
|
|
49
|
-
async def run_from_config(cls, config: dict) ->
|
|
90
|
+
async def run_from_config(cls, config: dict) -> EvaluatorResult:
|
|
50
91
|
"""
|
|
51
92
|
Run the evaluation based on configuration.
|
|
52
93
|
|
|
@@ -60,50 +101,50 @@ class Evaluator(WithConstructionConfig):
|
|
|
60
101
|
evaluation_config = EvaluationConfig.model_validate(evaluator_config.evaluation)
|
|
61
102
|
pipeline: EvaluationPipeline = EvaluationPipeline.subclass_from_config(evaluation_config.pipeline)
|
|
62
103
|
dataloader: DataLoader = DataLoader.subclass_from_config(evaluation_config.dataloader)
|
|
63
|
-
|
|
104
|
+
metricset: MetricSet = MetricSet.from_config(evaluation_config.metrics)
|
|
64
105
|
|
|
65
106
|
evaluator = cls.from_config(evaluator_config.evaluator or {})
|
|
66
107
|
return await evaluator.compute(
|
|
67
108
|
pipeline=pipeline,
|
|
68
109
|
dataloader=dataloader,
|
|
69
|
-
|
|
110
|
+
metricset=metricset,
|
|
70
111
|
)
|
|
71
112
|
|
|
72
113
|
async def compute(
|
|
73
114
|
self,
|
|
74
115
|
pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
|
|
75
116
|
dataloader: DataLoader[EvaluationDataT],
|
|
76
|
-
|
|
77
|
-
) ->
|
|
117
|
+
metricset: MetricSet[EvaluationResultT],
|
|
118
|
+
) -> EvaluatorResult[EvaluationResultT]:
|
|
78
119
|
"""
|
|
79
120
|
Compute the evaluation results for the given pipeline and data.
|
|
80
121
|
|
|
81
122
|
Args:
|
|
82
123
|
pipeline: The pipeline to be evaluated.
|
|
83
124
|
dataloader: The dataloader to load the data.
|
|
84
|
-
|
|
125
|
+
metricset: The metrics to be computed.
|
|
85
126
|
|
|
86
127
|
Returns:
|
|
87
128
|
The evaluation results.
|
|
88
129
|
"""
|
|
89
|
-
dataset = await dataloader.load()
|
|
90
130
|
await pipeline.prepare()
|
|
91
131
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
132
|
+
dataset = await dataloader.load()
|
|
133
|
+
results, errors, time_perf = await self._call_pipeline(pipeline, dataset)
|
|
134
|
+
metrics = await metricset.compute(results)
|
|
95
135
|
|
|
96
|
-
return
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
136
|
+
return EvaluatorResult(
|
|
137
|
+
metrics=metrics,
|
|
138
|
+
results=results,
|
|
139
|
+
errors=errors,
|
|
140
|
+
time_perf=time_perf,
|
|
141
|
+
)
|
|
101
142
|
|
|
102
143
|
async def _call_pipeline(
|
|
103
144
|
self,
|
|
104
145
|
pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
|
|
105
146
|
dataset: Iterable[EvaluationDataT],
|
|
106
|
-
) -> tuple[list[EvaluationResultT],
|
|
147
|
+
) -> tuple[list[EvaluationResultT], list[Exception], EvaluationTimePerf]:
|
|
107
148
|
"""
|
|
108
149
|
Call the pipeline with the given data.
|
|
109
150
|
|
|
@@ -114,47 +155,53 @@ class Evaluator(WithConstructionConfig):
|
|
|
114
155
|
Returns:
|
|
115
156
|
The evaluation results and performance metrics.
|
|
116
157
|
"""
|
|
117
|
-
semaphore = asyncio.Semaphore(self.batch_size)
|
|
118
|
-
|
|
119
|
-
async def _call_pipeline_with_semaphore(data: EvaluationDataT) -> EvaluationResultT:
|
|
120
|
-
async with semaphore:
|
|
121
|
-
return await pipeline(data)
|
|
122
|
-
|
|
123
158
|
start_time = time.perf_counter()
|
|
124
|
-
|
|
159
|
+
outputs = [
|
|
160
|
+
await self._call_with_error_handling(pipeline, data)
|
|
161
|
+
for data in tqdm(batched(dataset, self.batch_size), desc="Evaluation")
|
|
162
|
+
]
|
|
125
163
|
end_time = time.perf_counter()
|
|
126
164
|
|
|
127
|
-
|
|
165
|
+
errors = [output for output in outputs if isinstance(output, Exception)]
|
|
166
|
+
results = [item for output in outputs if not isinstance(output, Exception) for item in output]
|
|
128
167
|
|
|
129
|
-
|
|
130
|
-
|
|
168
|
+
return results, errors, self._compute_time_perf(start_time, end_time, len(outputs))
|
|
169
|
+
|
|
170
|
+
async def _call_with_error_handling(
|
|
171
|
+
self,
|
|
172
|
+
executable: Callable[_CallP, Awaitable[_CallReturnT]],
|
|
173
|
+
*executable_args: _CallP.args,
|
|
174
|
+
**executable_kwargs: _CallP.kwargs,
|
|
175
|
+
) -> _CallReturnT | Exception:
|
|
131
176
|
"""
|
|
132
|
-
|
|
177
|
+
Call executable with standardized error handling.
|
|
178
|
+
If an error occurs, the executable is retried `num_retries` times using randomized exponential backoff.
|
|
133
179
|
|
|
134
180
|
Args:
|
|
135
|
-
|
|
181
|
+
executable: The callable function to execute.
|
|
182
|
+
executable_args: Positional arguments to pass to the executable.
|
|
183
|
+
executable_kwargs: Keyword arguments to pass to the executable.
|
|
136
184
|
|
|
137
185
|
Returns:
|
|
138
|
-
The
|
|
139
|
-
"""
|
|
140
|
-
return {"results": [asdict(result) for result in results]}
|
|
186
|
+
The result of the executable if successful.
|
|
141
187
|
|
|
142
|
-
|
|
143
|
-
|
|
188
|
+
Raises:
|
|
189
|
+
RuntimeError: If the retry loop finishes without returning. Note that the last encountered exception is returned, not raised, once all retries are exhausted.
|
|
144
190
|
"""
|
|
145
|
-
|
|
191
|
+
for i in range(max(0, self.num_retries) + 1):
|
|
192
|
+
try:
|
|
193
|
+
return await executable(*executable_args, **executable_kwargs)
|
|
194
|
+
except Exception as exc:
|
|
195
|
+
if i == self.num_retries:
|
|
196
|
+
return exc
|
|
146
197
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
results: The evaluation results.
|
|
198
|
+
delay = random.uniform(0, min(2**i * self.backoff_multiplier, self.backoff_max)) # noqa: S311
|
|
199
|
+
await asyncio.sleep(delay)
|
|
150
200
|
|
|
151
|
-
|
|
152
|
-
The computed metric.
|
|
153
|
-
"""
|
|
154
|
-
return {"metrics": metrics.compute(results)}
|
|
201
|
+
raise RuntimeError("Unreachable code reached") # mypy quirk
|
|
155
202
|
|
|
156
203
|
@staticmethod
|
|
157
|
-
def _compute_time_perf(start_time: float, end_time: float, num_samples: int) ->
|
|
204
|
+
def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> EvaluationTimePerf:
|
|
158
205
|
"""
|
|
159
206
|
Compute the performance metrics.
|
|
160
207
|
|
|
@@ -170,10 +217,8 @@ class Evaluator(WithConstructionConfig):
|
|
|
170
217
|
throughput = num_samples / latency
|
|
171
218
|
latency_sample = 1.0 / throughput if throughput > 0 else 0.0
|
|
172
219
|
|
|
173
|
-
return
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
},
|
|
179
|
-
}
|
|
220
|
+
return EvaluationTimePerf(
|
|
221
|
+
total_time_in_seconds=latency,
|
|
222
|
+
samples_per_second=throughput,
|
|
223
|
+
latency_in_seconds=latency_sample,
|
|
224
|
+
)
|
|
@@ -1,43 +1,21 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
|
|
3
|
+
from continuous_eval.metrics.retrieval.matching_strategy import RougeChunkMatch
|
|
3
4
|
from datasets import load_dataset
|
|
4
5
|
|
|
5
6
|
from ragbits.core.embeddings.dense import LiteLLMEmbedder
|
|
6
7
|
from ragbits.core.sources.hf import HuggingFaceSource
|
|
7
|
-
from ragbits.core.utils.config_handling import ObjectConstructionConfig
|
|
8
8
|
from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
|
|
9
9
|
from ragbits.document_search import DocumentSearch
|
|
10
10
|
from ragbits.document_search.documents.document import DocumentMeta
|
|
11
11
|
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader
|
|
12
12
|
from ragbits.evaluate.metrics import MetricSet
|
|
13
|
-
|
|
14
|
-
DS_PRECISION_RECALL_F1 = {
|
|
15
|
-
"precision_recall_f1": ObjectConstructionConfig.model_validate(
|
|
16
|
-
{
|
|
17
|
-
"type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
|
|
18
|
-
"config": {
|
|
19
|
-
"matching_strategy": {
|
|
20
|
-
"type": "RougeChunkMatch",
|
|
21
|
-
"config": {
|
|
22
|
-
"threshold": 0.5,
|
|
23
|
-
},
|
|
24
|
-
},
|
|
25
|
-
},
|
|
26
|
-
}
|
|
27
|
-
),
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def precision_recall_f1() -> MetricSet:
|
|
32
|
-
"""
|
|
33
|
-
Factory of precision recall f1 metric set for retrieval evaluation.
|
|
34
|
-
"""
|
|
35
|
-
return MetricSet.from_config(config=DS_PRECISION_RECALL_F1)
|
|
13
|
+
from ragbits.evaluate.metrics.document_search import DocumentSearchPrecisionRecallF1
|
|
36
14
|
|
|
37
15
|
|
|
38
16
|
async def _add_example_documents(document_search: DocumentSearch) -> None:
|
|
39
17
|
dataset = load_dataset(path="deepsense-ai/synthetic-rag-dataset_v1.0", split="train")
|
|
40
|
-
documents = [DocumentMeta.
|
|
18
|
+
documents = [DocumentMeta.from_literal(doc) for chunks in dataset["chunks"] for doc in chunks]
|
|
41
19
|
await document_search.ingest(documents)
|
|
42
20
|
|
|
43
21
|
|
|
@@ -45,7 +23,7 @@ def basic_document_search_factory() -> DocumentSearch:
|
|
|
45
23
|
"""
|
|
46
24
|
Factory for basic example document search instance.
|
|
47
25
|
"""
|
|
48
|
-
document_search = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
|
|
26
|
+
document_search: DocumentSearch = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
|
|
49
27
|
asyncio.run(_add_example_documents(document_search))
|
|
50
28
|
return document_search
|
|
51
29
|
|
|
@@ -55,3 +33,10 @@ def synthetic_rag_dataset() -> DocumentSearchDataLoader:
|
|
|
55
33
|
Factory for synthetic RAG dataset.
|
|
56
34
|
"""
|
|
57
35
|
return DocumentSearchDataLoader(source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def precision_recall_f1() -> MetricSet:
|
|
39
|
+
"""
|
|
40
|
+
Factory of precision recall f1 metric set for retrieval evaluation.
|
|
41
|
+
"""
|
|
42
|
+
return MetricSet(DocumentSearchPrecisionRecallF1(matching_strategy=RougeChunkMatch()))
|
ragbits/evaluate/metrics/base.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
2
3
|
from types import ModuleType
|
|
3
4
|
from typing import ClassVar, Generic
|
|
@@ -19,7 +20,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
|
|
|
19
20
|
|
|
20
21
|
def __init__(self, weight: float = 1.0) -> None:
|
|
21
22
|
"""
|
|
22
|
-
|
|
23
|
+
Initialize the metric.
|
|
23
24
|
|
|
24
25
|
Args:
|
|
25
26
|
weight: Metric value weight in the final score, used during optimization.
|
|
@@ -28,7 +29,7 @@ class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
|
|
|
28
29
|
self.weight = weight
|
|
29
30
|
|
|
30
31
|
@abstractmethod
|
|
31
|
-
def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
32
|
+
async def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
32
33
|
"""
|
|
33
34
|
Compute the metric.
|
|
34
35
|
|
|
@@ -70,7 +71,7 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
|
|
|
70
71
|
"""
|
|
71
72
|
return cls(*[Metric.subclass_from_config(metric_config) for metric_config in config.values()])
|
|
72
73
|
|
|
73
|
-
def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
74
|
+
async def compute(self, results: list[EvaluationResultT]) -> dict:
|
|
74
75
|
"""
|
|
75
76
|
Compute the metrics.
|
|
76
77
|
|
|
@@ -80,6 +81,9 @@ class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
|
|
|
80
81
|
Returns:
|
|
81
82
|
The computed metrics.
|
|
82
83
|
"""
|
|
84
|
+
metric_results = await asyncio.gather(*[metric.compute(results) for metric in self.metrics])
|
|
83
85
|
return {
|
|
84
|
-
name: metric.weight * value
|
|
86
|
+
name: metric.weight * value
|
|
87
|
+
for metric, result in zip(self.metrics, metric_results, strict=False)
|
|
88
|
+
for name, value in result.items()
|
|
85
89
|
}
|
|
@@ -46,7 +46,7 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
|
|
|
46
46
|
matching_strategy = matching_strategy_cls(**config["matching_strategy"]["config"])
|
|
47
47
|
return cls(matching_strategy=matching_strategy, weight=config.get("weight", 1.0))
|
|
48
48
|
|
|
49
|
-
def compute(self, results: list[DocumentSearchResult]) -> dict:
|
|
49
|
+
async def compute(self, results: list[DocumentSearchResult]) -> dict:
|
|
50
50
|
"""
|
|
51
51
|
Compute the metric.
|
|
52
52
|
|
|
@@ -57,7 +57,18 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
|
|
|
57
57
|
The computed metric.
|
|
58
58
|
"""
|
|
59
59
|
return self.metric.aggregate(
|
|
60
|
-
[
|
|
60
|
+
[
|
|
61
|
+
self.metric(
|
|
62
|
+
[
|
|
63
|
+
element.text_representation
|
|
64
|
+
for element in result.predicted_elements
|
|
65
|
+
if element.text_representation
|
|
66
|
+
],
|
|
67
|
+
result.reference_passages,
|
|
68
|
+
)
|
|
69
|
+
for result in results
|
|
70
|
+
if result.reference_passages is not None
|
|
71
|
+
]
|
|
61
72
|
)
|
|
62
73
|
|
|
63
74
|
|
ragbits/evaluate/optimizer.py
CHANGED
|
@@ -61,7 +61,7 @@ class Optimizer(WithConstructionConfig):
|
|
|
61
61
|
evaluator_config = EvaluatorConfig.model_validate(optimizer_config.evaluator)
|
|
62
62
|
|
|
63
63
|
dataloader: DataLoader = DataLoader.subclass_from_config(evaluator_config.evaluation.dataloader)
|
|
64
|
-
|
|
64
|
+
metricset: MetricSet = MetricSet.from_config(evaluator_config.evaluation.metrics)
|
|
65
65
|
|
|
66
66
|
pipeline_class = import_by_path(evaluator_config.evaluation.pipeline.type)
|
|
67
67
|
pipeline_config = dict(evaluator_config.evaluation.pipeline.config)
|
|
@@ -71,7 +71,7 @@ class Optimizer(WithConstructionConfig):
|
|
|
71
71
|
return optimizer.optimize(
|
|
72
72
|
pipeline_class=pipeline_class,
|
|
73
73
|
pipeline_config=pipeline_config,
|
|
74
|
-
|
|
74
|
+
metricset=metricset,
|
|
75
75
|
dataloader=dataloader,
|
|
76
76
|
callbacks=callbacks,
|
|
77
77
|
)
|
|
@@ -81,7 +81,7 @@ class Optimizer(WithConstructionConfig):
|
|
|
81
81
|
pipeline_class: type[EvaluationPipeline],
|
|
82
82
|
pipeline_config: dict,
|
|
83
83
|
dataloader: DataLoader,
|
|
84
|
-
|
|
84
|
+
metricset: MetricSet,
|
|
85
85
|
callbacks: list[Callable] | None = None,
|
|
86
86
|
) -> list[tuple[dict, float, dict[str, float]]]:
|
|
87
87
|
"""
|
|
@@ -91,7 +91,7 @@ class Optimizer(WithConstructionConfig):
|
|
|
91
91
|
pipeline_class: Pipeline to be optimized.
|
|
92
92
|
pipeline_config: Configuration defining the optimization process.
|
|
93
93
|
dataloader: Data loader.
|
|
94
|
-
|
|
94
|
+
metricset: Metrics to be optimized.
|
|
95
95
|
callbacks: Experiment callbacks.
|
|
96
96
|
|
|
97
97
|
Returns:
|
|
@@ -104,7 +104,7 @@ class Optimizer(WithConstructionConfig):
|
|
|
104
104
|
pipeline_class=pipeline_class,
|
|
105
105
|
pipeline_config=pipeline_config,
|
|
106
106
|
dataloader=dataloader,
|
|
107
|
-
|
|
107
|
+
metricset=metricset,
|
|
108
108
|
)
|
|
109
109
|
|
|
110
110
|
study = optuna.create_study(direction=self.direction)
|
|
@@ -131,7 +131,7 @@ class Optimizer(WithConstructionConfig):
|
|
|
131
131
|
pipeline_class: type[EvaluationPipeline],
|
|
132
132
|
pipeline_config: dict,
|
|
133
133
|
dataloader: DataLoader,
|
|
134
|
-
|
|
134
|
+
metricset: MetricSet,
|
|
135
135
|
) -> float:
|
|
136
136
|
"""
|
|
137
137
|
Run a single experiment.
|
|
@@ -153,11 +153,11 @@ class Optimizer(WithConstructionConfig):
|
|
|
153
153
|
evaluator.compute(
|
|
154
154
|
pipeline=pipeline,
|
|
155
155
|
dataloader=dataloader,
|
|
156
|
-
|
|
156
|
+
metricset=metricset,
|
|
157
157
|
)
|
|
158
158
|
)
|
|
159
|
-
score = sum(results
|
|
160
|
-
metrics_values = results
|
|
159
|
+
score = sum(results.metrics.values())
|
|
160
|
+
metrics_values = results.metrics
|
|
161
161
|
break
|
|
162
162
|
except Exception as exc:
|
|
163
163
|
message = (
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Iterable
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from types import ModuleType
|
|
4
5
|
from typing import ClassVar, Generic, TypeVar
|
|
@@ -51,7 +52,7 @@ class EvaluationPipeline(WithConstructionConfig, Generic[EvaluationTargetT, Eval
|
|
|
51
52
|
pass
|
|
52
53
|
|
|
53
54
|
@abstractmethod
|
|
54
|
-
async def __call__(self, data: EvaluationDataT) -> EvaluationResultT:
|
|
55
|
+
async def __call__(self, data: Iterable[EvaluationDataT]) -> Iterable[EvaluationResultT]:
|
|
55
56
|
"""
|
|
56
57
|
Run the evaluation pipeline.
|
|
57
58
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from collections.abc import Iterable, Sequence
|
|
1
3
|
from dataclasses import dataclass
|
|
2
4
|
from uuid import uuid4
|
|
3
5
|
|
|
@@ -5,6 +7,7 @@ from typing_extensions import Self
|
|
|
5
7
|
|
|
6
8
|
from ragbits.core.sources.hf import HuggingFaceSource
|
|
7
9
|
from ragbits.document_search import DocumentSearch
|
|
10
|
+
from ragbits.document_search.documents.element import Element
|
|
8
11
|
from ragbits.evaluate.pipelines.base import EvaluationData, EvaluationPipeline, EvaluationResult
|
|
9
12
|
|
|
10
13
|
|
|
@@ -14,7 +17,9 @@ class DocumentSearchData(EvaluationData):
|
|
|
14
17
|
"""
|
|
15
18
|
|
|
16
19
|
question: str
|
|
17
|
-
|
|
20
|
+
reference_document_ids: list[str | int] | None = None
|
|
21
|
+
reference_passages: list[str] | None = None
|
|
22
|
+
reference_page_numbers: list[int] | None = None
|
|
18
23
|
|
|
19
24
|
|
|
20
25
|
@dataclass
|
|
@@ -24,8 +29,10 @@ class DocumentSearchResult(EvaluationResult):
|
|
|
24
29
|
"""
|
|
25
30
|
|
|
26
31
|
question: str
|
|
27
|
-
|
|
28
|
-
|
|
32
|
+
predicted_elements: Sequence[Element]
|
|
33
|
+
reference_document_ids: list[str | int] | None = None
|
|
34
|
+
reference_passages: list[str] | None = None
|
|
35
|
+
reference_page_numbers: list[int] | None = None
|
|
29
36
|
|
|
30
37
|
|
|
31
38
|
class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchData, DocumentSearchResult]):
|
|
@@ -60,7 +67,7 @@ class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchDa
|
|
|
60
67
|
# TODO: optimize this for cases with duplicated document search configs between runs
|
|
61
68
|
if config.get("source"):
|
|
62
69
|
config["vector_store"]["config"]["index_name"] = str(uuid4())
|
|
63
|
-
evaluation_target = DocumentSearch.from_config(config)
|
|
70
|
+
evaluation_target: DocumentSearch = DocumentSearch.from_config(config)
|
|
64
71
|
return cls(evaluation_target=evaluation_target, source=config.get("source"))
|
|
65
72
|
|
|
66
73
|
async def prepare(self) -> None:
|
|
@@ -76,21 +83,24 @@ class DocumentSearchPipeline(EvaluationPipeline[DocumentSearch, DocumentSearchDa
|
|
|
76
83
|
)
|
|
77
84
|
await self.evaluation_target.ingest(sources)
|
|
78
85
|
|
|
79
|
-
async def __call__(self, data: DocumentSearchData) -> DocumentSearchResult:
|
|
86
|
+
async def __call__(self, data: Iterable[DocumentSearchData]) -> Iterable[DocumentSearchResult]:
|
|
80
87
|
"""
|
|
81
88
|
Run the document search evaluation pipeline.
|
|
82
89
|
|
|
83
90
|
Args:
|
|
84
|
-
data: The evaluation data.
|
|
91
|
+
data: The evaluation data batch.
|
|
85
92
|
|
|
86
93
|
Returns:
|
|
87
|
-
The evaluation result.
|
|
94
|
+
The evaluation result batch.
|
|
88
95
|
"""
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
96
|
+
results = await asyncio.gather(*[self.evaluation_target.search(row.question) for row in data])
|
|
97
|
+
return [
|
|
98
|
+
DocumentSearchResult(
|
|
99
|
+
question=row.question,
|
|
100
|
+
predicted_elements=elements,
|
|
101
|
+
reference_document_ids=row.reference_document_ids,
|
|
102
|
+
reference_passages=row.reference_passages,
|
|
103
|
+
reference_page_numbers=row.reference_page_numbers,
|
|
104
|
+
)
|
|
105
|
+
for row, elements in zip(data, results, strict=False)
|
|
106
|
+
]
|
ragbits/evaluate/utils.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import sys
|
|
3
|
+
import traceback
|
|
4
|
+
from dataclasses import asdict
|
|
3
5
|
from datetime import datetime
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
from typing import Any
|
|
@@ -12,13 +14,15 @@ from neptune.utils import stringify_unsupported
|
|
|
12
14
|
from neptune_optuna import NeptuneCallback
|
|
13
15
|
from omegaconf import DictConfig
|
|
14
16
|
|
|
17
|
+
from ragbits.evaluate.evaluator import EvaluatorResult
|
|
15
18
|
|
|
16
|
-
|
|
19
|
+
|
|
20
|
+
def log_evaluation_to_file(result: EvaluatorResult, output_dir: Path | None = None) -> Path:
|
|
17
21
|
"""
|
|
18
|
-
Log the evaluation
|
|
22
|
+
Log the evaluation result locally.
|
|
19
23
|
|
|
20
24
|
Args:
|
|
21
|
-
|
|
25
|
+
result: The evaluation result.
|
|
22
26
|
output_dir: The output directory.
|
|
23
27
|
|
|
24
28
|
Returns:
|
|
@@ -27,27 +31,57 @@ def log_evaluation_to_file(results: dict, output_dir: Path | None = None) -> Pat
|
|
|
27
31
|
output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
|
|
28
32
|
metrics_file = output_dir / "metrics.json"
|
|
29
33
|
results_file = output_dir / "results.json"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
_save_json(
|
|
34
|
+
errors_file = output_dir / "errors.json"
|
|
35
|
+
|
|
36
|
+
_save_json(metrics_file, metrics=result.metrics, time_perf=asdict(result.time_perf))
|
|
37
|
+
_save_json(results_file, results=[asdict(entry) for entry in result.results])
|
|
38
|
+
_save_json(
|
|
39
|
+
errors_file,
|
|
40
|
+
errors=[
|
|
41
|
+
{
|
|
42
|
+
"type": exc.__class__.__name__,
|
|
43
|
+
"message": str(exc),
|
|
44
|
+
"stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
|
|
45
|
+
}
|
|
46
|
+
for exc in result.errors
|
|
47
|
+
],
|
|
48
|
+
)
|
|
33
49
|
|
|
34
50
|
return output_dir
|
|
35
51
|
|
|
36
52
|
|
|
37
|
-
def log_evaluation_to_neptune(
|
|
53
|
+
def log_evaluation_to_neptune(result: EvaluatorResult, config: DictConfig, tags: str | list[str] | None = None) -> None:
|
|
38
54
|
"""
|
|
39
|
-
Log the evaluation
|
|
55
|
+
Log the evaluation result to Neptune.
|
|
40
56
|
|
|
41
57
|
Args:
|
|
42
|
-
|
|
43
|
-
config:
|
|
44
|
-
tags:
|
|
58
|
+
result: The evaluation result.
|
|
59
|
+
config: The evaluation configuration.
|
|
60
|
+
tags: The experiment tags.
|
|
45
61
|
"""
|
|
46
62
|
run = Run(tags=tags)
|
|
47
63
|
run["config"] = stringify_unsupported(config)
|
|
48
|
-
run["evaluation/metrics"] = stringify_unsupported(
|
|
49
|
-
run["evaluation/time_perf"] = stringify_unsupported(
|
|
50
|
-
run["evaluation/results"].upload(
|
|
64
|
+
run["evaluation/metrics"] = stringify_unsupported(result.metrics)
|
|
65
|
+
run["evaluation/time_perf"] = stringify_unsupported(asdict(result.time_perf))
|
|
66
|
+
run["evaluation/results"].upload(
|
|
67
|
+
File.from_content(json.dumps([asdict(entry) for entry in result.results], indent=4), extension="json")
|
|
68
|
+
)
|
|
69
|
+
run["evaluation/errors"].upload(
|
|
70
|
+
File.from_content(
|
|
71
|
+
json.dumps(
|
|
72
|
+
[
|
|
73
|
+
{
|
|
74
|
+
"type": exc.__class__.__name__,
|
|
75
|
+
"message": str(exc),
|
|
76
|
+
"stacktrace": "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)),
|
|
77
|
+
}
|
|
78
|
+
for exc in result.errors
|
|
79
|
+
],
|
|
80
|
+
indent=4,
|
|
81
|
+
),
|
|
82
|
+
extension="json",
|
|
83
|
+
)
|
|
84
|
+
)
|
|
51
85
|
|
|
52
86
|
|
|
53
87
|
def log_dataset_to_file(dataset: Dataset, output_dir: Path | None = None) -> Path:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragbits-evaluate
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.18.0
|
|
4
4
|
Summary: Evaluation module for Ragbits components
|
|
5
5
|
Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
|
|
6
6
|
Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
|
|
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.4.1
|
|
|
27
27
|
Requires-Dist: hydra-core<2.0.0,>=1.3.2
|
|
28
28
|
Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
|
|
29
29
|
Requires-Dist: optuna<5.0.0,>=4.0.0
|
|
30
|
-
Requires-Dist: ragbits-core==0.
|
|
30
|
+
Requires-Dist: ragbits-core==0.18.0
|
|
31
31
|
Provides-Extra: relari
|
|
32
32
|
Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
|
|
33
33
|
Description-Content-Type: text/markdown
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
ragbits/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
ragbits/evaluate/cli.py,sha256=
|
|
2
|
+
ragbits/evaluate/cli.py,sha256=vP8l2DyNXpR6jQP83wXKP_RRnGjEXjKnTVBg9RPbDKo,4505
|
|
3
3
|
ragbits/evaluate/config.py,sha256=2WSmbVxyQi893L2FSjRFQoXkWZp1GetcNmR2GCDe0tA,339
|
|
4
|
-
ragbits/evaluate/evaluator.py,sha256=
|
|
5
|
-
ragbits/evaluate/optimizer.py,sha256=
|
|
4
|
+
ragbits/evaluate/evaluator.py,sha256=awRDaDTubHtM_1SANIeE5GhQ0v9fawF0q1Tj6FWZDvQ,7348
|
|
5
|
+
ragbits/evaluate/optimizer.py,sha256=RqYgoiCIEhjXO0HEN6uwldblHyoPuT3qUdncuHPZgCg,8485
|
|
6
6
|
ragbits/evaluate/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
ragbits/evaluate/utils.py,sha256=
|
|
7
|
+
ragbits/evaluate/utils.py,sha256=w-hbvKRHI9tEva9wKDTVla0Wm2eCHT2MxVkof27Sqfw,4831
|
|
8
8
|
ragbits/evaluate/dataloaders/__init__.py,sha256=UFJFjmvi3GUQFsx6A5sYD01HH2f7TXcHRW2VNM1pmIA,83
|
|
9
9
|
ragbits/evaluate/dataloaders/base.py,sha256=ovL38_tH12q9wd3yeflIlovGuSD8S1X9HUUtwv17QrM,1774
|
|
10
|
-
ragbits/evaluate/dataloaders/document_search.py,sha256=
|
|
10
|
+
ragbits/evaluate/dataloaders/document_search.py,sha256=BLOaXP6TVtSsV2xScY4X_th285hWI4b9lcRuUXNxZ3U,2813
|
|
11
11
|
ragbits/evaluate/dataloaders/exceptions.py,sha256=xUOBLj1JuCkcqzRVnu0A0I_i1THxbDt2MEDVdDGjDyY,735
|
|
12
12
|
ragbits/evaluate/dataset_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
ragbits/evaluate/dataset_generator/pipeline.py,sha256=dgnV-Qm0Z7S1Y6ga9-9RscXxxr3krOKsIj7E9WS4ANk,4940
|
|
@@ -23,13 +23,13 @@ ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py,sha256=ydMHyI0JrWZf
|
|
|
23
23
|
ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
ragbits/evaluate/dataset_generator/tasks/text_generation/base.py,sha256=2h-Y14H3fRHKbTNvXWKRus8t0hdTITd9LMoIFVwfKfA,2138
|
|
25
25
|
ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py,sha256=QAClPbTVNCe4QzVOGuepRnsmkt9ZF6bXBAuJI2elRuE,3851
|
|
26
|
-
ragbits/evaluate/factories/__init__.py,sha256=
|
|
26
|
+
ragbits/evaluate/factories/__init__.py,sha256=7nh0J80EfqMWRGtHx4hkfHNMztfC6FMhH8gHumwcH9w,1727
|
|
27
27
|
ragbits/evaluate/metrics/__init__.py,sha256=Mr83ytGyvdXtBlr7Bbo0-5auE0530xsd3wffKSIf8cE,95
|
|
28
|
-
ragbits/evaluate/metrics/base.py,sha256=
|
|
29
|
-
ragbits/evaluate/metrics/document_search.py,sha256=
|
|
28
|
+
ragbits/evaluate/metrics/base.py,sha256=bOscQ_nJXLGWmP2ls9jncrUoeghNBnKDJsab71pFEjo,2519
|
|
29
|
+
ragbits/evaluate/metrics/document_search.py,sha256=MfvMwEPenqiJdKYuW6WLvmtMch9ZVYb0T6ibpOF3vGI,3189
|
|
30
30
|
ragbits/evaluate/pipelines/__init__.py,sha256=Bqp_L7aRq12Ua19ELZDsdYvra6-GlLrQ9cIG2IWArko,1294
|
|
31
|
-
ragbits/evaluate/pipelines/base.py,sha256=
|
|
32
|
-
ragbits/evaluate/pipelines/document_search.py,sha256=
|
|
33
|
-
ragbits_evaluate-0.
|
|
34
|
-
ragbits_evaluate-0.
|
|
35
|
-
ragbits_evaluate-0.
|
|
31
|
+
ragbits/evaluate/pipelines/base.py,sha256=QV3fjPnbJjeCgcbt8yV1Ho3BamEUc3wSca3MAzaBlV0,1739
|
|
32
|
+
ragbits/evaluate/pipelines/document_search.py,sha256=tgk-I21eshdBbWVsuNa1zWK_fWuDNXhhMCn1_Fdu_Ko,3840
|
|
33
|
+
ragbits_evaluate-0.18.0.dist-info/METADATA,sha256=in_9YOI8UrGJR_tXGPKJYoinR6Ju4Taa0PaK9DQVCnw,2300
|
|
34
|
+
ragbits_evaluate-0.18.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
35
|
+
ragbits_evaluate-0.18.0.dist-info/RECORD,,
|
|
File without changes
|