ragbits-evaluate 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ragbits-evaluate might be problematic.
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/.gitignore +4 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/CHANGELOG.md +10 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/PKG-INFO +3 -3
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/pyproject.toml +2 -2
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -1
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +1 -1
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +1 -1
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/question_answer.py +37 -14
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/README.md +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/cli.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/config.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/evaluator.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/optimizer.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/py.typed +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/utils.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/cli/test_run_evaluation.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/unit/test_evaluator.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/unit/test_metrics.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/unit/test_optimizer.py +0 -0
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/CHANGELOG.md

```diff
@@ -2,6 +2,16 @@

 ## Unreleased

+## 1.1.0 (2025-07-09)
+
+### Changed
+
+- ragbits-core updated to version v1.1.0
+
+- Update qa data loader docstring (#565)
+- Fix deadlock on qa metrics compute (#609)
+- Upgrade distilabel version to 1.5.0 (#682)
+
 ## 1.0.0 (2025-06-04)

 ### Changed
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 1.0.0
+Version: 1.1.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -23,11 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: datasets<4.0.0,>=3.0.1
-Requires-Dist: distilabel<2.0.0,>=1.
+Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==1.0.0
+Requires-Dist: ragbits-core==1.1.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "1.0.0"
+version = "1.1.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.1.0"]

 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/question_answer.py

```diff
@@ -27,7 +27,6 @@ class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
         Args:
             source: The source to load the data from.
             split: The split to load the data from.
-            required_keys: The required keys to load the data from.
             question_key: The dataset column name that contains the question.
             answer_key: The dataset column name that contains the answer.
             context_key: The dataset column name that contains the context. Context is optional.
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py

```diff
@@ -1,6 +1,6 @@
 from typing import Any

-from distilabel.llms import LLM
+from distilabel.models import LLM

 from ragbits.evaluate.dataset_generator.tasks.text_generation.base import BaseDistilabelTask
 from ragbits.evaluate.dataset_generator.utils import get_closest_substring, get_passages_list
```
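This change (and the matching one-line edit in `base.py`) tracks the distilabel 1.5.0 upgrade, which moved the LLM classes into the new `distilabel.models` namespace and deprecated the old `distilabel.llms` path. ragbits simply switches to the new import; a codebase that had to support both distilabel generations could use a fallback import along these lines (an illustrative sketch, not part of ragbits):

```python
# Illustrative compatibility shim, not ragbits code: resolve the LLM base
# class from whichever namespace the installed distilabel version provides.
try:
    from distilabel.models import LLM  # distilabel >= 1.5.0
except ImportError:
    from distilabel.llms import LLM  # pre-1.5 layout
```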
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/question_answer.py

```diff
@@ -1,5 +1,6 @@
 import asyncio
 from abc import ABC, abstractmethod
+from asyncio import AbstractEventLoop
 from itertools import chain
 from typing import Generic, TypeVar

@@ -27,8 +28,9 @@ class _MetricLMM(LLMInterface):
     Implementation of required interface of Relari generative metrics based on LiteLMM.
     """

-    def __init__(self, llm: LLM) -> None:
+    def __init__(self, llm: LLM, loop: AbstractEventLoop) -> None:
         self._llm = llm
+        self._loop = loop

     def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
         formatted_prompt = [
@@ -39,7 +41,10 @@ class _MetricLMM(LLMInterface):
             temperature=temperature,
             max_tokens=max_tokens,
         )
-        return asyncio.
+        return asyncio.run_coroutine_threadsafe(
+            self._llm.generate(formatted_prompt, options=options),
+            self._loop,
+        ).result()


 class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
```
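This hunk is the heart of the deadlock fix (#609). The Relari metric objects are synchronous and, as the next hunk shows, ragbits invokes them on worker threads via `asyncio.to_thread`, yet each invocation ultimately needs the async `generate` call of the ragbits LLM client. The old call (truncated in this diff view) is replaced by `asyncio.run_coroutine_threadsafe`, which submits the coroutine to the main event loop and blocks only the worker thread on `.result()`, leaving the loop free to actually run the coroutine. A self-contained sketch of the pattern (names are illustrative, not ragbits APIs):

```python
# Minimal sketch of the bridge: a sync function running in a worker thread
# submits a coroutine to the main event loop and blocks only that thread.
import asyncio


async def generate(prompt: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for the async LLM call
    return f"answer to: {prompt}"


def run_sync(prompt: str, loop: asyncio.AbstractEventLoop) -> str:
    # Submit the coroutine to the given loop and block *this* thread only.
    return asyncio.run_coroutine_threadsafe(generate(prompt), loop).result()


async def main() -> None:
    loop = asyncio.get_running_loop()
    # to_thread moves the blocking .result() off the loop's thread, so the
    # loop stays free to execute the submitted coroutines.
    answers = await asyncio.gather(
        *[asyncio.to_thread(run_sync, p, loop) for p in ("q1", "q2")]
    )
    print(answers)


asyncio.run(main())
```

The same call would deadlock if issued from the event loop's own thread, since `.result()` would then block the very loop that has to complete the future; `to_thread` is what makes the blocking wait safe.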
```diff
@@ -60,7 +65,7 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
             weight: Metric value weight in the final score, used during optimization.
         """
         super().__init__(weight=weight)
-        self.
+        self.llm = llm
         self.batch_size = batch_size

     @classmethod
@@ -89,16 +94,18 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
         Returns:
             The computed metric.
         """
+        metric = self.metric_cls(_MetricLMM(self.llm, loop=asyncio.get_running_loop()))
         metric_results = chain.from_iterable(
             [
-                await asyncio.gather(*[asyncio.to_thread(self._call_metric, result) for result in batch])
+                await asyncio.gather(*[asyncio.to_thread(self._call_metric, metric, result) for result in batch])
                 for batch in batched(results, self.batch_size)
             ]
         )
-        return
+        return metric.aggregate(list(metric_results))

+    @staticmethod
     @abstractmethod
-    def _call_metric(
+    def _call_metric(metric: MetricT, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
         """
         Call the metric with the proper arguments.
         """
```
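Two coupled changes follow from the loop-aware `_MetricLMM`: the metric object can only be constructed while a loop is running, so `compute()` now builds it on the spot with `asyncio.get_running_loop()` instead of `__init__` caching it, and `_call_metric` becomes a `@staticmethod` that receives that freshly built metric as an argument rather than reading loop-bound state off `self`. A compressed sketch of the same shape (invented names, not the ragbits classes):

```python
# Sketch of the refactor's shape: loop-bound state is created per compute()
# call and handed to a static worker, never stored on the instance.
import asyncio
from typing import Callable


class BatchMetric:
    async def compute(self, items: list[str]) -> list[str]:
        loop = asyncio.get_running_loop()  # only available once a loop runs

        async def generate(item: str) -> str:
            await asyncio.sleep(0)  # stand-in for the async LLM call
            return item.upper()

        # Sync callable used from worker threads; it closes over the live loop.
        def metric(item: str) -> str:
            return asyncio.run_coroutine_threadsafe(generate(item), loop).result()

        return await asyncio.gather(
            *[asyncio.to_thread(self._call_metric, metric, item) for item in items]
        )

    @staticmethod
    def _call_metric(metric: Callable[[str], str], item: str) -> str:
        # Static on purpose: everything it needs arrives as arguments.
        return metric(item)


print(asyncio.run(BatchMetric().compute(["a", "b"])))
```

Making the worker static keeps every thread's inputs explicit: nothing loop-bound lives on the instance, so a metric object can be created anywhere and still compute safely under whichever loop happens to be running.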
The four concrete metric classes then gain matching static overrides; the removed lines below are cut off in this diff rendering:

```diff
@@ -112,8 +119,12 @@ class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):

     metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerCorrectness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -132,8 +143,12 @@ class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):

     metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedFaithfulness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -152,8 +167,12 @@ class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):

     metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerRelevance,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -171,8 +190,12 @@ class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):

     metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedStyleConsistency,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             answer=(
                 result.predicted_result.content
                 if isinstance(result.predicted_result.content, str)
```