ragbits-evaluate 0.20.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of ragbits-evaluate has been flagged as potentially problematic.

ragbits/evaluate/dataloaders/question_answer.py

@@ -27,7 +27,6 @@ class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
         Args:
             source: The source to load the data from.
             split: The split to load the data from.
-            required_keys: The required keys to load the data from.
             question_key: The dataset column name that contains the question.
             answer_key: The dataset column name that contains the answer.
             context_key: The dataset column name that contains the context. Context is optional.
ragbits/evaluate/dataset_generator/tasks/text_generation/base.py

@@ -2,7 +2,7 @@ import sys
 from abc import ABC, abstractmethod
 from typing import Any

-from distilabel.llms.base import LLM
+from distilabel.models import LLM
 from distilabel.steps.tasks import TextGeneration

 from ragbits.core.prompt import ChatFormat, Prompt
ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py

@@ -1,6 +1,6 @@
 from typing import Any

-from distilabel.llms.base import LLM
+from distilabel.models import LLM

 from ragbits.evaluate.dataset_generator.tasks.text_generation.base import BaseDistilabelTask
 from ragbits.evaluate.dataset_generator.utils import get_closest_substring, get_passages_list
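The two hunks above are the same one-line fix in both task modules: distilabel moved its `LLM` base class out of `distilabel.llms.base` and into `distilabel.models`, which is why the `distilabel>=1.5.0` floor appears in METADATA below. A minimal compatibility sketch for code that must import `LLM` under either layout (illustrative only; ragbits itself targets just the new path):

    # Hypothetical shim: ragbits pins distilabel>=1.5.0 and imports
    # from the new location unconditionally.
    try:
        from distilabel.models import LLM  # distilabel >= 1.5.0
    except ImportError:
        from distilabel.llms.base import LLM  # older distilabel releases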
ragbits/evaluate/metrics/question_answer.py

@@ -1,5 +1,6 @@
 import asyncio
 from abc import ABC, abstractmethod
+from asyncio import AbstractEventLoop
 from itertools import chain
 from typing import Generic, TypeVar

@@ -27,8 +28,9 @@ class _MetricLMM(LLMInterface):
     Implementation of required interface of Relari generative metrics based on LiteLMM.
     """

-    def __init__(self, llm: LLM) -> None:
+    def __init__(self, llm: LLM, loop: AbstractEventLoop) -> None:
         self._llm = llm
+        self._loop = loop

     def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
         formatted_prompt = [
@@ -39,7 +41,10 @@ class _MetricLMM(LLMInterface):
             temperature=temperature,
             max_tokens=max_tokens,
         )
-        return asyncio.run(self._llm.generate(formatted_prompt, options=options))
+        return asyncio.run_coroutine_threadsafe(
+            self._llm.generate(formatted_prompt, options=options),
+            self._loop,
+        ).result()


 class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
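This is the core behavioral change in `metrics/question_answer.py`. Relari's continuous-eval metrics call `_MetricLMM.run` synchronously from worker threads, and the old `asyncio.run(...)` tried to start a brand-new event loop on every call, which fails when the caller already sits inside a running loop. The new code captures the outer loop at construction time and submits the coroutine back to it with `asyncio.run_coroutine_threadsafe`, blocking only the worker thread on `.result()`. A minimal, self-contained sketch of the pattern, with hypothetical names:

    import asyncio

    async def generate(value: int) -> int:
        await asyncio.sleep(0.01)  # stands in for the async LLM call
        return value * 2

    def sync_run(loop: asyncio.AbstractEventLoop, value: int) -> int:
        # Runs in a worker thread, like _MetricLMM.run: schedule the
        # coroutine on the main loop and block this thread on the result.
        future = asyncio.run_coroutine_threadsafe(generate(value), loop)
        return future.result()

    async def main() -> None:
        loop = asyncio.get_running_loop()  # captured once, as in __init__
        results = await asyncio.gather(
            *[asyncio.to_thread(sync_run, loop, v) for v in range(3)]
        )
        print(results)  # [0, 2, 4]

    asyncio.run(main())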
@@ -60,7 +65,7 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
             weight: Metric value weight in the final score, used during optimization.
         """
         super().__init__(weight=weight)
-        self.metric = self.metric_cls(_MetricLMM(llm))
+        self.llm = llm
         self.batch_size = batch_size

     @classmethod
@@ -89,16 +94,18 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
         Returns:
             The computed metric.
         """
+        metric = self.metric_cls(_MetricLMM(self.llm, loop=asyncio.get_running_loop()))
         metric_results = chain.from_iterable(
             [
-                await asyncio.gather(*[asyncio.to_thread(self._call_metric, result) for result in batch])
+                await asyncio.gather(*[asyncio.to_thread(self._call_metric, metric, result) for result in batch])
                 for batch in batched(results, self.batch_size)
             ]
         )
-        return self.metric.aggregate(list(metric_results))
+        return metric.aggregate(list(metric_results))

+    @staticmethod
     @abstractmethod
-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+    def _call_metric(metric: MetricT, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
         """
         Call the metric with the proper arguments.
         """
@@ -112,8 +119,12 @@ class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):

     metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness

-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerCorrectness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -132,8 +143,12 @@ class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):

     metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness

-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedFaithfulness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -152,8 +167,12 @@ class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):

     metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance

-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerRelevance,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -171,8 +190,12 @@ class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):

     metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency

-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedStyleConsistency,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             answer=(
                 result.predicted_result.content
                 if isinstance(result.predicted_result.content, str)
ragbits_evaluate-0.20.1.dist-info/METADATA → ragbits_evaluate-1.1.0.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 0.20.1
+Version: 1.1.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -23,11 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: datasets<4.0.0,>=3.0.1
-Requires-Dist: distilabel<2.0.0,>=1.4.1
+Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==0.20.1
+Requires-Dist: ragbits-core==1.1.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
ragbits_evaluate-0.20.1.dist-info/RECORD → ragbits_evaluate-1.1.0.dist-info/RECORD

@@ -9,7 +9,7 @@ ragbits/evaluate/dataloaders/__init__.py,sha256=UFJFjmvi3GUQFsx6A5sYD01HH2f7TXcH
 ragbits/evaluate/dataloaders/base.py,sha256=x8rEl5utNOziF_9urL0grkqoXwMgaDWYSM5akw3Kt9Y,3213
 ragbits/evaluate/dataloaders/document_search.py,sha256=c9Bc4ZtFEKAiG9B70JFiBZlZDkBSGNWFRKabF7PMTU0,2495
 ragbits/evaluate/dataloaders/exceptions.py,sha256=xUOBLj1JuCkcqzRVnu0A0I_i1THxbDt2MEDVdDGjDyY,735
-ragbits/evaluate/dataloaders/question_answer.py,sha256=naXFDtla0otOTWSyHVvWvgDYEq-Wry4irnAJR2tHMNg,2032
+ragbits/evaluate/dataloaders/question_answer.py,sha256=PvG2n9zSy5bH4NJKgSxgxqHjNozLHPJijuBvryiCq_o,1964
 ragbits/evaluate/dataset_generator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ragbits/evaluate/dataset_generator/pipeline.py,sha256=dgnV-Qm0Z7S1Y6ga9-9RscXxxr3krOKsIj7E9WS4ANk,4940
 ragbits/evaluate/dataset_generator/utils.py,sha256=zD-ksXlX62kkIgzBefE4ILsP7He9bHimnZ63LLsMKCA,1325
@@ -22,17 +22,17 @@ ragbits/evaluate/dataset_generator/tasks/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ragbits/evaluate/dataset_generator/tasks/filter/base.py,sha256=vKSBOaVC5hLzZe2NMS0LrGZwhN07x0M2WcrijXoh5iY,1146
 ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py,sha256=ydMHyI0JrWZfZZqY1EFAZ38SsdYCiXyvrvcDsxJPOBg,988
 ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ragbits/evaluate/dataset_generator/tasks/text_generation/base.py,sha256=2h-Y14H3fRHKbTNvXWKRus8t0hdTITd9LMoIFVwfKfA,2138
-ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py,sha256=QAClPbTVNCe4QzVOGuepRnsmkt9ZF6bXBAuJI2elRuE,3851
+ragbits/evaluate/dataset_generator/tasks/text_generation/base.py,sha256=bSNsswe2AMskmlctslTo0cJSz_cTc5EW1WW7kshIJPQ,2135
+ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py,sha256=vVk2U3KgnutkqtJOKGoQKxMFD9YLbx4IR7jBIm2a2RE,3848
 ragbits/evaluate/factories/__init__.py,sha256=7nh0J80EfqMWRGtHx4hkfHNMztfC6FMhH8gHumwcH9w,1727
 ragbits/evaluate/metrics/__init__.py,sha256=Mr83ytGyvdXtBlr7Bbo0-5auE0530xsd3wffKSIf8cE,95
 ragbits/evaluate/metrics/base.py,sha256=bOscQ_nJXLGWmP2ls9jncrUoeghNBnKDJsab71pFEjo,2519
 ragbits/evaluate/metrics/document_search.py,sha256=MfvMwEPenqiJdKYuW6WLvmtMch9ZVYb0T6ibpOF3vGI,3189
-ragbits/evaluate/metrics/question_answer.py,sha256=_XMFjkJcG-xdOO2fCfoKIhJb5VVM_GK_yKhFGXO8FRM,6566
+ragbits/evaluate/metrics/question_answer.py,sha256=369lOoY76KY-wUxBKl0lSQlJSF0JhmPpehNQYeiWNHg,7072
 ragbits/evaluate/pipelines/__init__.py,sha256=Bqp_L7aRq12Ua19ELZDsdYvra6-GlLrQ9cIG2IWArko,1294
 ragbits/evaluate/pipelines/base.py,sha256=QV3fjPnbJjeCgcbt8yV1Ho3BamEUc3wSca3MAzaBlV0,1739
 ragbits/evaluate/pipelines/document_search.py,sha256=tgk-I21eshdBbWVsuNa1zWK_fWuDNXhhMCn1_Fdu_Ko,3840
 ragbits/evaluate/pipelines/question_answer.py,sha256=3CYVHDLnOy4z7kgYPMluiJ8POulHo-w3PEiqvqsF4Dc,2797
-ragbits_evaluate-0.20.1.dist-info/METADATA,sha256=HdTC5f4iph7a0DAcopzaI55KElixcr0zsAVtUi2S3wE,2300
-ragbits_evaluate-0.20.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ragbits_evaluate-0.20.1.dist-info/RECORD,,
+ragbits_evaluate-1.1.0.dist-info/METADATA,sha256=dYK_B94gLLLfz3qPBEP5iF57nd9jFaOOijqkUGijqT8,2298
+ragbits_evaluate-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ragbits_evaluate-1.1.0.dist-info/RECORD,,