ragbits-evaluate 0.20.1.tar.gz → 1.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ragbits-evaluate might be problematic.

Files changed (44)
  1. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/.gitignore +4 -0
  2. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/CHANGELOG.md +16 -0
  3. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/PKG-INFO +3 -3
  4. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/pyproject.toml +2 -2
  5. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -1
  6. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +1 -1
  7. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +1 -1
  8. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/question_answer.py +37 -14
  9. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/README.md +0 -0
  10. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/__init__.py +0 -0
  11. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/cli.py +0 -0
  12. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/config.py +0 -0
  13. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
  14. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
  15. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
  16. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
  17. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
  18. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
  19. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  20. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
  21. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
  22. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  23. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
  24. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  25. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
  26. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
  27. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  28. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
  29. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/evaluator.py +0 -0
  30. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
  31. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
  32. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/base.py +0 -0
  33. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
  34. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/optimizer.py +0 -0
  35. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
  36. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
  37. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
  38. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
  39. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/py.typed +0 -0
  40. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/utils.py +0 -0
  41. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/tests/cli/test_run_evaluation.py +0 -0
  42. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/tests/unit/test_evaluator.py +0 -0
  43. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/tests/unit/test_metrics.py +0 -0
  44. {ragbits_evaluate-0.20.1 → ragbits_evaluate-1.1.0}/tests/unit/test_optimizer.py +0 -0
.gitignore
@@ -9,6 +9,9 @@ venv/
 __pycache__/
 **.egg-info/
 
+# Local cursor rules
+.cursor/rules/local/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -101,3 +104,4 @@ qdrant/
 .aider*
 
 .DS_Store
+node_modules/
CHANGELOG.md
@@ -2,6 +2,22 @@
 
 ## Unreleased
 
+## 1.1.0 (2025-07-09)
+
+### Changed
+
+- ragbits-core updated to version v1.1.0
+
+- Update qa data loader docstring (#565)
+- Fix deadlock on qa metrics compute (#609)
+- Upgrade distilabel version to 1.5.0 (#682)
+
+## 1.0.0 (2025-06-04)
+
+### Changed
+
+- ragbits-core updated to version v1.0.0
+
 ## 0.20.1 (2025-06-04)
 
 ### Changed
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 0.20.1
+Version: 1.1.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -23,11 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: datasets<4.0.0,>=3.0.1
-Requires-Dist: distilabel<2.0.0,>=1.4.1
+Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==0.20.1
+Requires-Dist: ragbits-core==1.1.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "0.20.1"
+version = "1.1.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.4.1,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==0.20.1"]
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.1.0"]
 
 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
src/ragbits/evaluate/dataloaders/question_answer.py
@@ -27,7 +27,6 @@ class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
         Args:
             source: The source to load the data from.
             split: The split to load the data from.
-            required_keys: The required keys to load the data from.
             question_key: The dataset column name that contains the question.
             answer_key: The dataset column name that contains the answer.
             context_key: The dataset column name that contains the context. Context is optional.
src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py
@@ -2,7 +2,7 @@ import sys
 from abc import ABC, abstractmethod
 from typing import Any
 
-from distilabel.llms.base import LLM
+from distilabel.models import LLM
 from distilabel.steps.tasks import TextGeneration
 
 from ragbits.core.prompt import ChatFormat, Prompt
src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py
@@ -1,6 +1,6 @@
 from typing import Any
 
-from distilabel.llms.base import LLM
+from distilabel.models import LLM
 
 from ragbits.evaluate.dataset_generator.tasks.text_generation.base import BaseDistilabelTask
 from ragbits.evaluate.dataset_generator.utils import get_closest_substring, get_passages_list
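Both dataset-generation tasks switch to the LLM import location used by newer distilabel releases, in line with the distilabel>=1.5.0 bump above. A minimal sketch of a compatibility import, assuming only the module path changed between the two version ranges (illustrative, not ragbits code):

```python
# Hedged compatibility shim: prefer the distilabel >= 1.5 location and fall
# back to the pre-1.5 path if an older release is installed.
try:
    from distilabel.models import LLM  # distilabel >= 1.5.0
except ImportError:  # older distilabel (< 1.5.0)
    from distilabel.llms.base import LLM
```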
src/ragbits/evaluate/metrics/question_answer.py
@@ -1,5 +1,6 @@
 import asyncio
 from abc import ABC, abstractmethod
+from asyncio import AbstractEventLoop
 from itertools import chain
 from typing import Generic, TypeVar
 
@@ -27,8 +28,9 @@ class _MetricLMM(LLMInterface):
     Implementation of required interface of Relari generative metrics based on LiteLMM.
     """
 
-    def __init__(self, llm: LLM) -> None:
+    def __init__(self, llm: LLM, loop: AbstractEventLoop) -> None:
         self._llm = llm
+        self._loop = loop
 
     def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
         formatted_prompt = [
@@ -39,7 +41,10 @@ class _MetricLMM(LLMInterface):
             temperature=temperature,
             max_tokens=max_tokens,
         )
-        return asyncio.run(self._llm.generate(formatted_prompt, options=options))
+        return asyncio.run_coroutine_threadsafe(
+            self._llm.generate(formatted_prompt, options=options),
+            self._loop,
+        ).result()
 
 
 class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
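This is the core of the deadlock fix noted in the changelog (#609): instead of spinning up a fresh event loop in each worker thread with asyncio.run, _MetricLMM now schedules the coroutine onto the evaluator's running loop and blocks only the worker thread on the returned future. A self-contained sketch of the same pattern, with illustrative names (fake_llm_generate and sync_scorer are not ragbits API):

```python
# Minimal sketch: a synchronous callback running under asyncio.to_thread()
# submits its coroutine back to the main loop via run_coroutine_threadsafe(),
# so the async LLM call stays on the loop it belongs to.
import asyncio


async def fake_llm_generate(prompt: str) -> str:
    # Stand-in for an async LLM call bound to the main event loop.
    await asyncio.sleep(0.01)
    return f"answer to: {prompt}"


def sync_scorer(prompt: str, loop: asyncio.AbstractEventLoop) -> str:
    # Runs inside a worker thread; blocks this thread only, never the loop.
    future = asyncio.run_coroutine_threadsafe(fake_llm_generate(prompt), loop)
    return future.result()


async def main() -> None:
    loop = asyncio.get_running_loop()
    answers = await asyncio.gather(
        *[asyncio.to_thread(sync_scorer, f"q{i}", loop) for i in range(3)]
    )
    print(answers)


if __name__ == "__main__":
    asyncio.run(main())
```

The key property is that the main loop keeps running while the worker threads wait, so the coroutines they schedule can actually complete.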
@@ -60,7 +65,7 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
             weight: Metric value weight in the final score, used during optimization.
         """
         super().__init__(weight=weight)
-        self.metric = self.metric_cls(_MetricLMM(llm))
+        self.llm = llm
         self.batch_size = batch_size
 
     @classmethod
@@ -89,16 +94,18 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
         Returns:
             The computed metric.
         """
+        metric = self.metric_cls(_MetricLMM(self.llm, loop=asyncio.get_running_loop()))
         metric_results = chain.from_iterable(
             [
-                await asyncio.gather(*[asyncio.to_thread(self._call_metric, result) for result in batch])
+                await asyncio.gather(*[asyncio.to_thread(self._call_metric, metric, result) for result in batch])
                 for batch in batched(results, self.batch_size)
             ]
        )
-        return self.metric.aggregate(list(metric_results))
+        return metric.aggregate(list(metric_results))
 
+    @staticmethod
     @abstractmethod
-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
+    def _call_metric(metric: MetricT, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
         """
         Call the metric with the proper arguments.
         """
@@ -112,8 +119,12 @@ class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):
 
     metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness
 
-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerCorrectness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -132,8 +143,12 @@ class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):
 
     metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness
 
-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedFaithfulness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -152,8 +167,12 @@ class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):
 
     metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance
 
-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerRelevance,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -171,8 +190,12 @@ class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):
 
     metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency
 
-    def _call_metric(self, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
-        return self.metric(
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedStyleConsistency,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             answer=(
                 result.predicted_result.content
                 if isinstance(result.predicted_result.content, str)
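Across the concrete metrics, _call_metric becomes a staticmethod that receives the relari scorer explicitly, and compute() now builds that scorer per call with the currently running loop (the added metric = self.metric_cls(...) line above), so nothing captured at construction time can reference a loop that is not running. A generic sketch of that shape with stand-in names (ScorerStub and ExampleMetric are illustrative, not ragbits or relari API):

```python
# Generic sketch: build the scorer inside compute() with the running loop and
# pass it explicitly to a staticmethod, instead of storing it on self at init.
import asyncio


class ScorerStub:
    """Stands in for a relari metric such as LLMBasedAnswerCorrectness."""

    def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
        self._loop = loop  # would be used to call back into the running loop

    def __call__(self, answer: str) -> dict:
        return {"score": float(bool(answer))}

    def aggregate(self, rows: list[dict]) -> dict:
        return {"mean": sum(row["score"] for row in rows) / len(rows)}


class ExampleMetric:
    @staticmethod
    def _call_metric(scorer: ScorerStub, answer: str) -> dict:
        # No access to self: safe to hand to worker threads with the scorer.
        return scorer(answer=answer)

    async def compute(self, answers: list[str]) -> dict:
        scorer = ScorerStub(loop=asyncio.get_running_loop())
        rows = await asyncio.gather(
            *[asyncio.to_thread(self._call_metric, scorer, a) for a in answers]
        )
        return scorer.aggregate(rows)


if __name__ == "__main__":
    print(asyncio.run(ExampleMetric().compute(["yes", "", "maybe"])))
```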