ragbits-evaluate 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ragbits-evaluate might be problematic.
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/.gitignore +4 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/CHANGELOG.md +10 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/PKG-INFO +3 -3
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/pyproject.toml +2 -2
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -1
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +1 -1
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +1 -1
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/question_answer.py +37 -14
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/README.md +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/cli.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/config.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/evaluator.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/optimizer.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/py.typed +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/utils.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/cli/test_run_evaluation.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/unit/test_evaluator.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/unit/test_metrics.py +0 -0
- {ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/tests/unit/test_optimizer.py +0 -0
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/CHANGELOG.md

```diff
@@ -2,6 +2,16 @@

 ## Unreleased

+## 1.1.0 (2025-07-09)
+
+### Changed
+
+- ragbits-core updated to version v1.1.0
+
+- Update qa data loader docstring (#565)
+- Fix deadlock on qa metrics compute (#609)
+- Upgrade distilabel version to 1.5.0 (#682)
+
 ## 1.0.0 (2025-06-04)

 ### Changed
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 1.0.0
+Version: 1.1.0
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -23,11 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: datasets<4.0.0,>=3.0.1
-Requires-Dist: distilabel<2.0.0,>=1.
+Requires-Dist: distilabel<2.0.0,>=1.5.0
 Requires-Dist: hydra-core<2.0.0,>=1.3.2
 Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
 Requires-Dist: optuna<5.0.0,>=4.0.0
-Requires-Dist: ragbits-core==1.0.0
+Requires-Dist: ragbits-core==1.1.0
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "1.0.0"
+version = "1.1.0"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.1.0"]

 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataloaders/question_answer.py

```diff
@@ -27,7 +27,6 @@ class QuestionAnswerDataLoader(DataLoader[QuestionAnswerData]):
         Args:
             source: The source to load the data from.
             split: The split to load the data from.
-            required_keys: The required keys to load the data from.
             question_key: The dataset column name that contains the question.
             answer_key: The dataset column name that contains the answer.
             context_key: The dataset column name that contains the context. Context is optional.
```
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py

```diff
@@ -1,6 +1,6 @@
 from typing import Any

-from distilabel.llms import LLM
+from distilabel.models import LLM

 from ragbits.evaluate.dataset_generator.tasks.text_generation.base import BaseDistilabelTask
 from ragbits.evaluate.dataset_generator.utils import get_closest_substring, get_passages_list
```
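This change (and the matching one-line edit in `base.py`) tracks the distilabel 1.5.0 upgrade, which moved the LLM classes into the new `distilabel.models` namespace and deprecated the old `distilabel.llms` path. ragbits simply switches to the new import; a codebase that had to support both distilabel generations could use a fallback import along these lines (an illustrative sketch, not part of ragbits):

```python
# Illustrative compatibility shim, not ragbits code: resolve the LLM base
# class from whichever namespace the installed distilabel version provides.
try:
    from distilabel.models import LLM  # distilabel >= 1.5.0
except ImportError:
    from distilabel.llms import LLM  # pre-1.5 layout
```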
{ragbits_evaluate-1.0.0 → ragbits_evaluate-1.1.0}/src/ragbits/evaluate/metrics/question_answer.py

```diff
@@ -1,5 +1,6 @@
 import asyncio
 from abc import ABC, abstractmethod
+from asyncio import AbstractEventLoop
 from itertools import chain
 from typing import Generic, TypeVar

@@ -27,8 +28,9 @@ class _MetricLMM(LLMInterface):
     Implementation of required interface of Relari generative metrics based on LiteLMM.
     """

-    def __init__(self, llm: LLM) -> None:
+    def __init__(self, llm: LLM, loop: AbstractEventLoop) -> None:
         self._llm = llm
+        self._loop = loop

     def run(self, prompt: dict[str, str], temperature: float = 0, max_tokens: int = 1024) -> str:
         formatted_prompt = [
@@ -39,7 +41,10 @@ class _MetricLMM(LLMInterface):
             temperature=temperature,
             max_tokens=max_tokens,
         )
-        return asyncio.
+        return asyncio.run_coroutine_threadsafe(
+            self._llm.generate(formatted_prompt, options=options),
+            self._loop,
+        ).result()


 class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
```
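This hunk is the heart of the deadlock fix (#609). The Relari metric objects are synchronous and, as the next hunk shows, ragbits invokes them on worker threads via `asyncio.to_thread`, yet each invocation ultimately needs the async `generate` call of the ragbits LLM client. The old call (truncated in this diff view) is replaced by `asyncio.run_coroutine_threadsafe`, which submits the coroutine to the main event loop and blocks only the worker thread on `.result()`, leaving the loop free to actually run the coroutine. A self-contained sketch of the pattern (names are illustrative, not ragbits APIs):

```python
# Minimal sketch of the bridge: a sync function running in a worker thread
# submits a coroutine to the main event loop and blocks only that thread.
import asyncio


async def generate(prompt: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for the async LLM call
    return f"answer to: {prompt}"


def run_sync(prompt: str, loop: asyncio.AbstractEventLoop) -> str:
    # Submit the coroutine to the given loop and block *this* thread only.
    return asyncio.run_coroutine_threadsafe(generate(prompt), loop).result()


async def main() -> None:
    loop = asyncio.get_running_loop()
    # to_thread moves the blocking .result() off the loop's thread, so the
    # loop stays free to execute the submitted coroutines.
    answers = await asyncio.gather(
        *[asyncio.to_thread(run_sync, p, loop) for p in ("q1", "q2")]
    )
    print(answers)


asyncio.run(main())
```

The same call would deadlock if issued from the event loop's own thread, since `.result()` would then block the very loop that has to complete the future; `to_thread` is what makes the blocking wait safe.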
```diff
@@ -60,7 +65,7 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
             weight: Metric value weight in the final score, used during optimization.
         """
         super().__init__(weight=weight)
-        self.
+        self.llm = llm
         self.batch_size = batch_size

     @classmethod
@@ -89,16 +94,18 @@ class QuestionAnswerMetric(Generic[MetricT], Metric[QuestionAnswerResult], ABC):
         Returns:
             The computed metric.
         """
+        metric = self.metric_cls(_MetricLMM(self.llm, loop=asyncio.get_running_loop()))
         metric_results = chain.from_iterable(
             [
-                await asyncio.gather(*[asyncio.to_thread(self._call_metric, result) for result in batch])
+                await asyncio.gather(*[asyncio.to_thread(self._call_metric, metric, result) for result in batch])
                 for batch in batched(results, self.batch_size)
             ]
         )
-        return
+        return metric.aggregate(list(metric_results))

+    @staticmethod
     @abstractmethod
-    def _call_metric(
+    def _call_metric(metric: MetricT, result: QuestionAnswerResult[QuestionAnswerPromptOutputT]) -> dict:
         """
         Call the metric with the proper arguments.
         """
```
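Two coupled changes follow from the loop-aware `_MetricLMM`: the metric object can only be constructed while a loop is running, so `compute()` now builds it on the spot with `asyncio.get_running_loop()` instead of `__init__` caching it, and `_call_metric` becomes a `@staticmethod` that receives that freshly built metric as an argument rather than reading loop-bound state off `self`. A compressed sketch of the same shape (invented names, not the ragbits classes):

```python
# Sketch of the refactor's shape: loop-bound state is created per compute()
# call and handed to a static worker, never stored on the instance.
import asyncio
from typing import Callable


class BatchMetric:
    async def compute(self, items: list[str]) -> list[str]:
        loop = asyncio.get_running_loop()  # only available once a loop runs

        async def generate(item: str) -> str:
            await asyncio.sleep(0)  # stand-in for the async LLM call
            return item.upper()

        # Sync callable used from worker threads; it closes over the live loop.
        def metric(item: str) -> str:
            return asyncio.run_coroutine_threadsafe(generate(item), loop).result()

        return await asyncio.gather(
            *[asyncio.to_thread(self._call_metric, metric, item) for item in items]
        )

    @staticmethod
    def _call_metric(metric: Callable[[str], str], item: str) -> str:
        # Static on purpose: everything it needs arrives as arguments.
        return metric(item)


print(asyncio.run(BatchMetric().compute(["a", "b"])))
```

Making the worker static keeps every thread's inputs explicit: nothing loop-bound lives on the instance, so a metric object can be created anywhere and still compute safely under whichever loop happens to be running.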
The four concrete metric classes then gain matching static overrides; the removed lines below are cut off in this diff rendering:

```diff
@@ -112,8 +119,12 @@ class QuestionAnswerAnswerCorrectness(QuestionAnswerMetric[LLMBasedAnswerCorrectness]):

     metric_cls: type[LLMBasedAnswerCorrectness] = LLMBasedAnswerCorrectness

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerCorrectness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -132,8 +143,12 @@ class QuestionAnswerAnswerFaithfulness(QuestionAnswerMetric[LLMBasedFaithfulness]):

     metric_cls: type[LLMBasedFaithfulness] = LLMBasedFaithfulness

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedFaithfulness,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -152,8 +167,12 @@ class QuestionAnswerAnswerRelevance(QuestionAnswerMetric[LLMBasedAnswerRelevance]):

     metric_cls: type[LLMBasedAnswerRelevance] = LLMBasedAnswerRelevance

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedAnswerRelevance,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             question=result.question,
             answer=(
                 result.predicted_result.content
@@ -171,8 +190,12 @@ class QuestionAnswerAnswerConsistency(QuestionAnswerMetric[LLMBasedStyleConsistency]):

     metric_cls: type[LLMBasedStyleConsistency] = LLMBasedStyleConsistency

-
-
+    @staticmethod
+    def _call_metric(
+        metric: LLMBasedStyleConsistency,
+        result: QuestionAnswerResult[QuestionAnswerPromptOutputT],
+    ) -> dict:
+        return metric(
             answer=(
                 result.predicted_result.content
                 if isinstance(result.predicted_result.content, str)
```