ragbits-evaluate 0.16.0.tar.gz → 0.17.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ragbits-evaluate might be problematic.
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/CHANGELOG.md +15 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/PKG-INFO +7 -6
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/pyproject.toml +2 -2
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/cli.py +39 -38
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/config.py +11 -0
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/__init__.py +3 -0
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/base.py +62 -0
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/document_search.py +45 -0
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/exceptions.py +25 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/evaluator.py +40 -22
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/factories/__init__.py +15 -2
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/metrics/base.py +15 -11
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/metrics/document_search.py +1 -1
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/optimizer.py +12 -12
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/pipelines/__init__.py +4 -4
- ragbits_evaluate-0.17.1/src/ragbits/evaluate/pipelines/base.py +63 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/pipelines/document_search.py +21 -11
- ragbits_evaluate-0.17.1/tests/cli/test_run_evaluation.py +50 -0
- ragbits_evaluate-0.17.1/tests/unit/test_evaluator.py +105 -0
- ragbits_evaluate-0.17.1/tests/unit/test_metrics.py +183 -0
- ragbits_evaluate-0.17.1/tests/unit/test_optimizer.py +117 -0
- ragbits_evaluate-0.16.0/src/ragbits/evaluate/config.py +0 -13
- ragbits_evaluate-0.16.0/src/ragbits/evaluate/dataloaders/__init__.py +0 -21
- ragbits_evaluate-0.16.0/src/ragbits/evaluate/dataloaders/base.py +0 -21
- ragbits_evaluate-0.16.0/src/ragbits/evaluate/dataloaders/hf.py +0 -29
- ragbits_evaluate-0.16.0/src/ragbits/evaluate/dataloaders/local.py +0 -45
- ragbits_evaluate-0.16.0/src/ragbits/evaluate/pipelines/base.py +0 -41
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/.gitignore +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/README.md +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/metrics/__init__.py +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/py.typed +0 -0
- {ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/utils.py +0 -0
{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/CHANGELOG.md
RENAMED
@@ -2,6 +2,21 @@

 ## Unreleased

+## 0.17.1 (2025-05-09)
+
+### Changed
+
+- ragbits-core updated to version v0.17.1
+
+## 0.17.0 (2025-05-06)
+
+### Changed
+
+- ragbits-core updated to version v0.17.0
+
+- Add tests for ragbits-evaluate package (#390)
+- Integrate sources with dataloaders (#529)
+
 ## 0.16.0 (2025-04-29)

 ### Changed
{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragbits-evaluate
-Version: 0.16.0
+Version: 0.17.1
 Summary: Evaluation module for Ragbits components
 Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
 Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -22,11 +22,12 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: …
-Requires-Dist: optuna…
-Requires-Dist: …
+Requires-Dist: datasets<4.0.0,>=3.0.1
+Requires-Dist: distilabel<2.0.0,>=1.4.1
+Requires-Dist: hydra-core<2.0.0,>=1.3.2
+Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
+Requires-Dist: optuna<5.0.0,>=4.0.0
+Requires-Dist: ragbits-core==0.17.1
 Provides-Extra: relari
 Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
 Description-Content-Type: text/markdown
{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "ragbits-evaluate"
-version = "0.16.0"
+version = "0.17.1"
 description = "Evaluation module for Ragbits components"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["hydra-core…
+dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.4.1,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==0.17.1"]

 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/cli.py
RENAMED
@@ -8,12 +8,13 @@ from pydantic import BaseModel

 from ragbits.cli._utils import get_instance_or_exit
 from ragbits.cli.state import print_output
-from ragbits.core.utils.config_handling import WithConstructionConfig
+from ragbits.core.utils.config_handling import WithConstructionConfig
 from ragbits.evaluate.config import eval_config
-from ragbits.evaluate.dataloaders import DataLoader
+from ragbits.evaluate.dataloaders import DataLoader
 from ragbits.evaluate.evaluator import Evaluator
 from ragbits.evaluate.metrics.base import MetricSet
 from ragbits.evaluate.pipelines import get_evaluation_pipeline_for_target
+from ragbits.evaluate.pipelines.base import EvaluationPipeline

 eval_app = typer.Typer(no_args_is_help=True)

@@ -30,9 +31,9 @@ def register(app: typer.Typer) -> None:

 @dataclass
 class _CLIState:
-    evaluation_target: WithConstructionConfig | None = None
-    metrics: MetricSet | None = None
     dataloader: DataLoader | None = None
+    pipeline: EvaluationPipeline | None = None
+    metrics: MetricSet | None = None


 class EvaluationResult(BaseModel):
@@ -46,26 +47,18 @@ state: _CLIState = _CLIState()

 @eval_app.callback()
 def common_args(
-…
-        str,
-        typer.Option(
-            help="A path to target class to be evaluated in a format python.path:ModuleName",
-            exists=True,
-            resolve_path=True,
-        ),
-    ],
-    dataloader_args: Annotated[
-        str,
+    dataloader_factory_path: Annotated[
+        str | None,
         typer.Option(
-            help="…
+            help="A path to evaluation data loader factory in format python.path:function_name",
             exists=True,
             resolve_path=True,
         ),
-    ],
-…
-…
+    ] = None,
+    dataloader_yaml_path: Annotated[
+        Path | None,
         typer.Option(
-            help="…
+            help="A path to evaluation data loader configuration",
             exists=True,
             resolve_path=True,
         ),
@@ -73,7 +66,7 @@ def common_args(
     target_factory_path: Annotated[
         str | None,
         typer.Option(
-            help="A path to a factory of the target class in format: python.path:function_name",
+            help="A path to a factory of the evaluation target class in format: python.path:function_name",
             exists=True,
             resolve_path=True,
         ),
@@ -81,7 +74,7 @@ def common_args(
     target_yaml_path: Annotated[
         Path | None,
         typer.Option(
-            help="A path to a YAML configuration file of the target class",
+            help="A path to a YAML configuration file of the evaluation target class",
             exists=True,
             resolve_path=True,
         ),
@@ -106,40 +99,48 @@ def common_args(
     """
     Common arguments for the evaluate commands.
     """
-…
-…
+    evaluation_target = get_instance_or_exit(
+        cls=WithConstructionConfig,
         factory_path=target_factory_path,
         yaml_path=target_yaml_path,
+        config_override=eval_config,
     )
-…
-…
-…
+    state.pipeline = get_evaluation_pipeline_for_target(evaluation_target)
+    # TODO: validate if given dataloader is suitable for evaluation pipeline
+    state.dataloader = get_instance_or_exit(
+        cls=DataLoader,
+        factory_path=dataloader_factory_path,
+        yaml_path=dataloader_yaml_path,
+        config_override=eval_config,
     )
-    # TODO validate if given…
-    state.…
-…
+    # TODO: validate if given metric set is suitable for evaluation pipeline
+    state.metrics = get_instance_or_exit(
+        cls=MetricSet,
+        factory_path=metrics_factory_path,
+        yaml_path=metrics_yaml_path,
+        config_override=eval_config,
     )


 @eval_app.command()
-def …
+def run() -> None:
     """
-    Evaluate the …
+    Evaluate the pipeline.
     """

     async def run() -> None:
-        if state.…
-            raise ValueError("Evaluation …
+        if state.dataloader is None:
+            raise ValueError("Evaluation dataloader not initialized")
+        if state.pipeline is None:
+            raise ValueError("Evaluation pipeline not initialized")
         if state.metrics is None:
             raise ValueError("Evaluation metrics not initialized")
-…
-            raise ValueError("Dataloader not initialized")
-        evaluation_pipeline = get_evaluation_pipeline_for_target(evaluation_target=state.evaluation_target)
+
         evaluator = Evaluator()
         metric_results = await evaluator.compute(
-            pipeline=…
-            metrics=state.metrics,
+            pipeline=state.pipeline,
             dataloader=state.dataloader,
+            metrics=state.metrics,
         )
         evaluation_results = EvaluationResult(
             metrics={"metrics": metric_results["metrics"], "time_perf": metric_results["time_perf"]}
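The reworked CLI resolves the data loader, evaluation pipeline, and metric set from factory or YAML paths. The same three pieces can also be supplied as one nested config to `Evaluator.run_from_config`, whose schema (`EvaluatorConfig`/`EvaluationConfig`) appears in evaluator.py further down. A hedged sketch; the class paths and config payloads below are illustrative only and follow ragbits' `type`/`config` object-construction convention, not values taken from this diff:

```python
import asyncio

from ragbits.evaluate.evaluator import Evaluator

# Illustrative config: the nesting mirrors EvaluatorConfig -> EvaluationConfig,
# but the concrete class paths and options are assumptions for this sketch.
config = {
    "evaluation": {
        "pipeline": {
            "type": "ragbits.evaluate.pipelines.document_search:DocumentSearchPipeline",
            "config": {},  # pipeline-specific settings would go here
        },
        "dataloader": {
            "type": "ragbits.evaluate.dataloaders.document_search:DocumentSearchDataLoader",
            "config": {
                "source": {
                    "type": "ragbits.core.sources.hf:HuggingFaceSource",
                    "config": {"path": "deepsense-ai/synthetic-rag-dataset_v1.0"},
                },
            },
        },
        "metrics": {
            "precision_recall_f1": {
                "type": "ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1",
                "config": {},  # metric-specific settings, e.g. a matching strategy
            },
        },
    },
    "evaluator": {"batch_size": 10},
}

results = asyncio.run(Evaluator.run_from_config(config))
print(results["metrics"], results["time_perf"])
```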
ragbits_evaluate-0.17.1/src/ragbits/evaluate/config.py
@@ -0,0 +1,11 @@
+from ragbits.core.config import CoreConfig
+from ragbits.core.utils._pyproject import get_config_instance
+
+
+class EvaluateConfig(CoreConfig):
+    """
+    Configuration for the ragbits-evaluate package, loaded from downstream projects' pyproject.toml files.
+    """
+
+
+eval_config = get_config_instance(EvaluateConfig, subproject="evaluate")
ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/base.py
@@ -0,0 +1,62 @@
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from types import ModuleType
+from typing import ClassVar, Generic
+
+from pydantic import BaseModel
+from typing_extensions import Self
+
+from ragbits.core.sources.base import Source
+from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
+from ragbits.evaluate import dataloaders
+from ragbits.evaluate.pipelines.base import EvaluationDataT
+
+
+class DataLoaderConfig(BaseModel):
+    """
+    Schema for the data loader config.
+    """
+
+    source: ObjectConstructionConfig
+
+
+class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
+    """
+    Evaluation data loader.
+    """
+
+    default_module: ClassVar[ModuleType | None] = dataloaders
+    configuration_key: ClassVar[str] = "dataloader"
+
+    def __init__(self, source: Source) -> None:
+        """
+        Initialize the data loader.
+
+        Args:
+            source: The source to load the evaluation data from.
+        """
+        self.source = source
+
+    @classmethod
+    def from_config(cls, config: dict) -> Self:
+        """
+        Create an instance of `DataLoader` from a configuration dictionary.
+
+        Args:
+            config: A dictionary containing configuration settings for the data loader.
+
+        Returns:
+            An instance of the data loader class initialized with the provided configuration.
+        """
+        dataloader_config = DataLoaderConfig.model_validate(config)
+        config["source"] = Source.subclass_from_config(dataloader_config.source)
+        return super().from_config(config)
+
+    @abstractmethod
+    async def load(self) -> Iterable[EvaluationDataT]:
+        """
+        Load the data.
+
+        Returns:
+            The loaded data.
+        """
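The base class couples a `Source` with an async `load()`, so a custom loader only implements the loading and formatting step. A minimal hypothetical sketch assuming a JSON-lines file with `question`/`passages` keys (the class name and file format are assumptions, not part of the package):

```python
import json
from collections.abc import Iterable

from ragbits.evaluate.dataloaders.base import DataLoader
from ragbits.evaluate.pipelines.document_search import DocumentSearchData


class JsonlDocumentSearchDataLoader(DataLoader[DocumentSearchData]):
    """Hypothetical loader: read a JSON-lines file with "question" and "passages" keys."""

    async def load(self) -> Iterable[DocumentSearchData]:
        # Source.fetch() resolves the configured source to a local file path.
        data_path = await self.source.fetch()
        return [
            DocumentSearchData(
                question=record["question"],
                reference_passages=record["passages"],
            )
            for record in map(json.loads, data_path.read_text().splitlines())
        ]
```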
ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/document_search.py
@@ -0,0 +1,45 @@
+from collections.abc import Iterable
+
+from datasets import load_dataset
+
+from ragbits.evaluate.dataloaders.base import DataLoader
+from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
+from ragbits.evaluate.pipelines.document_search import DocumentSearchData
+
+
+class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
+    """
+    Document search evaluation data loader.
+
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files)
+    and contain the following features: "question", "passages".
+    """
+
+    async def load(self) -> Iterable[DocumentSearchData]:
+        """
+        Load the data from source and format them.
+
+        Returns:
+            The document search evaluation data.
+
+        Raises:
+            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+        """
+        data_path = await self.source.fetch()
+        dataset = load_dataset(
+            path=str(data_path.parent),
+            split=data_path.stem,
+        )
+        if "question" not in dataset.features or "passages" not in dataset.features:
+            raise DataLoaderIncorrectFormatDataError(
+                required_features=["question", "passages"],
+                data_path=data_path,
+            )
+
+        return [
+            DocumentSearchData(
+                question=data["question"],
+                reference_passages=data["passages"],
+            )
+            for data in dataset
+        ]
ragbits_evaluate-0.17.1/src/ragbits/evaluate/dataloaders/exceptions.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+
+class DataLoaderError(Exception):
+    """
+    Class for all exceptions raised by the data loader.
+    """
+
+    def __init__(self, message: str, data_path: Path) -> None:
+        super().__init__(message)
+        self.message = message
+        self.data_path = data_path
+
+
+class DataLoaderIncorrectFormatDataError(DataLoaderError):
+    """
+    Raised when the data are incorrectly formatted.
+    """
+
+    def __init__(self, required_features: list[str], data_path: Path) -> None:
+        super().__init__(
+            message=f"Dataset {data_path} is incorrectly formatted. Required features: {required_features}",
+            data_path=data_path,
+        )
+        self.required_features = required_features
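Taken together, a loader is constructed from a source and consumed with `load()`, and formatting problems surface as the dedicated exception. A short usage sketch; the dataset path mirrors the `synthetic_rag_dataset` factory shown later in factories/__init__.py and is only illustrative here:

```python
import asyncio

from ragbits.core.sources.hf import HuggingFaceSource
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader
from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError

loader = DocumentSearchDataLoader(
    source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"),
)

try:
    dataset = asyncio.run(loader.load())
except DataLoaderIncorrectFormatDataError as exc:
    # The exception carries the offending path and the features the loader expected.
    print(f"{exc.data_path}: missing one of {exc.required_features}")
else:
    print(f"Loaded {len(list(dataset))} evaluation records")
```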
{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/evaluator.py
RENAMED
@@ -9,30 +9,46 @@ from tqdm.asyncio import tqdm
 from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
 from ragbits.evaluate.dataloaders.base import DataLoader
 from ragbits.evaluate.metrics.base import MetricSet
-from ragbits.evaluate.pipelines.base import EvaluationPipeline, …
+from ragbits.evaluate.pipelines.base import EvaluationDataT, EvaluationPipeline, EvaluationResultT, EvaluationTargetT


-class …
+class EvaluationConfig(BaseModel):
     """
-    Schema for …
+    Schema for the evaluation run config.
     """

-    dataloader: ObjectConstructionConfig
     pipeline: ObjectConstructionConfig
+    dataloader: ObjectConstructionConfig
     metrics: dict[str, ObjectConstructionConfig]


+class EvaluatorConfig(BaseModel):
+    """
+    Schema for the evaluator config.
+    """
+
+    evaluation: EvaluationConfig
+    evaluator: dict | None = None
+
+
 class Evaluator(WithConstructionConfig):
     """
     Evaluator class.
     """

-…
+    def __init__(self, batch_size: int = 10) -> None:
+        """
+        Initialize the evaluator.
+
+        Args:
+            batch_size: batch size for the evaluation pipeline inference.
+        """
+        self.batch_size = batch_size

     @classmethod
     async def run_from_config(cls, config: dict) -> dict:
         """
-…
+        Run the evaluation based on configuration.

         Args:
             config: Evaluation config.
@@ -40,12 +56,14 @@ class Evaluator(WithConstructionConfig):
         Returns:
             The evaluation results.
         """
-…
-…
-        pipeline: EvaluationPipeline = EvaluationPipeline.subclass_from_config(…
-…
-…
-…
+        evaluator_config = EvaluatorConfig.model_validate(config)
+        evaluation_config = EvaluationConfig.model_validate(evaluator_config.evaluation)
+        pipeline: EvaluationPipeline = EvaluationPipeline.subclass_from_config(evaluation_config.pipeline)
+        dataloader: DataLoader = DataLoader.subclass_from_config(evaluation_config.dataloader)
+        metrics: MetricSet = MetricSet.from_config(evaluation_config.metrics)
+
+        evaluator = cls.from_config(evaluator_config.evaluator or {})
+        return await evaluator.compute(
             pipeline=pipeline,
             dataloader=dataloader,
             metrics=metrics,
@@ -53,9 +71,9 @@ class Evaluator(WithConstructionConfig):

     async def compute(
         self,
-        pipeline: EvaluationPipeline,
-        dataloader: DataLoader,
-        metrics: MetricSet,
+        pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
+        dataloader: DataLoader[EvaluationDataT],
+        metrics: MetricSet[EvaluationResultT],
     ) -> dict:
         """
         Compute the evaluation results for the given pipeline and data.
@@ -83,9 +101,9 @@ class Evaluator(WithConstructionConfig):

     async def _call_pipeline(
         self,
-        pipeline: EvaluationPipeline,
-        dataset: Iterable[…
-    ) -> tuple[list[…
+        pipeline: EvaluationPipeline[EvaluationTargetT, EvaluationDataT, EvaluationResultT],
+        dataset: Iterable[EvaluationDataT],
+    ) -> tuple[list[EvaluationResultT], dict]:
         """
         Call the pipeline with the given data.

@@ -96,9 +114,9 @@ class Evaluator(WithConstructionConfig):
         Returns:
             The evaluation results and performance metrics.
         """
-        semaphore = asyncio.Semaphore(self.…
+        semaphore = asyncio.Semaphore(self.batch_size)

-        async def _call_pipeline_with_semaphore(data: …
+        async def _call_pipeline_with_semaphore(data: EvaluationDataT) -> EvaluationResultT:
             async with semaphore:
                 return await pipeline(data)

@@ -109,7 +127,7 @@ class Evaluator(WithConstructionConfig):
         return pipe_outputs, self._compute_time_perf(start_time, end_time, len(pipe_outputs))

     @staticmethod
-    def _results_processor(results: list[…
+    def _results_processor(results: list[EvaluationResultT]) -> dict:
         """
         Process the results.

@@ -122,7 +140,7 @@ class Evaluator(WithConstructionConfig):
         return {"results": [asdict(result) for result in results]}

     @staticmethod
-    def _compute_metrics(metrics: MetricSet, results: list[…
+    def _compute_metrics(metrics: MetricSet[EvaluationResultT], results: list[EvaluationResultT]) -> dict:
         """
         Compute a metric using the given inputs.

{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/factories/__init__.py
RENAMED
@@ -3,10 +3,12 @@ import asyncio
 from datasets import load_dataset

 from ragbits.core.embeddings.dense import LiteLLMEmbedder
+from ragbits.core.sources.hf import HuggingFaceSource
 from ragbits.core.utils.config_handling import ObjectConstructionConfig
 from ragbits.core.vector_stores.in_memory import InMemoryVectorStore
 from ragbits.document_search import DocumentSearch
 from ragbits.document_search.documents.document import DocumentMeta
+from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader
 from ragbits.evaluate.metrics import MetricSet

 DS_PRECISION_RECALL_F1 = {
@@ -27,7 +29,9 @@ DS_PRECISION_RECALL_F1 = {


 def precision_recall_f1() -> MetricSet:
-    """…
+    """
+    Factory of precision recall f1 metric set for retrival evaluation.
+    """
     return MetricSet.from_config(config=DS_PRECISION_RECALL_F1)


@@ -38,7 +42,16 @@ async def _add_example_documents(document_search: DocumentSearch) -> None:


 def basic_document_search_factory() -> DocumentSearch:
-    """…
+    """
+    Factory for basic example document search instance.
+    """
     document_search = DocumentSearch(vector_store=InMemoryVectorStore(embedder=LiteLLMEmbedder()))
     asyncio.run(_add_example_documents(document_search))
     return document_search
+
+
+def synthetic_rag_dataset() -> DocumentSearchDataLoader:
+    """
+    Factory for synthetic RAG dataset.
+    """
+    return DocumentSearchDataLoader(source=HuggingFaceSource(path="deepsense-ai/synthetic-rag-dataset_v1.0"))
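These factories can be wired straight into the evaluator. A short sketch, assuming `get_evaluation_pipeline_for_target` (used the same way by the CLI callback) picks the matching evaluation pipeline for a `DocumentSearch` target:

```python
import asyncio

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.factories import (
    basic_document_search_factory,
    precision_recall_f1,
    synthetic_rag_dataset,
)
from ragbits.evaluate.pipelines import get_evaluation_pipeline_for_target

# Build the evaluation target and wrap it in the matching pipeline, as the CLI does.
pipeline = get_evaluation_pipeline_for_target(basic_document_search_factory())

results = asyncio.run(
    Evaluator(batch_size=10).compute(
        pipeline=pipeline,
        dataloader=synthetic_rag_dataset(),
        metrics=precision_recall_f1(),
    )
)
print(results["metrics"], results["time_perf"])
```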
{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/metrics/base.py
RENAMED
@@ -1,19 +1,22 @@
 from abc import ABC, abstractmethod
-from …
+from types import ModuleType
+from typing import ClassVar, Generic

 from typing_extensions import Self

 from ragbits.core.utils.config_handling import WithConstructionConfig
-from ragbits.evaluate…
+from ragbits.evaluate import metrics
+from ragbits.evaluate.pipelines.base import EvaluationResultT

-ResultT = TypeVar("ResultT", bound=EvaluationResult)

-
-class Metric(WithConstructionConfig, Generic[ResultT], ABC):
+class Metric(WithConstructionConfig, Generic[EvaluationResultT], ABC):
     """
     Base class for metrics.
     """

+    default_module: ClassVar[ModuleType | None] = metrics
+    configuration_key: ClassVar[str] = "metric"
+
     def __init__(self, weight: float = 1.0) -> None:
         """
         Initializes the metric.
@@ -25,7 +28,7 @@ class Metric(WithConstructionConfig, Generic[ResultT], ABC):
         self.weight = weight

     @abstractmethod
-    def compute(self, results: list[…
+    def compute(self, results: list[EvaluationResultT]) -> dict:
         """
         Compute the metric.

@@ -37,16 +40,17 @@ class Metric(WithConstructionConfig, Generic[ResultT], ABC):
         """


-class MetricSet(WithConstructionConfig, Generic[…
+class MetricSet(WithConstructionConfig, Generic[EvaluationResultT]):
     """
     Represents a set of metrics.
     """

-    configuration_key = "metrics"
+    configuration_key: ClassVar[str] = "metrics"
+    default_module: ClassVar[ModuleType | None] = metrics

-    def __init__(self, *metrics: Metric[…
+    def __init__(self, *metrics: Metric[EvaluationResultT]) -> None:
         """
-…
+        Initialize the metric set.

         Args:
             metrics: The metrics.
@@ -66,7 +70,7 @@ class MetricSet(WithConstructionConfig, Generic[ResultT]):
         """
         return cls(*[Metric.subclass_from_config(metric_config) for metric_config in config.values()])

-    def compute(self, results: list[…
+    def compute(self, results: list[EvaluationResultT]) -> dict:
         """
         Compute the metrics.

{ragbits_evaluate-0.16.0 → ragbits_evaluate-0.17.1}/src/ragbits/evaluate/metrics/document_search.py
RENAMED
@@ -19,7 +19,7 @@ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):

     def __init__(self, matching_strategy: MatchingStrategy, weight: float = 1.0) -> None:
         """
-…
+        Initialize the document search metric.

         Args:
             matching_strategy: Matching strategys that determine relevance.