ragbits-evaluate 0.2.0__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,97 @@
+ # Directories
+ .vscode/
+ .idea/
+ .neptune/
+ .pytest_cache/
+ .mypy_cache/
+ venv/
+ __pycache__/
+ **.egg-info/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Sphinx documentation
+ docs/_build/
+ public/
+ # autogenerated package license table
+ docs/licenses_table.rst
+
+ # license dump file
+ licenses.txt
+
+ # File formats
+ *.onnx
+ *.pyc
+ *.pt
+ *.pth
+ *.pkl
+ *.mar
+ *.torchscript
+ **/.ipynb_checkpoints
+ **/dist/
+ **/checkpoints/
+ **/outputs/
+ **/multirun/
+
+ # Other env files
+ .python-version
+ pyvenv.cfg
+ pip-selfcheck.json
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *,cover
+ .hypothesis/
+
+ # dotenv
+ .env
+
+ # coverage and pytest reports
+ coverage.xml
+ report.xml
+
+ # CMake
+ cmake-build-*/
+
+ # Terraform
+ **/.terraform.lock.hcl
+ **/.terraform
+
+ # benchmarks
+ benchmarks/sql/data/
+
+ # mkdocs generated files
+ site/
+
+ # build artifacts
+ dist/
@@ -0,0 +1,13 @@
+ # CHANGELOG
+
+ ## Unreleased
+
+ ## 0.2.0 (2024-10-23)
+
+ - Initial release of the package.
+ - Evaluation pipeline framework with capability to define evaluators & metrics.
+ - Evaluation pipeline for `ragbits-document-search`.
+
+ ### Changed
+
+ - ragbits-core updated to version v0.2.0
@@ -0,0 +1,27 @@
+ Metadata-Version: 2.3
+ Name: ragbits-evaluate
+ Version: 0.2.0
+ Summary: Evaluation module for Ragbits components
+ Author-email: "deepsense.ai" <ragbits@deepsense.ai>
+ License-Expression: MIT
+ Keywords: Evaluation,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: hydra-core~=1.3.2
+ Requires-Dist: neptune~=1.12.0
+ Requires-Dist: ragbits-core==0.2.0
+ Provides-Extra: relari
+ Requires-Dist: continuous-eval~=0.3.12; extra == 'relari'
+ Description-Content-Type: text/markdown
+
+ # Ragbits Evaluate
@@ -0,0 +1 @@
+ # Ragbits Evaluate
@@ -0,0 +1,60 @@
+ [project]
+ name = "ragbits-evaluate"
+ version = "0.2.0"
+ description = "Evaluation module for Ragbits components"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = "MIT"
+ authors = [
+     { name = "deepsense.ai", email = "ragbits@deepsense.ai"}
+ ]
+ keywords = [
+     "Retrieval Augmented Generation",
+     "RAG",
+     "Large Language Models",
+     "LLMs",
+     "Generative AI",
+     "GenAI",
+     "Evaluation"
+ ]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Environment :: Console",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Natural Language :: English",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Software Development :: Libraries :: Python Modules",
+ ]
+ dependencies = ["hydra-core~=1.3.2", "neptune~=1.12.0", "ragbits-core==0.2.0"]
+
+ [project.optional-dependencies]
+ relari = [
+     "continuous-eval~=0.3.12",
+ ]
+
+ [tool.uv]
+ dev-dependencies = [
+     "pre-commit~=3.8.0",
+     "pytest~=8.3.3",
+     "pytest-cov~=5.0.0",
+     "pytest-asyncio~=0.24.0",
+     "pip-licenses>=4.0.0,<5.0.0"
+ ]
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.metadata]
+ allow-direct-references = true
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/ragbits"]
+
+ [tool.pytest.ini_options]
+ asyncio_mode = "auto"
@@ -0,0 +1,112 @@
+ import time
+ from dataclasses import asdict
+ from typing import Any, Iterable
+
+ from tqdm.asyncio import tqdm
+
+ from ragbits.evaluate.loaders.base import DataLoader
+ from ragbits.evaluate.metrics.base import MetricSet
+ from ragbits.evaluate.pipelines.base import EvaluationPipeline, EvaluationResult
+
+
+ class Evaluator:
+     """
+     Evaluator class.
+     """
+
+     async def compute(
+         self,
+         pipeline: EvaluationPipeline,
+         dataloader: DataLoader,
+         metrics: MetricSet,
+     ) -> dict[str, Any]:
+         """
+         Compute the evaluation results for the given pipeline and data.
+
+         Args:
+             pipeline: The pipeline to be evaluated.
+             dataloader: The dataloader to load the data.
+             metrics: The metrics to be computed.
+
+         Returns:
+             The evaluation results.
+         """
+         dataset = await dataloader.load()
+         results, perf_results = await self._call_pipeline(pipeline, dataset)
+         computed_metrics = self._compute_metrics(metrics, results)
+         processed_results = self._results_processor(results)
+
+         return {
+             **perf_results,
+             **computed_metrics,
+             **processed_results,
+         }
+
+     async def _call_pipeline(
+         self,
+         pipeline: EvaluationPipeline,
+         dataset: Iterable,
+     ) -> tuple[list[EvaluationResult], dict[str, Any]]:
+         """
+         Call the pipeline with the given data.
+
+         Args:
+             pipeline: The pipeline to be called.
+             dataset: The evaluation data.
+
+         Returns:
+             The evaluation results and performance metrics.
+         """
+         start_time = time.perf_counter()
+         pipe_outputs = await tqdm.gather(*[pipeline(data) for data in dataset], desc="Evaluation")
+         end_time = time.perf_counter()
+         return pipe_outputs, self._compute_time_perf(start_time, end_time, len(pipe_outputs))
+
+     def _results_processor(self, results: list[EvaluationResult]) -> dict[str, Any]:
+         """
+         Process the results.
+
+         Args:
+             results: The evaluation results.
+
+         Returns:
+             The processed results.
+         """
+         return {"results": [asdict(result) for result in results]}
+
+     def _compute_metrics(self, metrics: MetricSet, results: list[EvaluationResult]) -> dict[str, Any]:
+         """
+         Compute a metric using the given inputs.
+
+         Args:
+             metrics: The metrics to be computed.
+             results: The evaluation results.
+
+         Returns:
+             The computed metric.
+         """
+         return {"metrics": metrics.compute(results)}
+
+     def _compute_time_perf(self, start_time: float, end_time: float, num_samples: int) -> dict[str, Any]:
+         """
+         Compute the performance metrics.
+
+         Args:
+             start_time: The start time.
+             end_time: The end time.
+             num_samples: The number of samples.
+
+         Returns:
+             The performance metrics.
+         """
+         latency = end_time - start_time
+         throughput = num_samples / latency
+         latency_sample = 1.0 / throughput if throughput > 0 else 0.0
+
+         return {
+             "time_perf": {
+                 "total_time_in_seconds": latency,
+                 "samples_per_second": throughput,
+                 "latency_in_seconds": latency_sample,
+             },
+         }
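
The `Evaluator` above is the glue: `compute` awaits the loader, fans every sample through the pipeline concurrently via `tqdm.gather`, and merges timing, metric, and per-sample outputs into a single dictionary. A minimal usage sketch follows; the `ragbits.evaluate.evaluator` module path is assumed from the package layout, and the pipeline, loader, and metric set are placeholders supplied by the caller:

```python
import asyncio
from typing import Any

from ragbits.evaluate.evaluator import Evaluator  # module name assumed
from ragbits.evaluate.loaders.base import DataLoader
from ragbits.evaluate.metrics.base import MetricSet
from ragbits.evaluate.pipelines.base import EvaluationPipeline


async def run_evaluation(pipeline: EvaluationPipeline, dataloader: DataLoader, metrics: MetricSet) -> dict[str, Any]:
    # Loads the dataset, runs the pipeline over every sample concurrently,
    # then merges timing, metric, and per-sample outputs into one dict.
    return await Evaluator().compute(pipeline=pipeline, dataloader=dataloader, metrics=metrics)


# results = asyncio.run(run_evaluation(my_pipeline, my_loader, my_metrics))
# results["time_perf"]  -> total time, samples per second, per-sample latency
# results["metrics"]    -> aggregated metric values
# results["results"]    -> per-sample EvaluationResult dataclasses as dicts
```
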
@@ -0,0 +1,24 @@
+ from abc import ABC, abstractmethod
+ from typing import Generic, TypeVar
+
+ from omegaconf import DictConfig
+
+ DataT = TypeVar("DataT")
+
+
+ class DataLoader(Generic[DataT], ABC):
+     """
+     Data loader.
+     """
+
+     def __init__(self, config: DictConfig) -> None:
+         self.config = config
+
+     @abstractmethod
+     async def load(self) -> DataT:
+         """
+         Load the data.
+
+         Returns:
+             The loaded data.
+         """
@@ -0,0 +1,25 @@
+ from typing import TypeAlias
+
+ from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict, load_dataset
+
+ from ragbits.evaluate.loaders.base import DataLoader
+
+ HFData: TypeAlias = DatasetDict | Dataset | IterableDatasetDict | IterableDataset
+
+
+ class HFDataLoader(DataLoader[HFData]):
+     """
+     Hugging Face data loader.
+     """
+
+     async def load(self) -> HFData:
+         """
+         Load the data from Hugging Face.
+
+         Returns:
+             The loaded data.
+         """
+         return load_dataset(
+             path=self.config.path,
+             split=self.config.split,
+         )
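
`HFDataLoader.load` simply forwards `config.path` and `config.split` to `datasets.load_dataset`. A sketch of constructing it; the dataset identifier is a placeholder and the `ragbits.evaluate.loaders.hf` module path is assumed:

```python
from omegaconf import DictConfig

from ragbits.evaluate.loaders.hf import HFDataLoader  # module name assumed

# Placeholder dataset: any Hugging Face dataset identifier and split that
# yields the columns your pipeline expects will work the same way.
loader = HFDataLoader(DictConfig({"path": "org/retrieval-eval-dataset", "split": "test"}))

# dataset = await loader.load()  # within an async context
```
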
@@ -0,0 +1,78 @@
+ from abc import ABC, abstractmethod
+ from typing import Any, Generic, Optional, TypeVar
+
+ from omegaconf import DictConfig
+ from typing_extensions import Self
+
+ from ragbits.evaluate.pipelines.base import EvaluationResult
+
+ ResultT = TypeVar("ResultT", bound=EvaluationResult)
+
+
+ class Metric(Generic[ResultT], ABC):
+     """
+     Base class for metrics.
+     """
+
+     def __init__(self, config: Optional[DictConfig] = None) -> None:
+         """
+         Initializes the metric.
+
+         Args:
+             config: The metric configuration.
+         """
+         super().__init__()
+         self.config = getattr(config, self.__class__.__name__, DictConfig({}))
+
+     @abstractmethod
+     def compute(self, results: list[ResultT]) -> dict[str, Any]:
+         """
+         Compute the metric.
+
+         Args:
+             results: The evaluation results.
+
+         Returns:
+             The computed metric.
+         """
+
+
+ class MetricSet(Generic[ResultT]):
+     """
+     Represents a set of metrics.
+     """
+
+     def __init__(self, *metrics: type[Metric[ResultT]]) -> None:
+         """
+         Initializes the metric set.
+
+         Args:
+             metrics: The metrics.
+         """
+         self._metrics = metrics
+         self.metrics: list[Metric[ResultT]] = []
+
+     def __call__(self, config: Optional[DictConfig] = None) -> Self:
+         """
+         Initializes the metrics.
+
+         Args:
+             config: The configuration for the metrics.
+
+         Returns:
+             The initialized metric set.
+         """
+         self.metrics = [metric(config) for metric in self._metrics]
+         return self
+
+     def compute(self, results: list[ResultT]) -> dict[str, Any]:
+         """
+         Compute the metrics.
+
+         Args:
+             results: The evaluation results.
+
+         Returns:
+             The computed metrics.
+         """
+         return {name: value for metric in self.metrics for name, value in metric.compute(results).items()}
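
Two details worth noting above: `Metric.__init__` picks the sub-config named after the concrete class, and `MetricSet` is built from metric classes, instantiated later by calling the set with an optional config. A sketch of a custom metric wired into a set; the metric itself is hypothetical:

```python
from typing import Any

from ragbits.evaluate.metrics.base import Metric, MetricSet  # module name assumed
from ragbits.evaluate.pipelines.document_search import DocumentSearchResult


class ExactMatchRate(Metric[DocumentSearchResult]):
    """Hypothetical metric: share of questions with at least one exact passage hit."""

    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
        hits = sum(
            1 for result in results if set(result.predicted_passages) & set(result.reference_passages)
        )
        return {"exact_match_rate": hits / len(results) if results else 0.0}


# MetricSet takes metric *classes*; calling the set instantiates them with an
# optional DictConfig, and compute() merges the per-metric dictionaries.
custom_metrics = MetricSet(ExactMatchRate)()
# scores = custom_metrics.compute(results)  # results: list[DocumentSearchResult]
```
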
@@ -0,0 +1,68 @@
+ import importlib
+ from abc import ABC
+ from typing import Any, Optional
+
+ from continuous_eval.metrics.retrieval import PrecisionRecallF1, RankedRetrievalMetrics
+ from omegaconf import DictConfig
+
+ from ragbits.evaluate.metrics.base import Metric, MetricSet
+ from ragbits.evaluate.pipelines.document_search import DocumentSearchResult
+
+
+ class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
+     """
+     Metric for document search evaluation based on Relari backend.
+     More details can be found [here](https://docs.relari.ai/category/retrieval-rag).
+     """
+
+     metric_cls: type[PrecisionRecallF1 | RankedRetrievalMetrics]
+
+     def __init__(self, config: Optional[DictConfig] = None) -> None:
+         """
+         Initializes the metric.
+
+         Args:
+             config: The metric configuration.
+         """
+         super().__init__(config)
+
+         matching_strategy = getattr(
+             importlib.import_module("continuous_eval.metrics.retrieval.matching_strategy"),
+             self.config.matching_strategy,
+         )
+         self.metric = self.metric_cls(matching_strategy(**self.config.options))
+
+     def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+         """
+         Compute the metric.
+
+         Args:
+             results: The evaluation results.
+
+         Returns:
+             The computed metric.
+         """
+         return self.metric.aggregate(
+             [self.metric(result.predicted_passages, result.reference_passages) for result in results]
+         )
+
+
+ class DocumentSearchPrecisionRecallF1(DocumentSearchMetric):
+     """
+     Precision, recall, and F1 score for context retrieval.
+     More details can be found [here](https://docs.relari.ai/metrics/Retrieval/Deterministic/precision_recall).
+     """
+
+     metric_cls = PrecisionRecallF1
+
+
+ class DocumentSearchRankedRetrievalMetrics(DocumentSearchMetric):
+     """
+     Rank-aware metrics take into account the order in which the contexts are retrieved.
+     More details can be found [here](https://docs.relari.ai/metrics/Retrieval/Deterministic/rank_aware_metrics).
+     """
+
+     metric_cls = RankedRetrievalMetrics
+
+
+ document_search_metrics = MetricSet(DocumentSearchPrecisionRecallF1, DocumentSearchRankedRetrievalMetrics)
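
`DocumentSearchMetric` resolves `config.matching_strategy` to a class in `continuous_eval.metrics.retrieval.matching_strategy` and instantiates it with `config.options`, so the per-class config sections look roughly like the sketch below. `RougeChunkMatch` with a `threshold` is one plausible strategy from `continuous-eval`, shown here as an assumed example:

```python
from omegaconf import DictConfig

from ragbits.evaluate.metrics.document_search import document_search_metrics  # module name assumed

# Each top-level key matches a metric class name (see Metric.__init__); the
# strategy name and options are forwarded to continuous-eval.
config = DictConfig(
    {
        "DocumentSearchPrecisionRecallF1": {
            "matching_strategy": "RougeChunkMatch",
            "options": {"threshold": 0.5},
        },
        "DocumentSearchRankedRetrievalMetrics": {
            "matching_strategy": "RougeChunkMatch",
            "options": {"threshold": 0.5},
        },
    }
)
metrics = document_search_metrics(config)
```
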
@@ -0,0 +1,40 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any, Optional
+
+ from omegaconf import DictConfig
+
+
+ @dataclass
+ class EvaluationResult(ABC):
+     """
+     Represents the result of a single evaluation.
+     """
+
+
+ class EvaluationPipeline(ABC):
+     """
+     Collection evaluation pipeline.
+     """
+
+     def __init__(self, config: Optional[DictConfig] = None) -> None:
+         """
+         Initializes the evaluation pipeline.
+
+         Args:
+             config: The evaluation pipeline configuration.
+         """
+         super().__init__()
+         self.config = config or DictConfig({})
+
+     @abstractmethod
+     async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
+         """
+         Runs the evaluation pipeline.
+
+         Args:
+             data: The evaluation data.
+
+         Returns:
+             The evaluation result.
+         """
1
+ from dataclasses import dataclass
2
+ from functools import cached_property
3
+
4
+ from ragbits.document_search import DocumentSearch
5
+ from ragbits.document_search.documents.element import TextElement
6
+ from ragbits.evaluate.pipelines.base import EvaluationPipeline, EvaluationResult
7
+
8
+
9
+ @dataclass
10
+ class DocumentSearchResult(EvaluationResult):
11
+ """
12
+ Represents the result of a single evaluation.
13
+ """
14
+
15
+ question: str
16
+ reference_passages: list[str]
17
+ predicted_passages: list[str]
18
+
19
+
20
+ class DocumentSearchPipeline(EvaluationPipeline):
21
+ """
22
+ Document search evaluation pipeline.
23
+ """
24
+
25
+ @cached_property
26
+ def document_search(self) -> "DocumentSearch":
27
+ """
28
+ Returns the document search instance.
29
+
30
+ Returns:
31
+ The document search instance.
32
+ """
33
+ return DocumentSearch.from_config(self.config) # type: ignore
34
+
35
+ async def __call__(self, data: dict) -> DocumentSearchResult:
36
+ """
37
+ Runs the document search evaluation pipeline.
38
+
39
+ Args:
40
+ data: The evaluation data.
41
+
42
+ Returns:
43
+ The evaluation result.
44
+ """
45
+ elements = await self.document_search.search(data["question"])
46
+ predicted_passages = [element.content for element in elements if isinstance(element, TextElement)]
47
+ return DocumentSearchResult(
48
+ question=data["question"],
49
+ reference_passages=data["passages"],
50
+ predicted_passages=predicted_passages,
51
+ )
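
Putting the pieces together for document search: the dataset rows must carry `question` and `passages` fields, and the pipeline config is handed straight to `DocumentSearch.from_config`. A wiring sketch, with module paths assumed and both configs left to the caller (see the earlier config sketches):

```python
import asyncio
from typing import Any

from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator  # module names assumed throughout
from ragbits.evaluate.loaders.hf import HFDataLoader
from ragbits.evaluate.metrics.document_search import document_search_metrics
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline


async def evaluate_document_search(pipeline_config: DictConfig, metrics_config: DictConfig) -> dict[str, Any]:
    # Placeholder dataset: it must expose "question" and "passages" columns.
    loader = HFDataLoader(DictConfig({"path": "org/retrieval-eval-dataset", "split": "test"}))
    pipeline = DocumentSearchPipeline(pipeline_config)  # forwarded to DocumentSearch.from_config
    metrics = document_search_metrics(metrics_config)   # see the metrics config sketch above
    return await Evaluator().compute(pipeline=pipeline, dataloader=loader, metrics=metrics)


# results = asyncio.run(evaluate_document_search(pipeline_config, metrics_config))
```
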
@@ -0,0 +1,91 @@
+ import json
+ import sys
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ from hydra.core.hydra_config import HydraConfig
+ from neptune import Run
+ from neptune.utils import stringify_unsupported
+ from omegaconf import DictConfig
+
+
+ def _save(file_path: Path, **data: Any) -> None:
+     """
+     Save the data to a file. Add the current timestamp and Python version to the data.
+
+     Args:
+         file_path: The path to the file.
+         data: The data to be saved.
+     """
+     current_time = datetime.now()
+
+     data["_timestamp"] = current_time.isoformat()
+     data["_python_version"] = sys.version
+     data["_interpreter_path"] = sys.executable
+
+     with open(file_path, "w", encoding="utf-8") as file:
+         json.dump(data, file, indent=4)
+
+
+ def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path:
+     """
+     Log the evaluation results locally.
+
+     Args:
+         results: The evaluation results.
+         output_dir: The output directory.
+
+     Returns:
+         The output directory.
+     """
+     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+     metrics_file = output_dir / "metrics.json"
+     results_file = output_dir / "results.json"
+
+     _save(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
+     _save(results_file, results=results["results"])
+
+     return output_dir
+
+
+ def setup_neptune(config: DictConfig) -> Run | None:
+     """
+     Set up the Neptune run.
+
+     Args:
+         config: The Hydra configuration.
+
+     Returns:
+         The Neptune run.
+     """
+     if config.neptune.run:
+         run = Run(
+             project=config.neptune.project,
+             tags=[
+                 config.task.type,
+                 config.task.name,
+                 config.data.name,
+             ],
+         )
+         run["config"] = stringify_unsupported(config)
+         return run
+     return None
+
+
+ def log_to_neptune(run: Run, results: dict[str, Any], output_dir: Path | None = None) -> None:
+     """
+     Log the evaluation results to Neptune.
+
+     Args:
+         run: The Neptune run.
+         results: The evaluation results.
+         output_dir: The output directory.
+     """
+     output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+
+     run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
+     run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
+     run["evaluation/results"] = stringify_unsupported(results["results"])
+     run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix())
+     run["evaluation/results.json"].upload((output_dir / "results.json").as_posix())