ragbits-evaluate 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits_evaluate-0.2.0/.gitignore +97 -0
- ragbits_evaluate-0.2.0/CHANGELOG.md +13 -0
- ragbits_evaluate-0.2.0/PKG-INFO +27 -0
- ragbits_evaluate-0.2.0/README.md +1 -0
- ragbits_evaluate-0.2.0/pyproject.toml +60 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/__init__.py +0 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/evaluator.py +112 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/loaders/__init__.py +0 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/loaders/base.py +24 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/loaders/hf.py +25 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/metrics/__init__.py +0 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/metrics/base.py +78 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/metrics/document_search.py +68 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/pipelines/__init__.py +0 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/pipelines/base.py +40 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/pipelines/document_search.py +51 -0
- ragbits_evaluate-0.2.0/src/ragbits/evaluate/utils.py +91 -0
ragbits_evaluate-0.2.0/.gitignore
@@ -0,0 +1,97 @@
+# Directories
+.vscode/
+.idea/
+.neptune/
+.pytest_cache/
+.mypy_cache/
+venv/
+__pycache__/
+**.egg-info/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Sphinx documentation
+docs/_build/
+public/
+# autogenerated package license table
+docs/licenses_table.rst
+
+# license dump file
+licenses.txt
+
+# File formats
+*.onnx
+*.pyc
+*.pt
+*.pth
+*.pkl
+*.mar
+*.torchscript
+**/.ipynb_checkpoints
+**/dist/
+**/checkpoints/
+**/outputs/
+**/multirun/
+
+# Other env files
+.python-version
+pyvenv.cfg
+pip-selfcheck.json
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# dotenv
+.env
+
+# coverage and pytest reports
+coverage.xml
+report.xml
+
+# CMake
+cmake-build-*/
+
+# Terraform
+**/.terraform.lock.hcl
+**/.terraform
+
+# benchmarks
+benchmarks/sql/data/
+
+# mkdocs generated files
+site/
+
+# build artifacts
+dist/
ragbits_evaluate-0.2.0/CHANGELOG.md
@@ -0,0 +1,13 @@
+# CHANGELOG
+
+## Unreleased
+
+## 0.2.0 (2024-10-23)
+
+- Initial release of the package.
+- Evaluation pipeline framework with capability to define evaluators & metrics.
+- Evaluation pipeline for `ragbits-document-search`.
+
+### Changed
+
+- ragbits-core updated to version v0.2.0
ragbits_evaluate-0.2.0/PKG-INFO
@@ -0,0 +1,27 @@
+Metadata-Version: 2.3
+Name: ragbits-evaluate
+Version: 0.2.0
+Summary: Evaluation module for Ragbits components
+Author-email: "deepsense.ai" <ragbits@deepsense.ai>
+License-Expression: MIT
+Keywords: Evaluation,GenAI,Generative AI,LLMs,Large Language Models,RAG,Retrieval Augmented Generation
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Requires-Dist: hydra-core~=1.3.2
+Requires-Dist: neptune~=1.12.0
+Requires-Dist: ragbits-core==0.2.0
+Provides-Extra: relari
+Requires-Dist: continuous-eval~=0.3.12; extra == 'relari'
+Description-Content-Type: text/markdown
+
+# Ragbits Evaluate
ragbits_evaluate-0.2.0/README.md
@@ -0,0 +1 @@
+# Ragbits Evaluate
ragbits_evaluate-0.2.0/pyproject.toml
@@ -0,0 +1,60 @@
+[project]
+name = "ragbits-evaluate"
+version = "0.2.0"
+description = "Evaluation module for Ragbits components"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [
+    { name = "deepsense.ai", email = "ragbits@deepsense.ai"}
+]
+keywords = [
+    "Retrieval Augmented Generation",
+    "RAG",
+    "Large Language Models",
+    "LLMs",
+    "Generative AI",
+    "GenAI",
+    "Evaluation"
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = ["hydra-core~=1.3.2", "neptune~=1.12.0", "ragbits-core==0.2.0"]
+
+[project.optional-dependencies]
+relari = [
+    "continuous-eval~=0.3.12",
+]
+
+[tool.uv]
+dev-dependencies = [
+    "pre-commit~=3.8.0",
+    "pytest~=8.3.3",
+    "pytest-cov~=5.0.0",
+    "pytest-asyncio~=0.24.0",
+    "pip-licenses>=4.0.0,<5.0.0"
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/ragbits"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
ragbits_evaluate-0.2.0/src/ragbits/evaluate/__init__.py
File without changes
ragbits_evaluate-0.2.0/src/ragbits/evaluate/evaluator.py
@@ -0,0 +1,112 @@
+import time
+from dataclasses import asdict
+from typing import Any, Iterable
+
+from tqdm.asyncio import tqdm
+
+from ragbits.evaluate.loaders.base import DataLoader
+from ragbits.evaluate.metrics.base import MetricSet
+from ragbits.evaluate.pipelines.base import EvaluationPipeline, EvaluationResult
+
+
+class Evaluator:
+    """
+    Evaluator class.
+    """
+
+    async def compute(
+        self,
+        pipeline: EvaluationPipeline,
+        dataloader: DataLoader,
+        metrics: MetricSet,
+    ) -> dict[str, Any]:
+        """
+        Compute the evaluation results for the given pipeline and data.
+
+        Args:
+            pipeline: The pipeline to be evaluated.
+            dataloader: The dataloader to load the data.
+            metrics: The metrics to be computed.
+
+        Returns:
+            The evaluation results.
+        """
+        dataset = await dataloader.load()
+        results, perf_results = await self._call_pipeline(pipeline, dataset)
+        computed_metrics = self._compute_metrics(metrics, results)
+        processed_results = self._results_processor(results)
+
+        return {
+            **perf_results,
+            **computed_metrics,
+            **processed_results,
+        }
+
+    async def _call_pipeline(
+        self,
+        pipeline: EvaluationPipeline,
+        dataset: Iterable,
+    ) -> tuple[list[EvaluationResult], dict[str, Any]]:
+        """
+        Call the pipeline with the given data.
+
+        Args:
+            pipeline: The pipeline to be called.
+            data: The evaluation data.
+
+        Returns:
+            The evaluation results and performance metrics.
+        """
+        start_time = time.perf_counter()
+        pipe_outputs = await tqdm.gather(*[pipeline(data) for data in dataset], desc="Evaluation")
+        end_time = time.perf_counter()
+        return pipe_outputs, self._compute_time_perf(start_time, end_time, len(pipe_outputs))
+
+    def _results_processor(self, results: list[EvaluationResult]) -> dict[str, Any]:
+        """
+        Process the results.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The processed results.
+        """
+        return {"results": [asdict(result) for result in results]}
+
+    def _compute_metrics(self, metrics: MetricSet, results: list[EvaluationResult]) -> dict[str, Any]:
+        """
+        Compute a metric using the given inputs.
+
+        Args:
+            metrics: The metrics to be computed.
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        return {"metrics": metrics.compute(results)}
+
+    def _compute_time_perf(self, start_time: float, end_time: float, num_samples: int) -> dict[str, Any]:
+        """
+        Compute the performance metrics.
+
+        Args:
+            start_time: The start time.
+            end_time: The end time.
+            num_samples: The number of samples.
+
+        Returns:
+            The performance metrics.
+        """
+        latency = end_time - start_time
+        throughput = num_samples / latency
+        latency_sample = 1.0 / throughput if throughput > 0 else 0.0
+
+        return {
+            "time_perf": {
+                "total_time_in_seconds": latency,
+                "samples_per_second": throughput,
+                "latency_in_seconds": latency_sample,
+            },
+        }
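Evaluator.compute is the orchestration point: it awaits the loader, fans the pipeline out over the dataset with tqdm.gather, then merges the timing, metric and raw-result dictionaries. A minimal calling sketch; the three arguments are whatever concrete DataLoader, EvaluationPipeline and initialized MetricSet you use (see the modules later in this diff):

from typing import Any

from ragbits.evaluate.evaluator import Evaluator


async def run_evaluation(pipeline, dataloader, metrics) -> dict[str, Any]:
    # pipeline: an EvaluationPipeline, dataloader: a DataLoader, metrics: an initialized MetricSet.
    results = await Evaluator().compute(pipeline=pipeline, dataloader=dataloader, metrics=metrics)
    # The returned dict merges three groups of keys:
    #   "time_perf" - total time, samples per second and per-sample latency
    #   "metrics"   - the aggregated MetricSet output
    #   "results"   - each EvaluationResult serialized with dataclasses.asdict()
    return results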
ragbits_evaluate-0.2.0/src/ragbits/evaluate/loaders/__init__.py
File without changes
ragbits_evaluate-0.2.0/src/ragbits/evaluate/loaders/base.py
@@ -0,0 +1,24 @@
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from omegaconf import DictConfig
+
+DataT = TypeVar("DataT")
+
+
+class DataLoader(Generic[DataT], ABC):
+    """
+    Data loader.
+    """
+
+    def __init__(self, config: DictConfig) -> None:
+        self.config = config
+
+    @abstractmethod
+    async def load(self) -> DataT:
+        """
+        Load the data.
+
+        Returns:
+            The loaded data.
+        """
ragbits_evaluate-0.2.0/src/ragbits/evaluate/loaders/hf.py
@@ -0,0 +1,25 @@
+from typing import TypeAlias
+
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict, load_dataset
+
+from ragbits.evaluate.loaders.base import DataLoader
+
+HFData: TypeAlias = DatasetDict | Dataset | IterableDatasetDict | IterableDataset
+
+
+class HFDataLoader(DataLoader[HFData]):
+    """
+    Hugging Face data loader.
+    """
+
+    async def load(self) -> HFData:
+        """
+        Load the data from Hugging Face.
+
+        Returns:
+            The loaded data.
+        """
+        return load_dataset(
+            path=self.config.path,
+            split=self.config.split,
+        )
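HFDataLoader takes its dataset location from the DictConfig it was constructed with (config.path and config.split). A minimal sketch, where the dataset id is a placeholder rather than anything shipped with ragbits:

import asyncio

from omegaconf import DictConfig

from ragbits.evaluate.loaders.hf import HFDataLoader

# "my-org/my-eval-dataset" is a hypothetical Hugging Face dataset id used for illustration.
loader = HFDataLoader(DictConfig({"path": "my-org/my-eval-dataset", "split": "test"}))
dataset = asyncio.run(loader.load())  # with a concrete split this returns a datasets.Dataset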
ragbits_evaluate-0.2.0/src/ragbits/evaluate/metrics/__init__.py
File without changes
ragbits_evaluate-0.2.0/src/ragbits/evaluate/metrics/base.py
@@ -0,0 +1,78 @@
+from abc import ABC, abstractmethod
+from typing import Any, Generic, Optional, TypeVar
+
+from omegaconf import DictConfig
+from typing_extensions import Self
+
+from ragbits.evaluate.pipelines.base import EvaluationResult
+
+ResultT = TypeVar("ResultT", bound=EvaluationResult)
+
+
+class Metric(Generic[ResultT], ABC):
+    """
+    Base class for metrics.
+    """
+
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the metric.
+
+        Args:
+            config: The metric configuration.
+        """
+        super().__init__()
+        self.config = getattr(config, self.__class__.__name__, DictConfig({}))
+
+    @abstractmethod
+    def compute(self, results: list[ResultT]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+
+
+class MetricSet(Generic[ResultT]):
+    """
+    Represents a set of metrics.
+    """
+
+    def __init__(self, *metrics: type[Metric[ResultT]]) -> None:
+        """
+        Initializes the metric set.
+
+        Args:
+            metrics: The metrics.
+        """
+        self._metrics = metrics
+        self.metrics: list[Metric[ResultT]] = []
+
+    def __call__(self, config: Optional[DictConfig] = None) -> Self:
+        """
+        Initializes the metrics.
+
+        Args:
+            config: The configuration for the metrics.
+
+        Returns:
+            The initialized metric set.
+        """
+        self.metrics = [metric(config) for metric in self._metrics]
+        return self
+
+    def compute(self, results: list[ResultT]) -> dict[str, Any]:
+        """
+        Compute the metrics.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metrics.
+        """
+        return {name: value for metric in self.metrics for name, value in metric.compute(results).items()}
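Because Metric.__init__ pulls its own section out of the shared configuration by class name, a custom metric only has to implement compute. A sketch of a hypothetical metric (it reuses the DocumentSearchResult dataclass defined later in this diff) registered in a MetricSet:

from typing import Any

from omegaconf import DictConfig

from ragbits.evaluate.metrics.base import Metric, MetricSet
from ragbits.evaluate.pipelines.document_search import DocumentSearchResult


class PredictedPassageCount(Metric[DocumentSearchResult]):
    """Hypothetical metric: average number of passages retrieved per question."""

    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
        counts = [len(result.predicted_passages) for result in results]
        return {"avg_predicted_passages": sum(counts) / len(counts) if counts else 0.0}


# MetricSet is parametrized with classes and instantiated by calling it with a config;
# the "PredictedPassageCount" section is what this metric's __init__ will read.
metrics = MetricSet(PredictedPassageCount)(DictConfig({"PredictedPassageCount": {}}))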
ragbits_evaluate-0.2.0/src/ragbits/evaluate/metrics/document_search.py
@@ -0,0 +1,68 @@
+import importlib
+from abc import ABC
+from typing import Any, Optional
+
+from continuous_eval.metrics.retrieval import PrecisionRecallF1, RankedRetrievalMetrics
+from omegaconf import DictConfig
+
+from ragbits.evaluate.metrics.base import Metric, MetricSet
+from ragbits.evaluate.pipelines.document_search import DocumentSearchResult
+
+
+class DocumentSearchMetric(Metric[DocumentSearchResult], ABC):
+    """
+    Metric for document search evaluation based on Relari backend.
+    More details can be found [here](https://docs.relari.ai/category/retrieval-rag).
+    """
+
+    metric_cls: type[PrecisionRecallF1 | RankedRetrievalMetrics]
+
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the metric.
+
+        Args:
+            config: The metric configuration.
+        """
+        super().__init__(config)
+
+        matching_strategy = getattr(
+            importlib.import_module("continuous_eval.metrics.retrieval.matching_strategy"),
+            self.config.matching_strategy,
+        )
+        self.metric = self.metric_cls(matching_strategy(**self.config.options))
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        return self.metric.aggregate(
+            [self.metric(result.predicted_passages, result.reference_passages) for result in results]
+        )
+
+
+class DocumentSearchPrecisionRecallF1(DocumentSearchMetric):
+    """
+    Precision, recall, and F1 score for context retrieval.
+    More details can be found [here](https://docs.relari.ai/metrics/Retrieval/Deterministic/precision_recall).
+    """
+
+    metric_cls = PrecisionRecallF1
+
+
+class DocumentSearchRankedRetrievalMetrics(DocumentSearchMetric):
+    """
+    Rank-aware metrics takes into account the order in which the contexts are retrieved.
+    More details can be found [here](https://docs.relari.ai/metrics/Retrieval/Deterministic/rank_aware_metrics).
+    """
+
+    metric_cls = RankedRetrievalMetrics
+
+
+document_search_metrics = MetricSet(DocumentSearchPrecisionRecallF1, DocumentSearchRankedRetrievalMetrics)
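Each Relari-backed metric class expects its own configuration section containing a matching_strategy class name (resolved from continuous_eval.metrics.retrieval.matching_strategy) plus the keyword options for that strategy. A configuration sketch; RougeChunkMatch and its threshold option are assumed examples, so check the installed continuous-eval version for the exact names:

from omegaconf import DictConfig

from ragbits.evaluate.metrics.document_search import document_search_metrics

# Assumed strategy name and option; valid values come from continuous-eval's
# continuous_eval.metrics.retrieval.matching_strategy module.
relari_section = {"matching_strategy": "RougeChunkMatch", "options": {"threshold": 0.5}}

metrics = document_search_metrics(
    DictConfig(
        {
            "DocumentSearchPrecisionRecallF1": relari_section,
            "DocumentSearchRankedRetrievalMetrics": relari_section,
        }
    )
)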
ragbits_evaluate-0.2.0/src/ragbits/evaluate/pipelines/__init__.py
File without changes
ragbits_evaluate-0.2.0/src/ragbits/evaluate/pipelines/base.py
@@ -0,0 +1,40 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Optional
+
+from omegaconf import DictConfig
+
+
+@dataclass
+class EvaluationResult(ABC):
+    """
+    Represents the result of a single evaluation.
+    """
+
+
+class EvaluationPipeline(ABC):
+    """
+    Collection evaluation pipeline.
+    """
+
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the evaluation pipeline.
+
+        Args:
+            config: The evaluation pipeline configuration.
+        """
+        super().__init__()
+        self.config = config or DictConfig({})
+
+    @abstractmethod
+    async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
+        """
+        Runs the evaluation pipeline.
+
+        Args:
+            data: The evaluation data.
+
+        Returns:
+            The evaluation result.
+        """
ragbits_evaluate-0.2.0/src/ragbits/evaluate/pipelines/document_search.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass
+from functools import cached_property
+
+from ragbits.document_search import DocumentSearch
+from ragbits.document_search.documents.element import TextElement
+from ragbits.evaluate.pipelines.base import EvaluationPipeline, EvaluationResult
+
+
+@dataclass
+class DocumentSearchResult(EvaluationResult):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    reference_passages: list[str]
+    predicted_passages: list[str]
+
+
+class DocumentSearchPipeline(EvaluationPipeline):
+    """
+    Document search evaluation pipeline.
+    """
+
+    @cached_property
+    def document_search(self) -> "DocumentSearch":
+        """
+        Returns the document search instance.
+
+        Returns:
+            The document search instance.
+        """
+        return DocumentSearch.from_config(self.config)  # type: ignore
+
+    async def __call__(self, data: dict) -> DocumentSearchResult:
+        """
+        Runs the document search evaluation pipeline.
+
+        Args:
+            data: The evaluation data.
+
+        Returns:
+            The evaluation result.
+        """
+        elements = await self.document_search.search(data["question"])
+        predicted_passages = [element.content for element in elements if isinstance(element, TextElement)]
+        return DocumentSearchResult(
+            question=data["question"],
+            reference_passages=data["passages"],
+            predicted_passages=predicted_passages,
+        )
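The pipeline expects each dataset record to provide a "question" and its reference "passages"; the retrieval side is whatever DocumentSearch.from_config builds from the pipeline's config. An end-to-end sketch combining the pieces above, with placeholder dataset, metric and document-search configuration:

import asyncio

from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders.hf import HFDataLoader
from ragbits.evaluate.metrics.document_search import document_search_metrics
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline

# Placeholders: the dataset must expose "question" and "passages" columns, the pipeline
# config must be valid for DocumentSearch.from_config(), and the matching strategy is an
# assumed continuous-eval name.
relari_section = {"matching_strategy": "RougeChunkMatch", "options": {"threshold": 0.5}}
metrics_config = DictConfig(
    {
        "DocumentSearchPrecisionRecallF1": relari_section,
        "DocumentSearchRankedRetrievalMetrics": relari_section,
    }
)


async def main() -> None:
    results = await Evaluator().compute(
        pipeline=DocumentSearchPipeline(DictConfig({})),  # fill in a DocumentSearch config
        dataloader=HFDataLoader(DictConfig({"path": "my-org/my-eval-dataset", "split": "test"})),
        metrics=document_search_metrics(metrics_config),
    )
    print(results["metrics"])
    print(results["time_perf"])


asyncio.run(main())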
ragbits_evaluate-0.2.0/src/ragbits/evaluate/utils.py
@@ -0,0 +1,91 @@
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from hydra.core.hydra_config import HydraConfig
+from neptune import Run
+from neptune.utils import stringify_unsupported
+from omegaconf import DictConfig
+
+
+def _save(file_path: Path, **data: Any) -> None:
+    """
+    Save the data to a file. Add the current timestamp and Python version to the data.
+
+    Args:
+        file_path: The path to the file.
+        data: The data to be saved.
+    """
+    current_time = datetime.now()
+
+    data["_timestamp"] = current_time.isoformat()
+    data["_python_version"] = sys.version
+    data["_interpreter_path"] = sys.executable
+
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(data, file, indent=4)
+
+
+def log_to_file(results: dict[str, Any], output_dir: Path | None = None) -> Path:
+    """
+    Log the evaluation results locally.
+
+    Args:
+        results: The evaluation results.
+        output_dir: The output directory.
+
+    Returns:
+        The output directory.
+    """
+    output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+    metrics_file = output_dir / "metrics.json"
+    results_file = output_dir / "results.json"
+
+    _save(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
+    _save(results_file, results=results["results"])
+
+    return output_dir
+
+
+def setup_neptune(config: DictConfig) -> Run | None:
+    """
+    Setup the Neptune run.
+
+    Args:
+        config: The Hydra configuration.
+
+    Returns:
+        The Neptune run.
+    """
+    if config.neptune.run:
+        run = Run(
+            project=config.neptune.project,
+            tags=[
+                config.task.type,
+                config.task.name,
+                config.data.name,
+            ],
+        )
+        run["config"] = stringify_unsupported(config)
+        return run
+    return None
+
+
+def log_to_neptune(run: Run, results: dict[str, Any], output_dir: Path | None = None) -> None:
+    """
+    Log the evaluation results to Neptune.
+
+    Args:
+        run: The Neptune run.
+        results: The evaluation results.
+        output_dir: The output directory.
+    """
+    output_dir = output_dir or Path(HydraConfig.get().runtime.output_dir)
+
+    run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
+    run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
+    run["evaluation/results"] = stringify_unsupported(results["results"])
+    run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix())
+    run["evaluation/results.json"].upload((output_dir / "results.json").as_posix())
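These helpers assume a Hydra-managed run: log_to_file and log_to_neptune default to Hydra's runtime output directory, and setup_neptune reads neptune.run, neptune.project, task and data entries from the config. A sketch of how they could be wired into a Hydra entry point; the config path/name and the config layout (config.pipeline, config.data, config.metrics, config.task) are hypothetical:

import asyncio

import hydra
from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders.hf import HFDataLoader
from ragbits.evaluate.metrics.document_search import document_search_metrics
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
from ragbits.evaluate.utils import log_to_file, log_to_neptune, setup_neptune


@hydra.main(config_path="config", config_name="evaluation", version_base=None)
def main(config: DictConfig) -> None:
    # Hypothetical config layout: config.data feeds the loader (and data.name tags Neptune),
    # config.pipeline feeds DocumentSearch, config.metrics holds the per-metric sections.
    run = setup_neptune(config)  # returns a Run only when config.neptune.run is truthy
    results = asyncio.run(
        Evaluator().compute(
            pipeline=DocumentSearchPipeline(config.pipeline),
            dataloader=HFDataLoader(config.data),
            metrics=document_search_metrics(config.metrics),
        )
    )
    output_dir = log_to_file(results)  # writes metrics.json and results.json to the Hydra output dir
    if run:
        log_to_neptune(run, results, output_dir)


if __name__ == "__main__":
    main()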