retrievalbase 2.1.2__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/CHANGELOG.md +14 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/PKG-INFO +1 -1
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/pyproject.toml +1 -1
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/__init__.py +4 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/minio.py +10 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/parquet.py +4 -0
- retrievalbase-2.2.0/src/retrievalbase/utils.py +107 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/fixtures/components.py +3 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_connector/test_connectors.py +47 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_base_contracts.py +5 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_utils/test_utils.py +1 -2
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/uv.lock +1 -1
- retrievalbase-2.1.2/src/retrievalbase/utils.py +0 -182
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.gitignore +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.gitlab-ci.yml +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.pre-commit-config.yaml +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.releaserc.json +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/AGENTS.md +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/Makefile +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/README.md +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/codecov.yml +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/commitlint.config.cjs +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/settings.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/constants.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/hf.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/mixins.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/polars.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/preprocess.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/token_counter.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/settings.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/enums.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/async_batcher.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/embedders.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/evaluators.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/scores.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/processors.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/rerankers.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/dense/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/settings.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/vector_stores.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/exceptions.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/ingestion/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/ingestion/settings.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/mixins.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/py.typed +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/settings.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/types.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/conftest.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/fixtures/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/fixtures/data.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_dataset/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_dataset/test_huggingface_adapter.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_evaluation/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_evaluation/conftest.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_evaluation/test_python_evaluator.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_config/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_config/test_mixins.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_config/test_settings.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_connector/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/conftest.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_dataset_base_contracts.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_dataset_mixins_more.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_polars_dataset.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_polars_lazy_paths.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_preprocess_filters.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_token_counter_hf.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_token_counters.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/conftest.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_async_batcher.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_bm25_retriever.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_dense_retriever.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_embedders.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hf_reranker.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hybrid_retriever.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hybrid_retriever_runtime.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_processors.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_python_evaluator_classes.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_python_evaluator_runtime.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_rerankers.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_retriever_base.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_scores.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_vector_stores.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_ingestion/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_ingestion/test_text_ingestion_pipeline.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_utils/__init__.py +0 -0
- {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_utils/test_utils_connectors.py +0 -0
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
# [2.2.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.3...v2.2.0) (2026-05-22)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* **connector:** add target existence checks ([80e739f](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/80e739fddeca65a66d326c1bb673e1ff6a674ba6))
|
|
7
|
+
|
|
8
|
+
## [2.1.3](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.2...v2.1.3) (2026-05-19)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* resolve env vars loading errors ([061d549](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/061d549dea621dc2ab34b4af7f23c090f6d70350))
|
|
14
|
+
|
|
1
15
|
## [2.1.2](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.1...v2.1.2) (2026-05-19)
|
|
2
16
|
|
|
3
17
|
|
|
@@ -33,6 +33,10 @@ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
|
|
|
33
33
|
def to(self, ds: "Dataset[Any]") -> None:
|
|
34
34
|
raise NotImplementedError
|
|
35
35
|
|
|
36
|
+
@abstractmethod
|
|
37
|
+
def target_exists(self, target: str) -> bool:
|
|
38
|
+
raise NotImplementedError
|
|
39
|
+
|
|
36
40
|
def load(self) -> "Dataset[pl.DataFrame | pl.LazyFrame]":
|
|
37
41
|
from retrievalbase.dataset.polars import PolarsDataset
|
|
38
42
|
|
|
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
3
3
|
|
|
4
4
|
import polars as pl
|
|
5
5
|
from minio import Minio
|
|
6
|
+
from minio.error import S3Error
|
|
6
7
|
|
|
7
8
|
from retrievalbase.connector import DatasetConnector
|
|
8
9
|
from retrievalbase.connector.settings import MinioDatasetConnectorSettings
|
|
@@ -43,3 +44,12 @@ class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
|
|
|
43
44
|
length=buffer.getbuffer().nbytes,
|
|
44
45
|
content_type="application/octet-stream",
|
|
45
46
|
)
|
|
47
|
+
|
|
48
|
+
def target_exists(self, target: str) -> bool:
|
|
49
|
+
try:
|
|
50
|
+
self.client.stat_object(self.config.bucket, target)
|
|
51
|
+
except S3Error as error:
|
|
52
|
+
if error.code in {"NoSuchBucket", "NoSuchKey"}:
|
|
53
|
+
return False
|
|
54
|
+
raise
|
|
55
|
+
return True
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from typing import TYPE_CHECKING, Any
|
|
2
3
|
|
|
3
4
|
import polars as pl
|
|
@@ -18,3 +19,6 @@ class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings])
|
|
|
18
19
|
|
|
19
20
|
def to(self, ds: "Dataset[Any]") -> None:
|
|
20
21
|
ds.polars.write_parquet(self.config.path)
|
|
22
|
+
|
|
23
|
+
def target_exists(self, target: str) -> bool:
|
|
24
|
+
return Path(target).exists()
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
import yaml
|
|
7
|
+
from pydantic import SecretStr
|
|
8
|
+
|
|
9
|
+
from retrievalbase.mixins import FromConfigMixin
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from retrievalbase.connector.minio import MinioDatasetConnector
|
|
13
|
+
from retrievalbase.connector.parquet import ParquetDatasetConnector
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_class(path: str) -> Any:
|
|
17
|
+
module_path, class_name = path.rsplit(".", 1)
|
|
18
|
+
module = importlib.import_module(module_path)
|
|
19
|
+
return getattr(module, class_name)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_minio_connector(
|
|
23
|
+
bucket: str, key: str, endpoint: str, access_key: str, secret_key: str
|
|
24
|
+
) -> "MinioDatasetConnector":
|
|
25
|
+
from retrievalbase.connector.minio import MinioDatasetConnector
|
|
26
|
+
from retrievalbase.connector.settings import MinioDatasetConnectorSettings
|
|
27
|
+
|
|
28
|
+
config = MinioDatasetConnectorSettings(
|
|
29
|
+
module_path="",
|
|
30
|
+
endpoint=endpoint,
|
|
31
|
+
bucket=bucket,
|
|
32
|
+
key=key,
|
|
33
|
+
access_key=SecretStr(access_key),
|
|
34
|
+
secret_key=SecretStr(secret_key),
|
|
35
|
+
)
|
|
36
|
+
return MinioDatasetConnector(config)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _get_parquet_connector(path: str, *, lazy: bool) -> "ParquetDatasetConnector":
|
|
40
|
+
from retrievalbase.connector.parquet import ParquetDatasetConnector
|
|
41
|
+
from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
|
|
42
|
+
|
|
43
|
+
config = ParquetDatasetConnectorSettings(module_path="", path=path, lazy=lazy)
|
|
44
|
+
return ParquetDatasetConnector(config)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def extract_schema_columns(
|
|
48
|
+
schema: dict[str, pl.DataType] | list[pl.Field],
|
|
49
|
+
prefix: str = "",
|
|
50
|
+
) -> list[str]:
|
|
51
|
+
cols: list[str] = []
|
|
52
|
+
if isinstance(schema, dict):
|
|
53
|
+
items = schema.items()
|
|
54
|
+
else:
|
|
55
|
+
items = ((field.name, field.dtype) for field in schema) # type: ignore[assignment]
|
|
56
|
+
for name, dtype in items:
|
|
57
|
+
full_name = f"{prefix}.{name}" if prefix else name
|
|
58
|
+
|
|
59
|
+
if isinstance(dtype, pl.Struct):
|
|
60
|
+
nested = extract_schema_columns(dtype.fields, full_name)
|
|
61
|
+
cols.extend(nested)
|
|
62
|
+
else:
|
|
63
|
+
cols.append(full_name)
|
|
64
|
+
return cols
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def resolve_column(path: str) -> pl.Expr:
|
|
68
|
+
parts = path.split(".")
|
|
69
|
+
|
|
70
|
+
expr = pl.col(parts[0])
|
|
71
|
+
for p in parts[1:]:
|
|
72
|
+
expr = expr.struct.field(p)
|
|
73
|
+
|
|
74
|
+
return expr
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def build_schema(
|
|
78
|
+
df: pl.DataFrame,
|
|
79
|
+
schema: dict[str, str],
|
|
80
|
+
) -> pl.DataFrame:
|
|
81
|
+
exprs = []
|
|
82
|
+
|
|
83
|
+
for target_col, source_path in schema.items():
|
|
84
|
+
expr = resolve_column(source_path).alias(target_col)
|
|
85
|
+
exprs.append(expr)
|
|
86
|
+
|
|
87
|
+
return df.select(exprs)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def comp(
|
|
91
|
+
path: str,
|
|
92
|
+
key: str | None = None,
|
|
93
|
+
) -> FromConfigMixin[Any]:
|
|
94
|
+
"""
|
|
95
|
+
Generic factory:
|
|
96
|
+
- loads YAML
|
|
97
|
+
- resolves module_path
|
|
98
|
+
- instantiates component via FromConfigMixin
|
|
99
|
+
"""
|
|
100
|
+
yaml_path = Path(path)
|
|
101
|
+
with yaml_path.open("r") as f:
|
|
102
|
+
raw: dict[str, Any] = yaml.safe_load(f)
|
|
103
|
+
if key is not None:
|
|
104
|
+
raw = raw[key]
|
|
105
|
+
module_path = raw["module_path"]
|
|
106
|
+
cls: type[FromConfigMixin[Any]] = load_class(module_path)
|
|
107
|
+
return cls.from_kwargs(**raw)
|
|
@@ -79,6 +79,9 @@ class FakeDatasetConnector(DatasetConnector[FakeDatasetConnectorSettings]):
|
|
|
79
79
|
def to(self, ds: Any) -> None:
|
|
80
80
|
self.__class__.last_written = ds.polars
|
|
81
81
|
|
|
82
|
+
def target_exists(self, target: str) -> bool:
|
|
83
|
+
return target == "exists"
|
|
84
|
+
|
|
82
85
|
|
|
83
86
|
class FakeTextPreprocessorSettings(TextPreprocessorSettings):
|
|
84
87
|
kind: str = "suffix"
|
|
@@ -5,7 +5,9 @@ from typing import TypedDict, cast
|
|
|
5
5
|
|
|
6
6
|
import polars as pl
|
|
7
7
|
import pytest
|
|
8
|
+
from minio.error import S3Error
|
|
8
9
|
from pydantic import SecretStr
|
|
10
|
+
from urllib3.response import BaseHTTPResponse
|
|
9
11
|
|
|
10
12
|
from retrievalbase.connector.minio import MinioDatasetConnector
|
|
11
13
|
from retrievalbase.connector.parquet import ParquetDatasetConnector
|
|
@@ -82,6 +84,18 @@ def test_parquet_connector_round_trips_dataframe(tmp_path, lazy: bool) -> None:
|
|
|
82
84
|
assert loaded.polars.to_dict(as_series=False) == sample_text_df.to_dict(as_series=False)
|
|
83
85
|
|
|
84
86
|
|
|
87
|
+
def test_parquet_connector_reports_target_existence(tmp_path) -> None:
|
|
88
|
+
existing_path = tmp_path / "dataset.parquet"
|
|
89
|
+
missing_path = tmp_path / "missing.parquet"
|
|
90
|
+
existing_path.write_bytes(b"not a real parquet file")
|
|
91
|
+
connector = ParquetDatasetConnector(
|
|
92
|
+
ParquetDatasetConnectorSettings(module_path="x", path=str(existing_path), lazy=False)
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
assert connector.target_exists(str(existing_path)) is True
|
|
96
|
+
assert connector.target_exists(str(missing_path)) is False
|
|
97
|
+
|
|
98
|
+
|
|
85
99
|
def test_minio_connector_reads_and_writes_parquet_payloads(
|
|
86
100
|
monkeypatch: pytest.MonkeyPatch,
|
|
87
101
|
) -> None:
|
|
@@ -160,3 +174,36 @@ def test_minio_connector_reads_and_writes_parquet_payloads(
|
|
|
160
174
|
assert loaded.polars.to_dict(as_series=False) == sample_text_df.to_dict(as_series=False)
|
|
161
175
|
assert writes["closed"] is True
|
|
162
176
|
assert writes["released"] is True
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def test_minio_connector_reports_target_existence(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
180
|
+
calls: list[tuple[str, str]] = []
|
|
181
|
+
|
|
182
|
+
class FakeMinio:
|
|
183
|
+
def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
|
|
184
|
+
pass
|
|
185
|
+
|
|
186
|
+
def stat_object(self, bucket: str, key: str) -> object:
|
|
187
|
+
calls.append((bucket, key))
|
|
188
|
+
if key == "missing.parquet":
|
|
189
|
+
raise S3Error(
|
|
190
|
+
cast(BaseHTTPResponse, None), "NoSuchKey", "not found", key, "request-id", "host-id", bucket, key
|
|
191
|
+
)
|
|
192
|
+
return object()
|
|
193
|
+
|
|
194
|
+
monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
|
|
195
|
+
|
|
196
|
+
connector = MinioDatasetConnector(
|
|
197
|
+
MinioDatasetConnectorSettings(
|
|
198
|
+
module_path="x",
|
|
199
|
+
endpoint="https://minio.local",
|
|
200
|
+
bucket="datasets",
|
|
201
|
+
key="sample.parquet",
|
|
202
|
+
access_key=SecretStr("access"),
|
|
203
|
+
secret_key=SecretStr("secret"),
|
|
204
|
+
)
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
assert connector.target_exists("sample.parquet") is True
|
|
208
|
+
assert connector.target_exists("missing.parquet") is False
|
|
209
|
+
assert calls == [("datasets", "sample.parquet"), ("datasets", "missing.parquet")]
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_base_contracts.py
RENAMED
|
@@ -33,6 +33,9 @@ class RaisingConnector(DatasetConnector[DatasetConnectorSettings]):
|
|
|
33
33
|
def to(self, ds) -> None:
|
|
34
34
|
super().to(ds)
|
|
35
35
|
|
|
36
|
+
def target_exists(self, target: str) -> bool:
|
|
37
|
+
return super().target_exists(target)
|
|
38
|
+
|
|
36
39
|
|
|
37
40
|
class RaisingPreprocessor(TextPreprocessor[TextPreprocessorSettings]):
|
|
38
41
|
def apply(self, ds):
|
|
@@ -94,6 +97,8 @@ def test_base_contracts_raise_not_implemented_and_noops() -> None:
|
|
|
94
97
|
connector._load()
|
|
95
98
|
with pytest.raises(NotImplementedError):
|
|
96
99
|
connector.to(PolarsTextDataset.from_records([("x", {"doc_id": "1"})]))
|
|
100
|
+
with pytest.raises(NotImplementedError):
|
|
101
|
+
connector.target_exists("x")
|
|
97
102
|
with pytest.raises(NotImplementedError):
|
|
98
103
|
preprocessor.apply(PolarsTextDataset.from_records([("x", {"doc_id": "1"})]))
|
|
99
104
|
with pytest.raises(NotImplementedError):
|
|
@@ -25,7 +25,7 @@ def test_load_class_and_comp_create_runtime_from_module_path(tmp_path) -> None:
|
|
|
25
25
|
assert instance.config.value == 17
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
def test_comp_resolves_nested_settings_from_environment(tmp_path, monkeypatch) -> None:
|
|
28
|
+
def test_comp_loads_missing_nested_settings_fields_from_environment(tmp_path, monkeypatch) -> None:
|
|
29
29
|
monkeypatch.setenv("FAKE_CHILD_TOKEN", "token-from-env")
|
|
30
30
|
config_path = tmp_path / "component.yaml"
|
|
31
31
|
config_path.write_text(
|
|
@@ -35,7 +35,6 @@ def test_comp_resolves_nested_settings_from_environment(tmp_path, monkeypatch) -
|
|
|
35
35
|
f" module_path: {FAKE_PARENT_RUNTIME_PATH}",
|
|
36
36
|
" child:",
|
|
37
37
|
f" module_path: {FAKE_ENV_CHILD_RUNTIME_PATH}",
|
|
38
|
-
" token: ${FAKE_CHILD_TOKEN}",
|
|
39
38
|
"",
|
|
40
39
|
]
|
|
41
40
|
),
|
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
import importlib
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
from types import UnionType
|
|
4
|
-
from typing import TYPE_CHECKING, Any, get_args, get_origin
|
|
5
|
-
|
|
6
|
-
import polars as pl
|
|
7
|
-
import yaml
|
|
8
|
-
from pydantic import SecretStr
|
|
9
|
-
|
|
10
|
-
from retrievalbase.mixins import FromConfigMixin
|
|
11
|
-
from retrievalbase.settings import FromConfigMixinSettings
|
|
12
|
-
|
|
13
|
-
if TYPE_CHECKING:
|
|
14
|
-
from retrievalbase.connector.minio import MinioDatasetConnector
|
|
15
|
-
from retrievalbase.connector.parquet import ParquetDatasetConnector
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def load_class(path: str) -> Any:
|
|
19
|
-
module_path, class_name = path.rsplit(".", 1)
|
|
20
|
-
module = importlib.import_module(module_path)
|
|
21
|
-
return getattr(module, class_name)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def _get_minio_connector(
|
|
25
|
-
bucket: str, key: str, endpoint: str, access_key: str, secret_key: str
|
|
26
|
-
) -> "MinioDatasetConnector":
|
|
27
|
-
from retrievalbase.connector.minio import MinioDatasetConnector
|
|
28
|
-
from retrievalbase.connector.settings import MinioDatasetConnectorSettings
|
|
29
|
-
|
|
30
|
-
config = MinioDatasetConnectorSettings(
|
|
31
|
-
module_path="",
|
|
32
|
-
endpoint=endpoint,
|
|
33
|
-
bucket=bucket,
|
|
34
|
-
key=key,
|
|
35
|
-
access_key=SecretStr(access_key),
|
|
36
|
-
secret_key=SecretStr(secret_key),
|
|
37
|
-
)
|
|
38
|
-
return MinioDatasetConnector(config)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def _get_parquet_connector(path: str, *, lazy: bool) -> "ParquetDatasetConnector":
|
|
42
|
-
from retrievalbase.connector.parquet import ParquetDatasetConnector
|
|
43
|
-
from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
|
|
44
|
-
|
|
45
|
-
config = ParquetDatasetConnectorSettings(module_path="", path=path, lazy=lazy)
|
|
46
|
-
return ParquetDatasetConnector(config)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def extract_schema_columns(
|
|
50
|
-
schema: dict[str, pl.DataType] | list[pl.Field],
|
|
51
|
-
prefix: str = "",
|
|
52
|
-
) -> list[str]:
|
|
53
|
-
cols: list[str] = []
|
|
54
|
-
if isinstance(schema, dict):
|
|
55
|
-
items = schema.items()
|
|
56
|
-
else:
|
|
57
|
-
items = ((field.name, field.dtype) for field in schema) # type: ignore[assignment]
|
|
58
|
-
for name, dtype in items:
|
|
59
|
-
full_name = f"{prefix}.{name}" if prefix else name
|
|
60
|
-
|
|
61
|
-
if isinstance(dtype, pl.Struct):
|
|
62
|
-
nested = extract_schema_columns(dtype.fields, full_name)
|
|
63
|
-
cols.extend(nested)
|
|
64
|
-
else:
|
|
65
|
-
cols.append(full_name)
|
|
66
|
-
return cols
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def resolve_column(path: str) -> pl.Expr:
|
|
70
|
-
parts = path.split(".")
|
|
71
|
-
|
|
72
|
-
expr = pl.col(parts[0])
|
|
73
|
-
for p in parts[1:]:
|
|
74
|
-
expr = expr.struct.field(p)
|
|
75
|
-
|
|
76
|
-
return expr
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def build_schema(
|
|
80
|
-
df: pl.DataFrame,
|
|
81
|
-
schema: dict[str, str],
|
|
82
|
-
) -> pl.DataFrame:
|
|
83
|
-
exprs = []
|
|
84
|
-
|
|
85
|
-
for target_col, source_path in schema.items():
|
|
86
|
-
expr = resolve_column(source_path).alias(target_col)
|
|
87
|
-
exprs.append(expr)
|
|
88
|
-
|
|
89
|
-
return df.select(exprs)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def _is_env_placeholder(value: Any) -> bool:
|
|
93
|
-
return isinstance(value, str) and value.startswith("${") and value.endswith("}")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def _remove_env_placeholders(raw: dict[str, Any]) -> dict[str, Any]:
|
|
97
|
-
return {key: value for key, value in raw.items() if not _is_env_placeholder(value)}
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def _is_settings_class(value: Any) -> bool:
|
|
101
|
-
return isinstance(value, type) and issubclass(value, FromConfigMixinSettings)
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def _settings_classes_from_annotation(annotation: Any) -> list[type[FromConfigMixinSettings]]:
|
|
105
|
-
origin = get_origin(annotation)
|
|
106
|
-
if _is_settings_class(annotation):
|
|
107
|
-
return [annotation]
|
|
108
|
-
if _is_settings_class(origin):
|
|
109
|
-
return [origin]
|
|
110
|
-
if origin in (UnionType,):
|
|
111
|
-
return [settings_cls for arg in get_args(annotation) for settings_cls in _settings_classes_from_annotation(arg)]
|
|
112
|
-
return [settings_cls for arg in get_args(annotation) for settings_cls in _settings_classes_from_annotation(arg)]
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def _settings_class_from_module_path(module_path: str) -> type[FromConfigMixinSettings] | None:
|
|
116
|
-
try:
|
|
117
|
-
cls = load_class(module_path)
|
|
118
|
-
except (ImportError, AttributeError, ValueError):
|
|
119
|
-
return None
|
|
120
|
-
if isinstance(cls, type) and issubclass(cls, FromConfigMixin):
|
|
121
|
-
return cls.get_config_class()
|
|
122
|
-
return None
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def _settings_class_for_field(annotation: Any, value: dict[str, Any]) -> type[FromConfigMixinSettings] | None:
|
|
126
|
-
module_path = value.get("module_path")
|
|
127
|
-
if isinstance(module_path, str):
|
|
128
|
-
settings_cls = _settings_class_from_module_path(module_path)
|
|
129
|
-
if settings_cls is not None:
|
|
130
|
-
return settings_cls
|
|
131
|
-
|
|
132
|
-
settings_classes = _settings_classes_from_annotation(annotation)
|
|
133
|
-
if len(settings_classes) == 1:
|
|
134
|
-
return settings_classes[0]
|
|
135
|
-
return None
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def _resolve_settings_value(annotation: Any, value: Any) -> Any:
|
|
139
|
-
if isinstance(value, list):
|
|
140
|
-
return [_resolve_settings_value(Any, item) for item in value]
|
|
141
|
-
if not isinstance(value, dict):
|
|
142
|
-
return value
|
|
143
|
-
|
|
144
|
-
settings_cls = _settings_class_for_field(annotation, value)
|
|
145
|
-
if settings_cls is None:
|
|
146
|
-
return {
|
|
147
|
-
key: _resolve_settings_value(Any, nested_value)
|
|
148
|
-
for key, nested_value in _remove_env_placeholders(value).items()
|
|
149
|
-
}
|
|
150
|
-
return _build_settings(settings_cls, value)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def _build_settings(
|
|
154
|
-
settings_cls: type[FromConfigMixinSettings],
|
|
155
|
-
raw: dict[str, Any],
|
|
156
|
-
) -> FromConfigMixinSettings:
|
|
157
|
-
resolved = _remove_env_placeholders(raw)
|
|
158
|
-
for field_name, field_info in settings_cls.model_fields.items():
|
|
159
|
-
if field_name in resolved:
|
|
160
|
-
resolved[field_name] = _resolve_settings_value(field_info.annotation, resolved[field_name])
|
|
161
|
-
return settings_cls(**resolved)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def comp(
|
|
165
|
-
path: str,
|
|
166
|
-
key: str | None = None,
|
|
167
|
-
) -> FromConfigMixin[Any]:
|
|
168
|
-
"""
|
|
169
|
-
Generic factory:
|
|
170
|
-
- loads YAML
|
|
171
|
-
- resolves module_path
|
|
172
|
-
- instantiates component via FromConfigMixin
|
|
173
|
-
"""
|
|
174
|
-
yaml_path = Path(path)
|
|
175
|
-
with yaml_path.open("r") as f:
|
|
176
|
-
raw: dict[str, Any] = yaml.safe_load(f)
|
|
177
|
-
if key is not None:
|
|
178
|
-
raw = raw[key]
|
|
179
|
-
module_path = raw["module_path"]
|
|
180
|
-
cls: type[FromConfigMixin[Any]] = load_class(module_path)
|
|
181
|
-
settings = _build_settings(cls.get_config_class(), raw)
|
|
182
|
-
return cls.from_config(settings)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/__init__.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/preprocess.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/token_counter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/scores.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_dataset_base_contracts.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_dataset_mixins_more.py
RENAMED
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_polars_lazy_paths.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_preprocess_filters.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_token_counter_hf.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_async_batcher.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_bm25_retriever.py
RENAMED
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_dense_retriever.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hybrid_retriever.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_retriever_base.py
RENAMED
|
File without changes
|
|
File without changes
|
{retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_vector_stores.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|