retrievalbase 2.2.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/CHANGELOG.md +7 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/PKG-INFO +1 -1
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/pyproject.toml +1 -1
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/connector/__init__.py +24 -0
- retrievalbase-2.3.0/src/retrievalbase/connector/minio.py +96 -0
- retrievalbase-2.3.0/src/retrievalbase/connector/parquet.py +43 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/exceptions.py +22 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/fixtures/components.py +3 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_connector/test_connectors.py +123 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_base_contracts.py +5 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/uv.lock +1 -1
- retrievalbase-2.2.0/src/retrievalbase/connector/minio.py +0 -55
- retrievalbase-2.2.0/src/retrievalbase/connector/parquet.py +0 -24
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.gitignore +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.gitlab-ci.yml +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.pre-commit-config.yaml +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.releaserc.json +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/AGENTS.md +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/Makefile +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/README.md +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/codecov.yml +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/commitlint.config.cjs +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/connector/settings.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/constants.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/hf.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/mixins.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/polars.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/preprocess.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/token_counter.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/settings.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/enums.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/async_batcher.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/embedders.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/evaluators.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/scores.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/processors.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/rerankers.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/dense/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/settings.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/vector_stores.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/ingestion/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/ingestion/settings.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/mixins.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/py.typed +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/settings.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/types.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/utils.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/conftest.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/fixtures/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/fixtures/data.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_dataset/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_dataset/test_huggingface_adapter.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_evaluation/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_evaluation/conftest.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_evaluation/test_python_evaluator.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_config/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_config/test_mixins.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_config/test_settings.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_connector/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/conftest.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_dataset_base_contracts.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_dataset_mixins_more.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_polars_dataset.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_polars_lazy_paths.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_preprocess_filters.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_token_counter_hf.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_token_counters.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/conftest.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_async_batcher.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_bm25_retriever.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_dense_retriever.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_embedders.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hf_reranker.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hybrid_retriever.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hybrid_retriever_runtime.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_processors.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_python_evaluator_classes.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_python_evaluator_runtime.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_rerankers.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_retriever_base.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_scores.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_vector_stores.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_ingestion/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_ingestion/test_text_ingestion_pipeline.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_utils/__init__.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_utils/test_utils.py +0 -0
- {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_utils/test_utils_connectors.py +0 -0
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# [2.3.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.2.0...v2.3.0) (2026-05-24)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* add recursive dataset connector loading ([3e7e46f](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/3e7e46f6d8533612cb3e47375985aeb3ebf156ff))
|
|
7
|
+
|
|
1
8
|
# [2.2.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.3...v2.2.0) (2026-05-22)
|
|
2
9
|
|
|
3
10
|
|
|
@@ -29,6 +29,10 @@ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
|
|
|
29
29
|
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
30
30
|
raise NotImplementedError
|
|
31
31
|
|
|
32
|
+
@abstractmethod
|
|
33
|
+
def _load_recursive(self, *paths: str) -> pl.DataFrame | pl.LazyFrame:
|
|
34
|
+
raise NotImplementedError
|
|
35
|
+
|
|
32
36
|
@abstractmethod
|
|
33
37
|
def to(self, ds: "Dataset[Any]") -> None:
|
|
34
38
|
raise NotImplementedError
|
|
@@ -57,6 +61,26 @@ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
|
|
|
57
61
|
|
|
58
62
|
return PolarsTextDataset.from_polars(df)
|
|
59
63
|
|
|
64
|
+
def load_recursive(self, *paths: str) -> "Dataset[pl.DataFrame | pl.LazyFrame]":
|
|
65
|
+
from retrievalbase.dataset.polars import PolarsDataset
|
|
66
|
+
|
|
67
|
+
_logger.info(f"Loading recursive dataset | connector={self.__class__.__name__}")
|
|
68
|
+
|
|
69
|
+
df = self._load_recursive(*paths)
|
|
70
|
+
self._log_polars_info(df)
|
|
71
|
+
|
|
72
|
+
return PolarsDataset.from_polars(df)
|
|
73
|
+
|
|
74
|
+
def load_recursive_text(self, *paths: str) -> "TextDataset[pl.DataFrame | pl.LazyFrame]":
|
|
75
|
+
from retrievalbase.dataset.polars import PolarsTextDataset
|
|
76
|
+
|
|
77
|
+
_logger.info(f"Loading recursive text dataset | connector={self.__class__.__name__}")
|
|
78
|
+
|
|
79
|
+
df = self._load_recursive(*paths)
|
|
80
|
+
self._log_polars_info(df)
|
|
81
|
+
|
|
82
|
+
return PolarsTextDataset.from_polars(df)
|
|
83
|
+
|
|
60
84
|
# ------------------------------------------------------------------
|
|
61
85
|
# Helpers
|
|
62
86
|
# ------------------------------------------------------------------
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from typing import TYPE_CHECKING, Any
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
from minio import Minio
|
|
7
|
+
from minio.error import S3Error
|
|
8
|
+
|
|
9
|
+
from retrievalbase.connector import DatasetConnector
|
|
10
|
+
from retrievalbase.connector.settings import MinioDatasetConnectorSettings
|
|
11
|
+
from retrievalbase.exceptions import MinioParquetObjectsNotFoundError
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from retrievalbase.dataset import Dataset
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
|
|
18
|
+
def __init__(self, config: MinioDatasetConnectorSettings):
|
|
19
|
+
super().__init__(config)
|
|
20
|
+
self.client = Minio(
|
|
21
|
+
self.config.endpoint.replace("http://", "").replace("https://", ""),
|
|
22
|
+
access_key=self.config.access_key.get_secret_value(),
|
|
23
|
+
secret_key=self.config.secret_key.get_secret_value(),
|
|
24
|
+
secure=self.config.endpoint.startswith("https://"),
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
28
|
+
return self._read_parquet_object(self.config.bucket, self.config.key)
|
|
29
|
+
|
|
30
|
+
def _load_recursive(self, *paths: str) -> pl.DataFrame:
|
|
31
|
+
dataframes = [
|
|
32
|
+
self._read_parquet_object(bucket, object_name)
|
|
33
|
+
for bucket, object_name in self._iter_parquet_objects(paths or (self.config.key,))
|
|
34
|
+
]
|
|
35
|
+
if not dataframes:
|
|
36
|
+
raise MinioParquetObjectsNotFoundError(paths or (self.config.key,))
|
|
37
|
+
return pl.concat(dataframes)
|
|
38
|
+
|
|
39
|
+
def _read_parquet_object(self, bucket: str, object_name: str) -> pl.DataFrame:
|
|
40
|
+
response = self.client.get_object(bucket, object_name)
|
|
41
|
+
try:
|
|
42
|
+
buffer = io.BytesIO(response.read())
|
|
43
|
+
finally:
|
|
44
|
+
response.close()
|
|
45
|
+
response.release_conn()
|
|
46
|
+
return pl.read_parquet(buffer)
|
|
47
|
+
|
|
48
|
+
def _iter_parquet_objects(self, paths: tuple[str, ...]) -> list[tuple[str, str]]:
|
|
49
|
+
objects: list[tuple[str, str]] = []
|
|
50
|
+
for path in paths:
|
|
51
|
+
bucket, object_path = self._resolve_path(path)
|
|
52
|
+
if object_path.endswith(".parquet"):
|
|
53
|
+
objects.append((bucket, object_path))
|
|
54
|
+
continue
|
|
55
|
+
prefix = object_path.rstrip("/")
|
|
56
|
+
if prefix:
|
|
57
|
+
prefix = f"{prefix}/"
|
|
58
|
+
objects.extend(
|
|
59
|
+
(bucket, item.object_name)
|
|
60
|
+
for item in self.client.list_objects(bucket, prefix=prefix, recursive=True)
|
|
61
|
+
if item.object_name.endswith(".parquet")
|
|
62
|
+
)
|
|
63
|
+
return sorted(objects)
|
|
64
|
+
|
|
65
|
+
def _resolve_path(self, path: str) -> tuple[str, str]:
|
|
66
|
+
parsed = urlparse(path)
|
|
67
|
+
if parsed.scheme in {"s3", "minio"}:
|
|
68
|
+
return parsed.netloc, parsed.path.lstrip("/")
|
|
69
|
+
|
|
70
|
+
normalized = path.lstrip("/")
|
|
71
|
+
bucket, separator, key = normalized.partition("/")
|
|
72
|
+
if separator and bucket == self.config.bucket:
|
|
73
|
+
return bucket, key
|
|
74
|
+
return self.config.bucket, normalized
|
|
75
|
+
|
|
76
|
+
def to(self, ds: "Dataset[Any]") -> None:
|
|
77
|
+
df = ds.polars
|
|
78
|
+
buffer = io.BytesIO()
|
|
79
|
+
df.write_parquet(buffer)
|
|
80
|
+
buffer.seek(0)
|
|
81
|
+
self.client.put_object(
|
|
82
|
+
bucket_name=self.config.bucket,
|
|
83
|
+
object_name=self.config.key,
|
|
84
|
+
data=buffer,
|
|
85
|
+
length=buffer.getbuffer().nbytes,
|
|
86
|
+
content_type="application/octet-stream",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
def target_exists(self, target: str) -> bool:
|
|
90
|
+
try:
|
|
91
|
+
self.client.stat_object(self.config.bucket, target)
|
|
92
|
+
except S3Error as error:
|
|
93
|
+
if error.code in {"NoSuchBucket", "NoSuchKey"}:
|
|
94
|
+
return False
|
|
95
|
+
raise
|
|
96
|
+
return True
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import TYPE_CHECKING, Any
|
|
3
|
+
|
|
4
|
+
import polars as pl
|
|
5
|
+
|
|
6
|
+
from retrievalbase.connector import DatasetConnector
|
|
7
|
+
from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
|
|
8
|
+
from retrievalbase.exceptions import ParquetFilesNotFoundError
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from retrievalbase.dataset import Dataset
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings]):
|
|
15
|
+
def __init__(self, config: ParquetDatasetConnectorSettings):
|
|
16
|
+
super().__init__(config)
|
|
17
|
+
|
|
18
|
+
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
19
|
+
return pl.scan_parquet(self.config.path) if self.config.lazy else pl.read_parquet(self.config.path)
|
|
20
|
+
|
|
21
|
+
def _load_recursive(self, *paths: str) -> pl.DataFrame | pl.LazyFrame:
|
|
22
|
+
parquet_paths = self._iter_parquet_paths(paths or (self.config.path,))
|
|
23
|
+
if not parquet_paths:
|
|
24
|
+
raise ParquetFilesNotFoundError(paths or (self.config.path,))
|
|
25
|
+
if self.config.lazy:
|
|
26
|
+
return pl.scan_parquet([str(path) for path in parquet_paths])
|
|
27
|
+
return pl.concat([pl.read_parquet(path) for path in parquet_paths])
|
|
28
|
+
|
|
29
|
+
def _iter_parquet_paths(self, paths: tuple[str, ...]) -> list[Path]:
|
|
30
|
+
parquet_paths: list[Path] = []
|
|
31
|
+
for raw_path in paths:
|
|
32
|
+
path = Path(raw_path)
|
|
33
|
+
if path.suffix == ".parquet":
|
|
34
|
+
parquet_paths.append(path)
|
|
35
|
+
continue
|
|
36
|
+
parquet_paths.extend(path.rglob("*.parquet"))
|
|
37
|
+
return sorted(parquet_paths)
|
|
38
|
+
|
|
39
|
+
def to(self, ds: "Dataset[Any]") -> None:
|
|
40
|
+
ds.polars.write_parquet(self.config.path)
|
|
41
|
+
|
|
42
|
+
def target_exists(self, target: str) -> bool:
|
|
43
|
+
return Path(target).exists()
|
|
@@ -37,6 +37,28 @@ class DatasetError(RetrievalBaseError):
|
|
|
37
37
|
pass
|
|
38
38
|
|
|
39
39
|
|
|
40
|
+
class DatasetConnectorError(RetrievalBaseError):
|
|
41
|
+
"""Base error for dataset connector failures."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MinioParquetObjectsNotFoundError(DatasetConnectorError, FileNotFoundError):
|
|
45
|
+
"""Raised when a MinIO path does not contain parquet objects."""
|
|
46
|
+
|
|
47
|
+
def __init__(self, paths: tuple[str, ...]):
|
|
48
|
+
self.paths = paths
|
|
49
|
+
roots = ", ".join(paths)
|
|
50
|
+
super().__init__(f"No parquet objects found in MinIO paths: {roots}")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ParquetFilesNotFoundError(DatasetConnectorError, FileNotFoundError):
|
|
54
|
+
"""Raised when local paths do not contain parquet files."""
|
|
55
|
+
|
|
56
|
+
def __init__(self, paths: tuple[str, ...]):
|
|
57
|
+
self.paths = paths
|
|
58
|
+
roots = ", ".join(paths)
|
|
59
|
+
super().__init__(f"No parquet files found in paths: {roots}")
|
|
60
|
+
|
|
61
|
+
|
|
40
62
|
class DatasetSplitError(DatasetError, ValueError):
|
|
41
63
|
"""Raised when dataset split parameters are invalid."""
|
|
42
64
|
|
|
@@ -76,6 +76,9 @@ class FakeDatasetConnector(DatasetConnector[FakeDatasetConnectorSettings]):
|
|
|
76
76
|
def _load(self) -> pl.DataFrame:
|
|
77
77
|
return pl.DataFrame(self.config.rows)
|
|
78
78
|
|
|
79
|
+
def _load_recursive(self, *paths: str) -> pl.DataFrame:
|
|
80
|
+
return self._load()
|
|
81
|
+
|
|
79
82
|
def to(self, ds: Any) -> None:
|
|
80
83
|
self.__class__.last_written = ds.polars
|
|
81
84
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import io
|
|
4
|
+
from types import SimpleNamespace
|
|
4
5
|
from typing import TypedDict, cast
|
|
5
6
|
|
|
6
7
|
import polars as pl
|
|
@@ -17,6 +18,7 @@ from retrievalbase.connector.settings import (
|
|
|
17
18
|
)
|
|
18
19
|
from retrievalbase.dataset import Dataset, TextDataset
|
|
19
20
|
from retrievalbase.dataset.polars import PolarsDataset, PolarsTextDataset
|
|
21
|
+
from retrievalbase.exceptions import MinioParquetObjectsNotFoundError, ParquetFilesNotFoundError
|
|
20
22
|
from tests.fixtures.data import make_text_dataframe
|
|
21
23
|
|
|
22
24
|
|
|
@@ -96,6 +98,36 @@ def test_parquet_connector_reports_target_existence(tmp_path) -> None:
|
|
|
96
98
|
assert connector.target_exists(str(missing_path)) is False
|
|
97
99
|
|
|
98
100
|
|
|
101
|
+
def test_parquet_connector_loads_recursive_paths(tmp_path) -> None:
|
|
102
|
+
root = tmp_path / "root"
|
|
103
|
+
nested = root / "nested"
|
|
104
|
+
nested.mkdir(parents=True)
|
|
105
|
+
first = make_text_dataframe([{"page_content": "one", "metadata": {"doc_id": "a"}}])
|
|
106
|
+
second = make_text_dataframe([{"page_content": "two", "metadata": {"doc_id": "b"}}])
|
|
107
|
+
ignored = root / "ignored.txt"
|
|
108
|
+
first.write_parquet(root / "first.parquet")
|
|
109
|
+
second.write_parquet(nested / "second.parquet")
|
|
110
|
+
ignored.write_text("ignore me")
|
|
111
|
+
|
|
112
|
+
connector = ParquetDatasetConnector(ParquetDatasetConnectorSettings(module_path="x", path=str(root), lazy=False))
|
|
113
|
+
|
|
114
|
+
loaded = connector.load_recursive_text()
|
|
115
|
+
|
|
116
|
+
assert loaded.polars.to_dict(as_series=False) == {
|
|
117
|
+
"page_content": ["one", "two"],
|
|
118
|
+
"metadata": [{"doc_id": "a"}, {"doc_id": "b"}],
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def test_parquet_connector_raises_when_recursive_path_has_no_parquet(tmp_path) -> None:
|
|
123
|
+
empty = tmp_path / "empty"
|
|
124
|
+
empty.mkdir()
|
|
125
|
+
connector = ParquetDatasetConnector(ParquetDatasetConnectorSettings(module_path="x", path=str(empty), lazy=False))
|
|
126
|
+
|
|
127
|
+
with pytest.raises(ParquetFilesNotFoundError, match="No parquet files found"):
|
|
128
|
+
connector.load_recursive()
|
|
129
|
+
|
|
130
|
+
|
|
99
131
|
def test_minio_connector_reads_and_writes_parquet_payloads(
|
|
100
132
|
monkeypatch: pytest.MonkeyPatch,
|
|
101
133
|
) -> None:
|
|
@@ -207,3 +239,94 @@ def test_minio_connector_reports_target_existence(monkeypatch: pytest.MonkeyPatc
|
|
|
207
239
|
assert connector.target_exists("sample.parquet") is True
|
|
208
240
|
assert connector.target_exists("missing.parquet") is False
|
|
209
241
|
assert calls == [("datasets", "sample.parquet"), ("datasets", "missing.parquet")]
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def test_minio_connector_loads_recursive_parquet_prefixes(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
245
|
+
frames = {
|
|
246
|
+
"a/df.parquet": pl.DataFrame({"id": [1], "text": ["one"]}),
|
|
247
|
+
"a/B/df2.parquet": pl.DataFrame({"id": [2], "text": ["two"]}),
|
|
248
|
+
"another_root/df3.parquet": pl.DataFrame({"id": [3], "text": ["three"]}),
|
|
249
|
+
"another_root/ignored.txt": pl.DataFrame({"id": [99], "text": ["ignored"]}),
|
|
250
|
+
}
|
|
251
|
+
calls: dict[str, list[object]] = {"listed": [], "read": []}
|
|
252
|
+
|
|
253
|
+
def parquet_payload(df: pl.DataFrame) -> bytes:
|
|
254
|
+
buffer = io.BytesIO()
|
|
255
|
+
df.write_parquet(buffer)
|
|
256
|
+
return buffer.getvalue()
|
|
257
|
+
|
|
258
|
+
class FakeResponse:
|
|
259
|
+
def __init__(self, payload: bytes) -> None:
|
|
260
|
+
self.payload = payload
|
|
261
|
+
|
|
262
|
+
def read(self) -> bytes:
|
|
263
|
+
return self.payload
|
|
264
|
+
|
|
265
|
+
def close(self) -> None:
|
|
266
|
+
pass
|
|
267
|
+
|
|
268
|
+
def release_conn(self) -> None:
|
|
269
|
+
pass
|
|
270
|
+
|
|
271
|
+
class FakeMinio:
|
|
272
|
+
def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
|
|
273
|
+
pass
|
|
274
|
+
|
|
275
|
+
def list_objects(self, bucket: str, prefix: str, recursive: bool) -> list[SimpleNamespace]:
|
|
276
|
+
calls["listed"].append((bucket, prefix, recursive))
|
|
277
|
+
return [SimpleNamespace(object_name=key) for key in frames if key.startswith(prefix)]
|
|
278
|
+
|
|
279
|
+
def get_object(self, bucket: str, key: str) -> FakeResponse:
|
|
280
|
+
calls["read"].append((bucket, key))
|
|
281
|
+
return FakeResponse(parquet_payload(frames[key]))
|
|
282
|
+
|
|
283
|
+
monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
|
|
284
|
+
|
|
285
|
+
connector = MinioDatasetConnector(
|
|
286
|
+
MinioDatasetConnectorSettings(
|
|
287
|
+
module_path="x",
|
|
288
|
+
endpoint="https://minio.local",
|
|
289
|
+
bucket="datasets",
|
|
290
|
+
key="a",
|
|
291
|
+
access_key=SecretStr("access"),
|
|
292
|
+
secret_key=SecretStr("secret"),
|
|
293
|
+
)
|
|
294
|
+
)
|
|
295
|
+
|
|
296
|
+
loaded = connector.load_recursive("datasets/a", "s3://datasets/another_root")
|
|
297
|
+
|
|
298
|
+
assert calls["listed"] == [("datasets", "a/", True), ("datasets", "another_root/", True)]
|
|
299
|
+
assert calls["read"] == [
|
|
300
|
+
("datasets", "a/B/df2.parquet"),
|
|
301
|
+
("datasets", "a/df.parquet"),
|
|
302
|
+
("datasets", "another_root/df3.parquet"),
|
|
303
|
+
]
|
|
304
|
+
assert loaded.polars.to_dict(as_series=False) == {
|
|
305
|
+
"id": [2, 1, 3],
|
|
306
|
+
"text": ["two", "one", "three"],
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def test_minio_connector_raises_when_prefix_has_no_parquet(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
311
|
+
class FakeMinio:
|
|
312
|
+
def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
|
|
313
|
+
pass
|
|
314
|
+
|
|
315
|
+
def list_objects(self, bucket: str, prefix: str, recursive: bool) -> list[SimpleNamespace]:
|
|
316
|
+
return [SimpleNamespace(object_name="empty/readme.txt")]
|
|
317
|
+
|
|
318
|
+
monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
|
|
319
|
+
|
|
320
|
+
connector = MinioDatasetConnector(
|
|
321
|
+
MinioDatasetConnectorSettings(
|
|
322
|
+
module_path="x",
|
|
323
|
+
endpoint="https://minio.local",
|
|
324
|
+
bucket="datasets",
|
|
325
|
+
key="empty",
|
|
326
|
+
access_key=SecretStr("access"),
|
|
327
|
+
secret_key=SecretStr("secret"),
|
|
328
|
+
)
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
with pytest.raises(MinioParquetObjectsNotFoundError, match="No parquet objects found"):
|
|
332
|
+
connector.load_recursive()
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_base_contracts.py
RENAMED
|
@@ -30,6 +30,9 @@ class RaisingConnector(DatasetConnector[DatasetConnectorSettings]):
|
|
|
30
30
|
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
31
31
|
return super()._load()
|
|
32
32
|
|
|
33
|
+
def _load_recursive(self, *paths: str) -> pl.DataFrame | pl.LazyFrame:
|
|
34
|
+
return super()._load_recursive(*paths)
|
|
35
|
+
|
|
33
36
|
def to(self, ds) -> None:
|
|
34
37
|
super().to(ds)
|
|
35
38
|
|
|
@@ -95,6 +98,8 @@ def test_base_contracts_raise_not_implemented_and_noops() -> None:
|
|
|
95
98
|
|
|
96
99
|
with pytest.raises(NotImplementedError):
|
|
97
100
|
connector._load()
|
|
101
|
+
with pytest.raises(NotImplementedError):
|
|
102
|
+
connector._load_recursive("x")
|
|
98
103
|
with pytest.raises(NotImplementedError):
|
|
99
104
|
connector.to(PolarsTextDataset.from_records([("x", {"doc_id": "1"})]))
|
|
100
105
|
with pytest.raises(NotImplementedError):
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
import io
|
|
2
|
-
from typing import TYPE_CHECKING, Any
|
|
3
|
-
|
|
4
|
-
import polars as pl
|
|
5
|
-
from minio import Minio
|
|
6
|
-
from minio.error import S3Error
|
|
7
|
-
|
|
8
|
-
from retrievalbase.connector import DatasetConnector
|
|
9
|
-
from retrievalbase.connector.settings import MinioDatasetConnectorSettings
|
|
10
|
-
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
from retrievalbase.dataset import Dataset
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
|
|
16
|
-
def __init__(self, config: MinioDatasetConnectorSettings):
|
|
17
|
-
super().__init__(config)
|
|
18
|
-
self.client = Minio(
|
|
19
|
-
self.config.endpoint.replace("http://", "").replace("https://", ""),
|
|
20
|
-
access_key=self.config.access_key.get_secret_value(),
|
|
21
|
-
secret_key=self.config.secret_key.get_secret_value(),
|
|
22
|
-
secure=self.config.endpoint.startswith("https://"),
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
26
|
-
response = self.client.get_object(self.config.bucket, self.config.key)
|
|
27
|
-
try:
|
|
28
|
-
buffer = io.BytesIO(response.read())
|
|
29
|
-
finally:
|
|
30
|
-
response.close()
|
|
31
|
-
response.release_conn()
|
|
32
|
-
df = pl.read_parquet(buffer)
|
|
33
|
-
return df
|
|
34
|
-
|
|
35
|
-
def to(self, ds: "Dataset[Any]") -> None:
|
|
36
|
-
df = ds.polars
|
|
37
|
-
buffer = io.BytesIO()
|
|
38
|
-
df.write_parquet(buffer)
|
|
39
|
-
buffer.seek(0)
|
|
40
|
-
self.client.put_object(
|
|
41
|
-
bucket_name=self.config.bucket,
|
|
42
|
-
object_name=self.config.key,
|
|
43
|
-
data=buffer,
|
|
44
|
-
length=buffer.getbuffer().nbytes,
|
|
45
|
-
content_type="application/octet-stream",
|
|
46
|
-
)
|
|
47
|
-
|
|
48
|
-
def target_exists(self, target: str) -> bool:
|
|
49
|
-
try:
|
|
50
|
-
self.client.stat_object(self.config.bucket, target)
|
|
51
|
-
except S3Error as error:
|
|
52
|
-
if error.code in {"NoSuchBucket", "NoSuchKey"}:
|
|
53
|
-
return False
|
|
54
|
-
raise
|
|
55
|
-
return True
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from typing import TYPE_CHECKING, Any
|
|
3
|
-
|
|
4
|
-
import polars as pl
|
|
5
|
-
|
|
6
|
-
from retrievalbase.connector import DatasetConnector
|
|
7
|
-
from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
|
|
8
|
-
|
|
9
|
-
if TYPE_CHECKING:
|
|
10
|
-
from retrievalbase.dataset import Dataset
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings]):
|
|
14
|
-
def __init__(self, config: ParquetDatasetConnectorSettings):
|
|
15
|
-
super().__init__(config)
|
|
16
|
-
|
|
17
|
-
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
18
|
-
return pl.scan_parquet(self.config.path) if self.config.lazy else pl.read_parquet(self.config.path)
|
|
19
|
-
|
|
20
|
-
def to(self, ds: "Dataset[Any]") -> None:
|
|
21
|
-
ds.polars.write_parquet(self.config.path)
|
|
22
|
-
|
|
23
|
-
def target_exists(self, target: str) -> bool:
|
|
24
|
-
return Path(target).exists()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/__init__.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/preprocess.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/token_counter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/scores.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_dataset_base_contracts.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_dataset_mixins_more.py
RENAMED
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_polars_lazy_paths.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_preprocess_filters.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_token_counter_hf.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_async_batcher.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_bm25_retriever.py
RENAMED
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_dense_retriever.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hybrid_retriever.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_retriever_base.py
RENAMED
|
File without changes
|
|
File without changes
|
{retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_vector_stores.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|