retrievalbase 2.2.0__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/CHANGELOG.md +7 -0
  2. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/PKG-INFO +1 -1
  3. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/pyproject.toml +1 -1
  4. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/connector/__init__.py +24 -0
  5. retrievalbase-2.3.0/src/retrievalbase/connector/minio.py +96 -0
  6. retrievalbase-2.3.0/src/retrievalbase/connector/parquet.py +43 -0
  7. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/exceptions.py +22 -0
  8. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/fixtures/components.py +3 -0
  9. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_connector/test_connectors.py +123 -0
  10. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_base_contracts.py +5 -0
  11. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/uv.lock +1 -1
  12. retrievalbase-2.2.0/src/retrievalbase/connector/minio.py +0 -55
  13. retrievalbase-2.2.0/src/retrievalbase/connector/parquet.py +0 -24
  14. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.gitignore +0 -0
  15. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.gitlab-ci.yml +0 -0
  16. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.pre-commit-config.yaml +0 -0
  17. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/.releaserc.json +0 -0
  18. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/AGENTS.md +0 -0
  19. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/Makefile +0 -0
  20. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/README.md +0 -0
  21. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/codecov.yml +0 -0
  22. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/commitlint.config.cjs +0 -0
  23. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/__init__.py +0 -0
  24. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/connector/settings.py +0 -0
  25. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/constants.py +0 -0
  26. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/__init__.py +0 -0
  27. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/hf.py +0 -0
  28. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/mixins.py +0 -0
  29. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/polars.py +0 -0
  30. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/__init__.py +0 -0
  31. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/preprocess.py +0 -0
  32. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/preprocess/token_counter.py +0 -0
  33. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/dataset/settings.py +0 -0
  34. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/enums.py +0 -0
  35. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/__init__.py +0 -0
  36. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/async_batcher.py +0 -0
  37. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/embedders.py +0 -0
  38. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/__init__.py +0 -0
  39. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/__init__.py +0 -0
  40. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/evaluators.py +0 -0
  41. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/evaluators/python/scores.py +0 -0
  42. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/processors.py +0 -0
  43. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/rerankers.py +0 -0
  44. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/__init__.py +0 -0
  45. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/dense/__init__.py +0 -0
  46. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +0 -0
  47. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/settings.py +0 -0
  48. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/evaluation/vector_stores.py +0 -0
  49. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/ingestion/__init__.py +0 -0
  50. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/ingestion/settings.py +0 -0
  51. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/mixins.py +0 -0
  52. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/py.typed +0 -0
  53. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/settings.py +0 -0
  54. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/types.py +0 -0
  55. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/src/retrievalbase/utils.py +0 -0
  56. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/__init__.py +0 -0
  57. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/conftest.py +0 -0
  58. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/fixtures/__init__.py +0 -0
  59. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/fixtures/data.py +0 -0
  60. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/__init__.py +0 -0
  61. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_dataset/__init__.py +0 -0
  62. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_dataset/test_huggingface_adapter.py +0 -0
  63. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_evaluation/__init__.py +0 -0
  64. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_evaluation/conftest.py +0 -0
  65. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/integration/test_evaluation/test_python_evaluator.py +0 -0
  66. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/__init__.py +0 -0
  67. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_config/__init__.py +0 -0
  68. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_config/test_mixins.py +0 -0
  69. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_config/test_settings.py +0 -0
  70. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_connector/__init__.py +0 -0
  71. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/__init__.py +0 -0
  72. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/conftest.py +0 -0
  73. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_dataset_base_contracts.py +0 -0
  74. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_dataset_mixins_more.py +0 -0
  75. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_polars_dataset.py +0 -0
  76. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_polars_lazy_paths.py +0 -0
  77. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_preprocess_filters.py +0 -0
  78. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_token_counter_hf.py +0 -0
  79. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_dataset/test_token_counters.py +0 -0
  80. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/__init__.py +0 -0
  81. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/conftest.py +0 -0
  82. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_async_batcher.py +0 -0
  83. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_bm25_retriever.py +0 -0
  84. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_dense_retriever.py +0 -0
  85. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_embedders.py +0 -0
  86. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hf_reranker.py +0 -0
  87. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hybrid_retriever.py +0 -0
  88. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_hybrid_retriever_runtime.py +0 -0
  89. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_processors.py +0 -0
  90. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_python_evaluator_classes.py +0 -0
  91. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_python_evaluator_runtime.py +0 -0
  92. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_rerankers.py +0 -0
  93. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_retriever_base.py +0 -0
  94. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_scores.py +0 -0
  95. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_evaluation/test_vector_stores.py +0 -0
  96. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_ingestion/__init__.py +0 -0
  97. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_ingestion/test_text_ingestion_pipeline.py +0 -0
  98. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_utils/__init__.py +0 -0
  99. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_utils/test_utils.py +0 -0
  100. {retrievalbase-2.2.0 → retrievalbase-2.3.0}/tests/unit/test_utils/test_utils_connectors.py +0 -0
@@ -1,3 +1,10 @@
1
+ # [2.3.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.2.0...v2.3.0) (2026-05-24)
2
+
3
+
4
+ ### Features
5
+
6
+ * add recursive dataset connector loading ([3e7e46f](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/3e7e46f6d8533612cb3e47375985aeb3ebf156ff))
7
+
1
8
  # [2.2.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.3...v2.2.0) (2026-05-22)
2
9
 
3
10
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: retrievalbase
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Author-email: jalal <jalalkhaldi3@gmail.com>
5
5
  Requires-Python: <3.13,>=3.11
6
6
  Requires-Dist: faiss-cpu<2.0.0,>=1.13.2
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "retrievalbase"
3
- version = "2.2.0"
3
+ version = "2.3.0"
4
4
  description = ""
5
5
  authors = [
6
6
  { name = "jalal", email = "jalalkhaldi3@gmail.com" }
@@ -29,6 +29,10 @@ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
29
29
  def _load(self) -> pl.DataFrame | pl.LazyFrame:
30
30
  raise NotImplementedError
31
31
 
32
+ @abstractmethod
33
+ def _load_recursive(self, *paths: str) -> pl.DataFrame | pl.LazyFrame:
34
+ raise NotImplementedError
35
+
32
36
  @abstractmethod
33
37
  def to(self, ds: "Dataset[Any]") -> None:
34
38
  raise NotImplementedError
@@ -57,6 +61,26 @@ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
57
61
 
58
62
  return PolarsTextDataset.from_polars(df)
59
63
 
64
+ def load_recursive(self, *paths: str) -> "Dataset[pl.DataFrame | pl.LazyFrame]":
65
+ from retrievalbase.dataset.polars import PolarsDataset
66
+
67
+ _logger.info(f"Loading recursive dataset | connector={self.__class__.__name__}")
68
+
69
+ df = self._load_recursive(*paths)
70
+ self._log_polars_info(df)
71
+
72
+ return PolarsDataset.from_polars(df)
73
+
74
+ def load_recursive_text(self, *paths: str) -> "TextDataset[pl.DataFrame | pl.LazyFrame]":
75
+ from retrievalbase.dataset.polars import PolarsTextDataset
76
+
77
+ _logger.info(f"Loading recursive text dataset | connector={self.__class__.__name__}")
78
+
79
+ df = self._load_recursive(*paths)
80
+ self._log_polars_info(df)
81
+
82
+ return PolarsTextDataset.from_polars(df)
83
+
60
84
  # ------------------------------------------------------------------
61
85
  # Helpers
62
86
  # ------------------------------------------------------------------
@@ -0,0 +1,96 @@
1
+ import io
2
+ from typing import TYPE_CHECKING, Any
3
+ from urllib.parse import urlparse
4
+
5
+ import polars as pl
6
+ from minio import Minio
7
+ from minio.error import S3Error
8
+
9
+ from retrievalbase.connector import DatasetConnector
10
+ from retrievalbase.connector.settings import MinioDatasetConnectorSettings
11
+ from retrievalbase.exceptions import MinioParquetObjectsNotFoundError
12
+
13
+ if TYPE_CHECKING:
14
+ from retrievalbase.dataset import Dataset
15
+
16
+
17
+ class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
18
+ def __init__(self, config: MinioDatasetConnectorSettings):
19
+ super().__init__(config)
20
+ self.client = Minio(
21
+ self.config.endpoint.replace("http://", "").replace("https://", ""),
22
+ access_key=self.config.access_key.get_secret_value(),
23
+ secret_key=self.config.secret_key.get_secret_value(),
24
+ secure=self.config.endpoint.startswith("https://"),
25
+ )
26
+
27
+ def _load(self) -> pl.DataFrame | pl.LazyFrame:
28
+ return self._read_parquet_object(self.config.bucket, self.config.key)
29
+
30
+ def _load_recursive(self, *paths: str) -> pl.DataFrame:
31
+ dataframes = [
32
+ self._read_parquet_object(bucket, object_name)
33
+ for bucket, object_name in self._iter_parquet_objects(paths or (self.config.key,))
34
+ ]
35
+ if not dataframes:
36
+ raise MinioParquetObjectsNotFoundError(paths or (self.config.key,))
37
+ return pl.concat(dataframes)
38
+
39
+ def _read_parquet_object(self, bucket: str, object_name: str) -> pl.DataFrame:
40
+ response = self.client.get_object(bucket, object_name)
41
+ try:
42
+ buffer = io.BytesIO(response.read())
43
+ finally:
44
+ response.close()
45
+ response.release_conn()
46
+ return pl.read_parquet(buffer)
47
+
48
+ def _iter_parquet_objects(self, paths: tuple[str, ...]) -> list[tuple[str, str]]:
49
+ objects: list[tuple[str, str]] = []
50
+ for path in paths:
51
+ bucket, object_path = self._resolve_path(path)
52
+ if object_path.endswith(".parquet"):
53
+ objects.append((bucket, object_path))
54
+ continue
55
+ prefix = object_path.rstrip("/")
56
+ if prefix:
57
+ prefix = f"{prefix}/"
58
+ objects.extend(
59
+ (bucket, item.object_name)
60
+ for item in self.client.list_objects(bucket, prefix=prefix, recursive=True)
61
+ if item.object_name.endswith(".parquet")
62
+ )
63
+ return sorted(objects)
64
+
65
+ def _resolve_path(self, path: str) -> tuple[str, str]:
66
+ parsed = urlparse(path)
67
+ if parsed.scheme in {"s3", "minio"}:
68
+ return parsed.netloc, parsed.path.lstrip("/")
69
+
70
+ normalized = path.lstrip("/")
71
+ bucket, separator, key = normalized.partition("/")
72
+ if separator and bucket == self.config.bucket:
73
+ return bucket, key
74
+ return self.config.bucket, normalized
75
+
76
+ def to(self, ds: "Dataset[Any]") -> None:
77
+ df = ds.polars
78
+ buffer = io.BytesIO()
79
+ df.write_parquet(buffer)
80
+ buffer.seek(0)
81
+ self.client.put_object(
82
+ bucket_name=self.config.bucket,
83
+ object_name=self.config.key,
84
+ data=buffer,
85
+ length=buffer.getbuffer().nbytes,
86
+ content_type="application/octet-stream",
87
+ )
88
+
89
+ def target_exists(self, target: str) -> bool:
90
+ try:
91
+ self.client.stat_object(self.config.bucket, target)
92
+ except S3Error as error:
93
+ if error.code in {"NoSuchBucket", "NoSuchKey"}:
94
+ return False
95
+ raise
96
+ return True
@@ -0,0 +1,43 @@
1
+ from pathlib import Path
2
+ from typing import TYPE_CHECKING, Any
3
+
4
+ import polars as pl
5
+
6
+ from retrievalbase.connector import DatasetConnector
7
+ from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
8
+ from retrievalbase.exceptions import ParquetFilesNotFoundError
9
+
10
+ if TYPE_CHECKING:
11
+ from retrievalbase.dataset import Dataset
12
+
13
+
14
+ class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings]):
15
+ def __init__(self, config: ParquetDatasetConnectorSettings):
16
+ super().__init__(config)
17
+
18
+ def _load(self) -> pl.DataFrame | pl.LazyFrame:
19
+ return pl.scan_parquet(self.config.path) if self.config.lazy else pl.read_parquet(self.config.path)
20
+
21
+ def _load_recursive(self, *paths: str) -> pl.DataFrame | pl.LazyFrame:
22
+ parquet_paths = self._iter_parquet_paths(paths or (self.config.path,))
23
+ if not parquet_paths:
24
+ raise ParquetFilesNotFoundError(paths or (self.config.path,))
25
+ if self.config.lazy:
26
+ return pl.scan_parquet([str(path) for path in parquet_paths])
27
+ return pl.concat([pl.read_parquet(path) for path in parquet_paths])
28
+
29
+ def _iter_parquet_paths(self, paths: tuple[str, ...]) -> list[Path]:
30
+ parquet_paths: list[Path] = []
31
+ for raw_path in paths:
32
+ path = Path(raw_path)
33
+ if path.suffix == ".parquet":
34
+ parquet_paths.append(path)
35
+ continue
36
+ parquet_paths.extend(path.rglob("*.parquet"))
37
+ return sorted(parquet_paths)
38
+
39
+ def to(self, ds: "Dataset[Any]") -> None:
40
+ ds.polars.write_parquet(self.config.path)
41
+
42
+ def target_exists(self, target: str) -> bool:
43
+ return Path(target).exists()
@@ -37,6 +37,28 @@ class DatasetError(RetrievalBaseError):
37
37
  pass
38
38
 
39
39
 
40
+ class DatasetConnectorError(RetrievalBaseError):
41
+ """Base error for dataset connector failures."""
42
+
43
+
44
+ class MinioParquetObjectsNotFoundError(DatasetConnectorError, FileNotFoundError):
45
+ """Raised when a MinIO path does not contain parquet objects."""
46
+
47
+ def __init__(self, paths: tuple[str, ...]):
48
+ self.paths = paths
49
+ roots = ", ".join(paths)
50
+ super().__init__(f"No parquet objects found in MinIO paths: {roots}")
51
+
52
+
53
+ class ParquetFilesNotFoundError(DatasetConnectorError, FileNotFoundError):
54
+ """Raised when local paths do not contain parquet files."""
55
+
56
+ def __init__(self, paths: tuple[str, ...]):
57
+ self.paths = paths
58
+ roots = ", ".join(paths)
59
+ super().__init__(f"No parquet files found in paths: {roots}")
60
+
61
+
40
62
  class DatasetSplitError(DatasetError, ValueError):
41
63
  """Raised when dataset split parameters are invalid."""
42
64
 
@@ -76,6 +76,9 @@ class FakeDatasetConnector(DatasetConnector[FakeDatasetConnectorSettings]):
76
76
  def _load(self) -> pl.DataFrame:
77
77
  return pl.DataFrame(self.config.rows)
78
78
 
79
+ def _load_recursive(self, *paths: str) -> pl.DataFrame:
80
+ return self._load()
81
+
79
82
  def to(self, ds: Any) -> None:
80
83
  self.__class__.last_written = ds.polars
81
84
 
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import io
4
+ from types import SimpleNamespace
4
5
  from typing import TypedDict, cast
5
6
 
6
7
  import polars as pl
@@ -17,6 +18,7 @@ from retrievalbase.connector.settings import (
17
18
  )
18
19
  from retrievalbase.dataset import Dataset, TextDataset
19
20
  from retrievalbase.dataset.polars import PolarsDataset, PolarsTextDataset
21
+ from retrievalbase.exceptions import MinioParquetObjectsNotFoundError, ParquetFilesNotFoundError
20
22
  from tests.fixtures.data import make_text_dataframe
21
23
 
22
24
 
@@ -96,6 +98,36 @@ def test_parquet_connector_reports_target_existence(tmp_path) -> None:
96
98
  assert connector.target_exists(str(missing_path)) is False
97
99
 
98
100
 
101
+ def test_parquet_connector_loads_recursive_paths(tmp_path) -> None:
102
+ root = tmp_path / "root"
103
+ nested = root / "nested"
104
+ nested.mkdir(parents=True)
105
+ first = make_text_dataframe([{"page_content": "one", "metadata": {"doc_id": "a"}}])
106
+ second = make_text_dataframe([{"page_content": "two", "metadata": {"doc_id": "b"}}])
107
+ ignored = root / "ignored.txt"
108
+ first.write_parquet(root / "first.parquet")
109
+ second.write_parquet(nested / "second.parquet")
110
+ ignored.write_text("ignore me")
111
+
112
+ connector = ParquetDatasetConnector(ParquetDatasetConnectorSettings(module_path="x", path=str(root), lazy=False))
113
+
114
+ loaded = connector.load_recursive_text()
115
+
116
+ assert loaded.polars.to_dict(as_series=False) == {
117
+ "page_content": ["one", "two"],
118
+ "metadata": [{"doc_id": "a"}, {"doc_id": "b"}],
119
+ }
120
+
121
+
122
+ def test_parquet_connector_raises_when_recursive_path_has_no_parquet(tmp_path) -> None:
123
+ empty = tmp_path / "empty"
124
+ empty.mkdir()
125
+ connector = ParquetDatasetConnector(ParquetDatasetConnectorSettings(module_path="x", path=str(empty), lazy=False))
126
+
127
+ with pytest.raises(ParquetFilesNotFoundError, match="No parquet files found"):
128
+ connector.load_recursive()
129
+
130
+
99
131
  def test_minio_connector_reads_and_writes_parquet_payloads(
100
132
  monkeypatch: pytest.MonkeyPatch,
101
133
  ) -> None:
@@ -207,3 +239,94 @@ def test_minio_connector_reports_target_existence(monkeypatch: pytest.MonkeyPatc
207
239
  assert connector.target_exists("sample.parquet") is True
208
240
  assert connector.target_exists("missing.parquet") is False
209
241
  assert calls == [("datasets", "sample.parquet"), ("datasets", "missing.parquet")]
242
+
243
+
244
+ def test_minio_connector_loads_recursive_parquet_prefixes(monkeypatch: pytest.MonkeyPatch) -> None:
245
+ frames = {
246
+ "a/df.parquet": pl.DataFrame({"id": [1], "text": ["one"]}),
247
+ "a/B/df2.parquet": pl.DataFrame({"id": [2], "text": ["two"]}),
248
+ "another_root/df3.parquet": pl.DataFrame({"id": [3], "text": ["three"]}),
249
+ "another_root/ignored.txt": pl.DataFrame({"id": [99], "text": ["ignored"]}),
250
+ }
251
+ calls: dict[str, list[object]] = {"listed": [], "read": []}
252
+
253
+ def parquet_payload(df: pl.DataFrame) -> bytes:
254
+ buffer = io.BytesIO()
255
+ df.write_parquet(buffer)
256
+ return buffer.getvalue()
257
+
258
+ class FakeResponse:
259
+ def __init__(self, payload: bytes) -> None:
260
+ self.payload = payload
261
+
262
+ def read(self) -> bytes:
263
+ return self.payload
264
+
265
+ def close(self) -> None:
266
+ pass
267
+
268
+ def release_conn(self) -> None:
269
+ pass
270
+
271
+ class FakeMinio:
272
+ def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
273
+ pass
274
+
275
+ def list_objects(self, bucket: str, prefix: str, recursive: bool) -> list[SimpleNamespace]:
276
+ calls["listed"].append((bucket, prefix, recursive))
277
+ return [SimpleNamespace(object_name=key) for key in frames if key.startswith(prefix)]
278
+
279
+ def get_object(self, bucket: str, key: str) -> FakeResponse:
280
+ calls["read"].append((bucket, key))
281
+ return FakeResponse(parquet_payload(frames[key]))
282
+
283
+ monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
284
+
285
+ connector = MinioDatasetConnector(
286
+ MinioDatasetConnectorSettings(
287
+ module_path="x",
288
+ endpoint="https://minio.local",
289
+ bucket="datasets",
290
+ key="a",
291
+ access_key=SecretStr("access"),
292
+ secret_key=SecretStr("secret"),
293
+ )
294
+ )
295
+
296
+ loaded = connector.load_recursive("datasets/a", "s3://datasets/another_root")
297
+
298
+ assert calls["listed"] == [("datasets", "a/", True), ("datasets", "another_root/", True)]
299
+ assert calls["read"] == [
300
+ ("datasets", "a/B/df2.parquet"),
301
+ ("datasets", "a/df.parquet"),
302
+ ("datasets", "another_root/df3.parquet"),
303
+ ]
304
+ assert loaded.polars.to_dict(as_series=False) == {
305
+ "id": [2, 1, 3],
306
+ "text": ["two", "one", "three"],
307
+ }
308
+
309
+
310
+ def test_minio_connector_raises_when_prefix_has_no_parquet(monkeypatch: pytest.MonkeyPatch) -> None:
311
+ class FakeMinio:
312
+ def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
313
+ pass
314
+
315
+ def list_objects(self, bucket: str, prefix: str, recursive: bool) -> list[SimpleNamespace]:
316
+ return [SimpleNamespace(object_name="empty/readme.txt")]
317
+
318
+ monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
319
+
320
+ connector = MinioDatasetConnector(
321
+ MinioDatasetConnectorSettings(
322
+ module_path="x",
323
+ endpoint="https://minio.local",
324
+ bucket="datasets",
325
+ key="empty",
326
+ access_key=SecretStr("access"),
327
+ secret_key=SecretStr("secret"),
328
+ )
329
+ )
330
+
331
+ with pytest.raises(MinioParquetObjectsNotFoundError, match="No parquet objects found"):
332
+ connector.load_recursive()
@@ -30,6 +30,9 @@ class RaisingConnector(DatasetConnector[DatasetConnectorSettings]):
30
30
  def _load(self) -> pl.DataFrame | pl.LazyFrame:
31
31
  return super()._load()
32
32
 
33
+ def _load_recursive(self, *paths: str) -> pl.DataFrame | pl.LazyFrame:
34
+ return super()._load_recursive(*paths)
35
+
33
36
  def to(self, ds) -> None:
34
37
  super().to(ds)
35
38
 
@@ -95,6 +98,8 @@ def test_base_contracts_raise_not_implemented_and_noops() -> None:
95
98
 
96
99
  with pytest.raises(NotImplementedError):
97
100
  connector._load()
101
+ with pytest.raises(NotImplementedError):
102
+ connector._load_recursive("x")
98
103
  with pytest.raises(NotImplementedError):
99
104
  connector.to(PolarsTextDataset.from_records([("x", {"doc_id": "1"})]))
100
105
  with pytest.raises(NotImplementedError):
@@ -2584,7 +2584,7 @@ wheels = [
2584
2584
 
2585
2585
  [[package]]
2586
2586
  name = "retrievalbase"
2587
- version = "2.2.0"
2587
+ version = "2.3.0"
2588
2588
  source = { editable = "." }
2589
2589
  dependencies = [
2590
2590
  { name = "faiss-cpu" },
@@ -1,55 +0,0 @@
1
- import io
2
- from typing import TYPE_CHECKING, Any
3
-
4
- import polars as pl
5
- from minio import Minio
6
- from minio.error import S3Error
7
-
8
- from retrievalbase.connector import DatasetConnector
9
- from retrievalbase.connector.settings import MinioDatasetConnectorSettings
10
-
11
- if TYPE_CHECKING:
12
- from retrievalbase.dataset import Dataset
13
-
14
-
15
- class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
16
- def __init__(self, config: MinioDatasetConnectorSettings):
17
- super().__init__(config)
18
- self.client = Minio(
19
- self.config.endpoint.replace("http://", "").replace("https://", ""),
20
- access_key=self.config.access_key.get_secret_value(),
21
- secret_key=self.config.secret_key.get_secret_value(),
22
- secure=self.config.endpoint.startswith("https://"),
23
- )
24
-
25
- def _load(self) -> pl.DataFrame | pl.LazyFrame:
26
- response = self.client.get_object(self.config.bucket, self.config.key)
27
- try:
28
- buffer = io.BytesIO(response.read())
29
- finally:
30
- response.close()
31
- response.release_conn()
32
- df = pl.read_parquet(buffer)
33
- return df
34
-
35
- def to(self, ds: "Dataset[Any]") -> None:
36
- df = ds.polars
37
- buffer = io.BytesIO()
38
- df.write_parquet(buffer)
39
- buffer.seek(0)
40
- self.client.put_object(
41
- bucket_name=self.config.bucket,
42
- object_name=self.config.key,
43
- data=buffer,
44
- length=buffer.getbuffer().nbytes,
45
- content_type="application/octet-stream",
46
- )
47
-
48
- def target_exists(self, target: str) -> bool:
49
- try:
50
- self.client.stat_object(self.config.bucket, target)
51
- except S3Error as error:
52
- if error.code in {"NoSuchBucket", "NoSuchKey"}:
53
- return False
54
- raise
55
- return True
@@ -1,24 +0,0 @@
1
- from pathlib import Path
2
- from typing import TYPE_CHECKING, Any
3
-
4
- import polars as pl
5
-
6
- from retrievalbase.connector import DatasetConnector
7
- from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
8
-
9
- if TYPE_CHECKING:
10
- from retrievalbase.dataset import Dataset
11
-
12
-
13
- class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings]):
14
- def __init__(self, config: ParquetDatasetConnectorSettings):
15
- super().__init__(config)
16
-
17
- def _load(self) -> pl.DataFrame | pl.LazyFrame:
18
- return pl.scan_parquet(self.config.path) if self.config.lazy else pl.read_parquet(self.config.path)
19
-
20
- def to(self, ds: "Dataset[Any]") -> None:
21
- ds.polars.write_parquet(self.config.path)
22
-
23
- def target_exists(self, target: str) -> bool:
24
- return Path(target).exists()
File without changes
File without changes
File without changes
File without changes
File without changes