retrievalbase 2.1.2__tar.gz → 2.2.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/CHANGELOG.md +14 -0
  2. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/PKG-INFO +1 -1
  3. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/pyproject.toml +1 -1
  4. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/__init__.py +4 -0
  5. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/minio.py +10 -0
  6. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/parquet.py +4 -0
  7. retrievalbase-2.2.0/src/retrievalbase/utils.py +107 -0
  8. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/fixtures/components.py +3 -0
  9. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_connector/test_connectors.py +47 -0
  10. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_base_contracts.py +5 -0
  11. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_utils/test_utils.py +1 -2
  12. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/uv.lock +1 -1
  13. retrievalbase-2.1.2/src/retrievalbase/utils.py +0 -182
  14. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.gitignore +0 -0
  15. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.gitlab-ci.yml +0 -0
  16. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.pre-commit-config.yaml +0 -0
  17. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/.releaserc.json +0 -0
  18. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/AGENTS.md +0 -0
  19. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/Makefile +0 -0
  20. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/README.md +0 -0
  21. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/codecov.yml +0 -0
  22. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/commitlint.config.cjs +0 -0
  23. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/__init__.py +0 -0
  24. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/connector/settings.py +0 -0
  25. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/constants.py +0 -0
  26. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/__init__.py +0 -0
  27. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/hf.py +0 -0
  28. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/mixins.py +0 -0
  29. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/polars.py +0 -0
  30. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/__init__.py +0 -0
  31. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/preprocess.py +0 -0
  32. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/preprocess/token_counter.py +0 -0
  33. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/dataset/settings.py +0 -0
  34. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/enums.py +0 -0
  35. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/__init__.py +0 -0
  36. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/async_batcher.py +0 -0
  37. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/embedders.py +0 -0
  38. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/__init__.py +0 -0
  39. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/__init__.py +0 -0
  40. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/evaluators.py +0 -0
  41. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/evaluators/python/scores.py +0 -0
  42. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/processors.py +0 -0
  43. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/rerankers.py +0 -0
  44. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/__init__.py +0 -0
  45. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/dense/__init__.py +0 -0
  46. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +0 -0
  47. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/settings.py +0 -0
  48. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/evaluation/vector_stores.py +0 -0
  49. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/exceptions.py +0 -0
  50. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/ingestion/__init__.py +0 -0
  51. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/ingestion/settings.py +0 -0
  52. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/mixins.py +0 -0
  53. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/py.typed +0 -0
  54. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/settings.py +0 -0
  55. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/src/retrievalbase/types.py +0 -0
  56. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/__init__.py +0 -0
  57. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/conftest.py +0 -0
  58. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/fixtures/__init__.py +0 -0
  59. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/fixtures/data.py +0 -0
  60. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/__init__.py +0 -0
  61. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_dataset/__init__.py +0 -0
  62. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_dataset/test_huggingface_adapter.py +0 -0
  63. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_evaluation/__init__.py +0 -0
  64. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_evaluation/conftest.py +0 -0
  65. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/integration/test_evaluation/test_python_evaluator.py +0 -0
  66. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/__init__.py +0 -0
  67. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_config/__init__.py +0 -0
  68. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_config/test_mixins.py +0 -0
  69. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_config/test_settings.py +0 -0
  70. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_connector/__init__.py +0 -0
  71. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/__init__.py +0 -0
  72. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/conftest.py +0 -0
  73. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_dataset_base_contracts.py +0 -0
  74. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_dataset_mixins_more.py +0 -0
  75. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_polars_dataset.py +0 -0
  76. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_polars_lazy_paths.py +0 -0
  77. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_preprocess_filters.py +0 -0
  78. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_token_counter_hf.py +0 -0
  79. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_dataset/test_token_counters.py +0 -0
  80. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/__init__.py +0 -0
  81. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/conftest.py +0 -0
  82. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_async_batcher.py +0 -0
  83. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_bm25_retriever.py +0 -0
  84. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_dense_retriever.py +0 -0
  85. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_embedders.py +0 -0
  86. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hf_reranker.py +0 -0
  87. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hybrid_retriever.py +0 -0
  88. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_hybrid_retriever_runtime.py +0 -0
  89. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_processors.py +0 -0
  90. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_python_evaluator_classes.py +0 -0
  91. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_python_evaluator_runtime.py +0 -0
  92. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_rerankers.py +0 -0
  93. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_retriever_base.py +0 -0
  94. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_scores.py +0 -0
  95. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_evaluation/test_vector_stores.py +0 -0
  96. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_ingestion/__init__.py +0 -0
  97. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_ingestion/test_text_ingestion_pipeline.py +0 -0
  98. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_utils/__init__.py +0 -0
  99. {retrievalbase-2.1.2 → retrievalbase-2.2.0}/tests/unit/test_utils/test_utils_connectors.py +0 -0
@@ -1,3 +1,17 @@
1
+ # [2.2.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.3...v2.2.0) (2026-05-22)
2
+
3
+
4
+ ### Features
5
+
6
+ * **connector:** add target existence checks ([80e739f](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/80e739fddeca65a66d326c1bb673e1ff6a674ba6))
7
+
8
+ ## [2.1.3](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.2...v2.1.3) (2026-05-19)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * resolve env vars loading errors ([061d549](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/061d549dea621dc2ab34b4af7f23c090f6d70350))
14
+
1
15
  ## [2.1.2](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.1.1...v2.1.2) (2026-05-19)
2
16
 
3
17
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: retrievalbase
3
- Version: 2.1.2
3
+ Version: 2.2.0
4
4
  Author-email: jalal <jalalkhaldi3@gmail.com>
5
5
  Requires-Python: <3.13,>=3.11
6
6
  Requires-Dist: faiss-cpu<2.0.0,>=1.13.2
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "retrievalbase"
3
- version = "2.1.2"
3
+ version = "2.2.0"
4
4
  description = ""
5
5
  authors = [
6
6
  { name = "jalal", email = "jalalkhaldi3@gmail.com" }
@@ -33,6 +33,10 @@ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
33
33
  def to(self, ds: "Dataset[Any]") -> None:
34
34
  raise NotImplementedError
35
35
 
36
+ @abstractmethod
37
+ def target_exists(self, target: str) -> bool:
38
+ raise NotImplementedError
39
+
36
40
  def load(self) -> "Dataset[pl.DataFrame | pl.LazyFrame]":
37
41
  from retrievalbase.dataset.polars import PolarsDataset
38
42
 
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any
3
3
 
4
4
  import polars as pl
5
5
  from minio import Minio
6
+ from minio.error import S3Error
6
7
 
7
8
  from retrievalbase.connector import DatasetConnector
8
9
  from retrievalbase.connector.settings import MinioDatasetConnectorSettings
@@ -43,3 +44,12 @@ class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
43
44
  length=buffer.getbuffer().nbytes,
44
45
  content_type="application/octet-stream",
45
46
  )
47
+
48
+ def target_exists(self, target: str) -> bool:
49
+ try:
50
+ self.client.stat_object(self.config.bucket, target)
51
+ except S3Error as error:
52
+ if error.code in {"NoSuchBucket", "NoSuchKey"}:
53
+ return False
54
+ raise
55
+ return True
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from typing import TYPE_CHECKING, Any
2
3
 
3
4
  import polars as pl
@@ -18,3 +19,6 @@ class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings])
18
19
 
19
20
  def to(self, ds: "Dataset[Any]") -> None:
20
21
  ds.polars.write_parquet(self.config.path)
22
+
23
+ def target_exists(self, target: str) -> bool:
24
+ return Path(target).exists()
@@ -0,0 +1,107 @@
1
+ import importlib
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ import polars as pl
6
+ import yaml
7
+ from pydantic import SecretStr
8
+
9
+ from retrievalbase.mixins import FromConfigMixin
10
+
11
+ if TYPE_CHECKING:
12
+ from retrievalbase.connector.minio import MinioDatasetConnector
13
+ from retrievalbase.connector.parquet import ParquetDatasetConnector
14
+
15
+
16
+ def load_class(path: str) -> Any:
17
+ module_path, class_name = path.rsplit(".", 1)
18
+ module = importlib.import_module(module_path)
19
+ return getattr(module, class_name)
20
+
21
+
22
+ def _get_minio_connector(
23
+ bucket: str, key: str, endpoint: str, access_key: str, secret_key: str
24
+ ) -> "MinioDatasetConnector":
25
+ from retrievalbase.connector.minio import MinioDatasetConnector
26
+ from retrievalbase.connector.settings import MinioDatasetConnectorSettings
27
+
28
+ config = MinioDatasetConnectorSettings(
29
+ module_path="",
30
+ endpoint=endpoint,
31
+ bucket=bucket,
32
+ key=key,
33
+ access_key=SecretStr(access_key),
34
+ secret_key=SecretStr(secret_key),
35
+ )
36
+ return MinioDatasetConnector(config)
37
+
38
+
39
+ def _get_parquet_connector(path: str, *, lazy: bool) -> "ParquetDatasetConnector":
40
+ from retrievalbase.connector.parquet import ParquetDatasetConnector
41
+ from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
42
+
43
+ config = ParquetDatasetConnectorSettings(module_path="", path=path, lazy=lazy)
44
+ return ParquetDatasetConnector(config)
45
+
46
+
47
+ def extract_schema_columns(
48
+ schema: dict[str, pl.DataType] | list[pl.Field],
49
+ prefix: str = "",
50
+ ) -> list[str]:
51
+ cols: list[str] = []
52
+ if isinstance(schema, dict):
53
+ items = schema.items()
54
+ else:
55
+ items = ((field.name, field.dtype) for field in schema) # type: ignore[assignment]
56
+ for name, dtype in items:
57
+ full_name = f"{prefix}.{name}" if prefix else name
58
+
59
+ if isinstance(dtype, pl.Struct):
60
+ nested = extract_schema_columns(dtype.fields, full_name)
61
+ cols.extend(nested)
62
+ else:
63
+ cols.append(full_name)
64
+ return cols
65
+
66
+
67
+ def resolve_column(path: str) -> pl.Expr:
68
+ parts = path.split(".")
69
+
70
+ expr = pl.col(parts[0])
71
+ for p in parts[1:]:
72
+ expr = expr.struct.field(p)
73
+
74
+ return expr
75
+
76
+
77
+ def build_schema(
78
+ df: pl.DataFrame,
79
+ schema: dict[str, str],
80
+ ) -> pl.DataFrame:
81
+ exprs = []
82
+
83
+ for target_col, source_path in schema.items():
84
+ expr = resolve_column(source_path).alias(target_col)
85
+ exprs.append(expr)
86
+
87
+ return df.select(exprs)
88
+
89
+
90
+ def comp(
91
+ path: str,
92
+ key: str | None = None,
93
+ ) -> FromConfigMixin[Any]:
94
+ """
95
+ Generic factory:
96
+ - loads YAML
97
+ - resolves module_path
98
+ - instantiates component via FromConfigMixin
99
+ """
100
+ yaml_path = Path(path)
101
+ with yaml_path.open("r") as f:
102
+ raw: dict[str, Any] = yaml.safe_load(f)
103
+ if key is not None:
104
+ raw = raw[key]
105
+ module_path = raw["module_path"]
106
+ cls: type[FromConfigMixin[Any]] = load_class(module_path)
107
+ return cls.from_kwargs(**raw)
@@ -79,6 +79,9 @@ class FakeDatasetConnector(DatasetConnector[FakeDatasetConnectorSettings]):
79
79
  def to(self, ds: Any) -> None:
80
80
  self.__class__.last_written = ds.polars
81
81
 
82
+ def target_exists(self, target: str) -> bool:
83
+ return target == "exists"
84
+
82
85
 
83
86
  class FakeTextPreprocessorSettings(TextPreprocessorSettings):
84
87
  kind: str = "suffix"
@@ -5,7 +5,9 @@ from typing import TypedDict, cast
5
5
 
6
6
  import polars as pl
7
7
  import pytest
8
+ from minio.error import S3Error
8
9
  from pydantic import SecretStr
10
+ from urllib3.response import BaseHTTPResponse
9
11
 
10
12
  from retrievalbase.connector.minio import MinioDatasetConnector
11
13
  from retrievalbase.connector.parquet import ParquetDatasetConnector
@@ -82,6 +84,18 @@ def test_parquet_connector_round_trips_dataframe(tmp_path, lazy: bool) -> None:
82
84
  assert loaded.polars.to_dict(as_series=False) == sample_text_df.to_dict(as_series=False)
83
85
 
84
86
 
87
+ def test_parquet_connector_reports_target_existence(tmp_path) -> None:
88
+ existing_path = tmp_path / "dataset.parquet"
89
+ missing_path = tmp_path / "missing.parquet"
90
+ existing_path.write_bytes(b"not a real parquet file")
91
+ connector = ParquetDatasetConnector(
92
+ ParquetDatasetConnectorSettings(module_path="x", path=str(existing_path), lazy=False)
93
+ )
94
+
95
+ assert connector.target_exists(str(existing_path)) is True
96
+ assert connector.target_exists(str(missing_path)) is False
97
+
98
+
85
99
  def test_minio_connector_reads_and_writes_parquet_payloads(
86
100
  monkeypatch: pytest.MonkeyPatch,
87
101
  ) -> None:
@@ -160,3 +174,36 @@ def test_minio_connector_reads_and_writes_parquet_payloads(
160
174
  assert loaded.polars.to_dict(as_series=False) == sample_text_df.to_dict(as_series=False)
161
175
  assert writes["closed"] is True
162
176
  assert writes["released"] is True
177
+
178
+
179
+ def test_minio_connector_reports_target_existence(monkeypatch: pytest.MonkeyPatch) -> None:
180
+ calls: list[tuple[str, str]] = []
181
+
182
+ class FakeMinio:
183
+ def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
184
+ pass
185
+
186
+ def stat_object(self, bucket: str, key: str) -> object:
187
+ calls.append((bucket, key))
188
+ if key == "missing.parquet":
189
+ raise S3Error(
190
+ cast(BaseHTTPResponse, None), "NoSuchKey", "not found", key, "request-id", "host-id", bucket, key
191
+ )
192
+ return object()
193
+
194
+ monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
195
+
196
+ connector = MinioDatasetConnector(
197
+ MinioDatasetConnectorSettings(
198
+ module_path="x",
199
+ endpoint="https://minio.local",
200
+ bucket="datasets",
201
+ key="sample.parquet",
202
+ access_key=SecretStr("access"),
203
+ secret_key=SecretStr("secret"),
204
+ )
205
+ )
206
+
207
+ assert connector.target_exists("sample.parquet") is True
208
+ assert connector.target_exists("missing.parquet") is False
209
+ assert calls == [("datasets", "sample.parquet"), ("datasets", "missing.parquet")]
@@ -33,6 +33,9 @@ class RaisingConnector(DatasetConnector[DatasetConnectorSettings]):
33
33
  def to(self, ds) -> None:
34
34
  super().to(ds)
35
35
 
36
+ def target_exists(self, target: str) -> bool:
37
+ return super().target_exists(target)
38
+
36
39
 
37
40
  class RaisingPreprocessor(TextPreprocessor[TextPreprocessorSettings]):
38
41
  def apply(self, ds):
@@ -94,6 +97,8 @@ def test_base_contracts_raise_not_implemented_and_noops() -> None:
94
97
  connector._load()
95
98
  with pytest.raises(NotImplementedError):
96
99
  connector.to(PolarsTextDataset.from_records([("x", {"doc_id": "1"})]))
100
+ with pytest.raises(NotImplementedError):
101
+ connector.target_exists("x")
97
102
  with pytest.raises(NotImplementedError):
98
103
  preprocessor.apply(PolarsTextDataset.from_records([("x", {"doc_id": "1"})]))
99
104
  with pytest.raises(NotImplementedError):
@@ -25,7 +25,7 @@ def test_load_class_and_comp_create_runtime_from_module_path(tmp_path) -> None:
25
25
  assert instance.config.value == 17
26
26
 
27
27
 
28
- def test_comp_resolves_nested_settings_from_environment(tmp_path, monkeypatch) -> None:
28
+ def test_comp_loads_missing_nested_settings_fields_from_environment(tmp_path, monkeypatch) -> None:
29
29
  monkeypatch.setenv("FAKE_CHILD_TOKEN", "token-from-env")
30
30
  config_path = tmp_path / "component.yaml"
31
31
  config_path.write_text(
@@ -35,7 +35,6 @@ def test_comp_resolves_nested_settings_from_environment(tmp_path, monkeypatch) -
35
35
  f" module_path: {FAKE_PARENT_RUNTIME_PATH}",
36
36
  " child:",
37
37
  f" module_path: {FAKE_ENV_CHILD_RUNTIME_PATH}",
38
- " token: ${FAKE_CHILD_TOKEN}",
39
38
  "",
40
39
  ]
41
40
  ),
@@ -2584,7 +2584,7 @@ wheels = [
2584
2584
 
2585
2585
  [[package]]
2586
2586
  name = "retrievalbase"
2587
- version = "2.1.2"
2587
+ version = "2.2.0"
2588
2588
  source = { editable = "." }
2589
2589
  dependencies = [
2590
2590
  { name = "faiss-cpu" },
@@ -1,182 +0,0 @@
1
- import importlib
2
- from pathlib import Path
3
- from types import UnionType
4
- from typing import TYPE_CHECKING, Any, get_args, get_origin
5
-
6
- import polars as pl
7
- import yaml
8
- from pydantic import SecretStr
9
-
10
- from retrievalbase.mixins import FromConfigMixin
11
- from retrievalbase.settings import FromConfigMixinSettings
12
-
13
- if TYPE_CHECKING:
14
- from retrievalbase.connector.minio import MinioDatasetConnector
15
- from retrievalbase.connector.parquet import ParquetDatasetConnector
16
-
17
-
18
- def load_class(path: str) -> Any:
19
- module_path, class_name = path.rsplit(".", 1)
20
- module = importlib.import_module(module_path)
21
- return getattr(module, class_name)
22
-
23
-
24
- def _get_minio_connector(
25
- bucket: str, key: str, endpoint: str, access_key: str, secret_key: str
26
- ) -> "MinioDatasetConnector":
27
- from retrievalbase.connector.minio import MinioDatasetConnector
28
- from retrievalbase.connector.settings import MinioDatasetConnectorSettings
29
-
30
- config = MinioDatasetConnectorSettings(
31
- module_path="",
32
- endpoint=endpoint,
33
- bucket=bucket,
34
- key=key,
35
- access_key=SecretStr(access_key),
36
- secret_key=SecretStr(secret_key),
37
- )
38
- return MinioDatasetConnector(config)
39
-
40
-
41
- def _get_parquet_connector(path: str, *, lazy: bool) -> "ParquetDatasetConnector":
42
- from retrievalbase.connector.parquet import ParquetDatasetConnector
43
- from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
44
-
45
- config = ParquetDatasetConnectorSettings(module_path="", path=path, lazy=lazy)
46
- return ParquetDatasetConnector(config)
47
-
48
-
49
- def extract_schema_columns(
50
- schema: dict[str, pl.DataType] | list[pl.Field],
51
- prefix: str = "",
52
- ) -> list[str]:
53
- cols: list[str] = []
54
- if isinstance(schema, dict):
55
- items = schema.items()
56
- else:
57
- items = ((field.name, field.dtype) for field in schema) # type: ignore[assignment]
58
- for name, dtype in items:
59
- full_name = f"{prefix}.{name}" if prefix else name
60
-
61
- if isinstance(dtype, pl.Struct):
62
- nested = extract_schema_columns(dtype.fields, full_name)
63
- cols.extend(nested)
64
- else:
65
- cols.append(full_name)
66
- return cols
67
-
68
-
69
- def resolve_column(path: str) -> pl.Expr:
70
- parts = path.split(".")
71
-
72
- expr = pl.col(parts[0])
73
- for p in parts[1:]:
74
- expr = expr.struct.field(p)
75
-
76
- return expr
77
-
78
-
79
- def build_schema(
80
- df: pl.DataFrame,
81
- schema: dict[str, str],
82
- ) -> pl.DataFrame:
83
- exprs = []
84
-
85
- for target_col, source_path in schema.items():
86
- expr = resolve_column(source_path).alias(target_col)
87
- exprs.append(expr)
88
-
89
- return df.select(exprs)
90
-
91
-
92
- def _is_env_placeholder(value: Any) -> bool:
93
- return isinstance(value, str) and value.startswith("${") and value.endswith("}")
94
-
95
-
96
- def _remove_env_placeholders(raw: dict[str, Any]) -> dict[str, Any]:
97
- return {key: value for key, value in raw.items() if not _is_env_placeholder(value)}
98
-
99
-
100
- def _is_settings_class(value: Any) -> bool:
101
- return isinstance(value, type) and issubclass(value, FromConfigMixinSettings)
102
-
103
-
104
- def _settings_classes_from_annotation(annotation: Any) -> list[type[FromConfigMixinSettings]]:
105
- origin = get_origin(annotation)
106
- if _is_settings_class(annotation):
107
- return [annotation]
108
- if _is_settings_class(origin):
109
- return [origin]
110
- if origin in (UnionType,):
111
- return [settings_cls for arg in get_args(annotation) for settings_cls in _settings_classes_from_annotation(arg)]
112
- return [settings_cls for arg in get_args(annotation) for settings_cls in _settings_classes_from_annotation(arg)]
113
-
114
-
115
- def _settings_class_from_module_path(module_path: str) -> type[FromConfigMixinSettings] | None:
116
- try:
117
- cls = load_class(module_path)
118
- except (ImportError, AttributeError, ValueError):
119
- return None
120
- if isinstance(cls, type) and issubclass(cls, FromConfigMixin):
121
- return cls.get_config_class()
122
- return None
123
-
124
-
125
- def _settings_class_for_field(annotation: Any, value: dict[str, Any]) -> type[FromConfigMixinSettings] | None:
126
- module_path = value.get("module_path")
127
- if isinstance(module_path, str):
128
- settings_cls = _settings_class_from_module_path(module_path)
129
- if settings_cls is not None:
130
- return settings_cls
131
-
132
- settings_classes = _settings_classes_from_annotation(annotation)
133
- if len(settings_classes) == 1:
134
- return settings_classes[0]
135
- return None
136
-
137
-
138
- def _resolve_settings_value(annotation: Any, value: Any) -> Any:
139
- if isinstance(value, list):
140
- return [_resolve_settings_value(Any, item) for item in value]
141
- if not isinstance(value, dict):
142
- return value
143
-
144
- settings_cls = _settings_class_for_field(annotation, value)
145
- if settings_cls is None:
146
- return {
147
- key: _resolve_settings_value(Any, nested_value)
148
- for key, nested_value in _remove_env_placeholders(value).items()
149
- }
150
- return _build_settings(settings_cls, value)
151
-
152
-
153
- def _build_settings(
154
- settings_cls: type[FromConfigMixinSettings],
155
- raw: dict[str, Any],
156
- ) -> FromConfigMixinSettings:
157
- resolved = _remove_env_placeholders(raw)
158
- for field_name, field_info in settings_cls.model_fields.items():
159
- if field_name in resolved:
160
- resolved[field_name] = _resolve_settings_value(field_info.annotation, resolved[field_name])
161
- return settings_cls(**resolved)
162
-
163
-
164
- def comp(
165
- path: str,
166
- key: str | None = None,
167
- ) -> FromConfigMixin[Any]:
168
- """
169
- Generic factory:
170
- - loads YAML
171
- - resolves module_path
172
- - instantiates component via FromConfigMixin
173
- """
174
- yaml_path = Path(path)
175
- with yaml_path.open("r") as f:
176
- raw: dict[str, Any] = yaml.safe_load(f)
177
- if key is not None:
178
- raw = raw[key]
179
- module_path = raw["module_path"]
180
- cls: type[FromConfigMixin[Any]] = load_class(module_path)
181
- settings = _build_settings(cls.get_config_class(), raw)
182
- return cls.from_config(settings)
File without changes
File without changes
File without changes
File without changes
File without changes