retrievalbase 2.3.0__tar.gz → 2.3.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/CHANGELOG.md +7 -0
  2. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/PKG-INFO +1 -1
  3. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/pyproject.toml +1 -1
  4. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/connector/minio.py +1 -1
  5. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/connector/parquet.py +1 -1
  6. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_connector/test_connectors.py +75 -0
  7. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/uv.lock +1 -1
  8. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/.gitignore +0 -0
  9. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/.gitlab-ci.yml +0 -0
  10. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/.pre-commit-config.yaml +0 -0
  11. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/.releaserc.json +0 -0
  12. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/AGENTS.md +0 -0
  13. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/Makefile +0 -0
  14. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/README.md +0 -0
  15. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/codecov.yml +0 -0
  16. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/commitlint.config.cjs +0 -0
  17. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/__init__.py +0 -0
  18. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/connector/__init__.py +0 -0
  19. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/connector/settings.py +0 -0
  20. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/constants.py +0 -0
  21. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/__init__.py +0 -0
  22. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/hf.py +0 -0
  23. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/mixins.py +0 -0
  24. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/polars.py +0 -0
  25. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/preprocess/__init__.py +0 -0
  26. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/preprocess/preprocess.py +0 -0
  27. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/preprocess/token_counter.py +0 -0
  28. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/dataset/settings.py +0 -0
  29. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/enums.py +0 -0
  30. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/__init__.py +0 -0
  31. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/async_batcher.py +0 -0
  32. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/embedders.py +0 -0
  33. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/evaluators/__init__.py +0 -0
  34. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/evaluators/python/__init__.py +0 -0
  35. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/evaluators/python/evaluators.py +0 -0
  36. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/evaluators/python/scores.py +0 -0
  37. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/processors.py +0 -0
  38. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/rerankers.py +0 -0
  39. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/retrievers/__init__.py +0 -0
  40. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/retrievers/dense/__init__.py +0 -0
  41. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +0 -0
  42. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/settings.py +0 -0
  43. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/evaluation/vector_stores.py +0 -0
  44. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/exceptions.py +0 -0
  45. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/ingestion/__init__.py +0 -0
  46. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/ingestion/settings.py +0 -0
  47. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/mixins.py +0 -0
  48. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/py.typed +0 -0
  49. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/settings.py +0 -0
  50. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/types.py +0 -0
  51. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/src/retrievalbase/utils.py +0 -0
  52. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/__init__.py +0 -0
  53. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/conftest.py +0 -0
  54. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/fixtures/__init__.py +0 -0
  55. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/fixtures/components.py +0 -0
  56. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/fixtures/data.py +0 -0
  57. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/integration/__init__.py +0 -0
  58. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/integration/test_dataset/__init__.py +0 -0
  59. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/integration/test_dataset/test_huggingface_adapter.py +0 -0
  60. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/integration/test_evaluation/__init__.py +0 -0
  61. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/integration/test_evaluation/conftest.py +0 -0
  62. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/integration/test_evaluation/test_python_evaluator.py +0 -0
  63. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/__init__.py +0 -0
  64. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_config/__init__.py +0 -0
  65. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_config/test_mixins.py +0 -0
  66. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_config/test_settings.py +0 -0
  67. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_connector/__init__.py +0 -0
  68. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/__init__.py +0 -0
  69. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/conftest.py +0 -0
  70. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_dataset_base_contracts.py +0 -0
  71. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_dataset_mixins_more.py +0 -0
  72. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_polars_dataset.py +0 -0
  73. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_polars_lazy_paths.py +0 -0
  74. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_preprocess_filters.py +0 -0
  75. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_token_counter_hf.py +0 -0
  76. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_dataset/test_token_counters.py +0 -0
  77. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/__init__.py +0 -0
  78. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/conftest.py +0 -0
  79. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_async_batcher.py +0 -0
  80. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_base_contracts.py +0 -0
  81. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_bm25_retriever.py +0 -0
  82. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_dense_retriever.py +0 -0
  83. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_embedders.py +0 -0
  84. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_hf_reranker.py +0 -0
  85. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_hybrid_retriever.py +0 -0
  86. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_hybrid_retriever_runtime.py +0 -0
  87. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_processors.py +0 -0
  88. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_python_evaluator_classes.py +0 -0
  89. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_python_evaluator_runtime.py +0 -0
  90. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_rerankers.py +0 -0
  91. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_retriever_base.py +0 -0
  92. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_scores.py +0 -0
  93. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_evaluation/test_vector_stores.py +0 -0
  94. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_ingestion/__init__.py +0 -0
  95. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_ingestion/test_text_ingestion_pipeline.py +0 -0
  96. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_utils/__init__.py +0 -0
  97. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_utils/test_utils.py +0 -0
  98. {retrievalbase-2.3.0 → retrievalbase-2.3.1}/tests/unit/test_utils/test_utils_connectors.py +0 -0
@@ -1,3 +1,10 @@
1
+ ## [2.3.1](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.3.0...v2.3.1) (2026-05-24)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * add diagonal relaxed dataframe concat ([cbc9d66](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/cbc9d66985389cc6b28fce916acc699af6df4aa7))
7
+
1
8
  # [2.3.0](https://gitlab.com/efysent/agentic-core/retrievalbase/compare/v2.2.0...v2.3.0) (2026-05-24)
2
9
 
3
10
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: retrievalbase
3
- Version: 2.3.0
3
+ Version: 2.3.1
4
4
  Author-email: jalal <jalalkhaldi3@gmail.com>
5
5
  Requires-Python: <3.13,>=3.11
6
6
  Requires-Dist: faiss-cpu<2.0.0,>=1.13.2
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "retrievalbase"
3
- version = "2.3.0"
3
+ version = "2.3.1"
4
4
  description = ""
5
5
  authors = [
6
6
  { name = "jalal", email = "jalalkhaldi3@gmail.com" }
@@ -34,7 +34,7 @@ class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
34
34
  ]
35
35
  if not dataframes:
36
36
  raise MinioParquetObjectsNotFoundError(paths or (self.config.key,))
37
- return pl.concat(dataframes)
37
+ return pl.concat(dataframes, how="diagonal_relaxed")
38
38
 
39
39
  def _read_parquet_object(self, bucket: str, object_name: str) -> pl.DataFrame:
40
40
  response = self.client.get_object(bucket, object_name)
@@ -24,7 +24,7 @@ class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings])
24
24
  raise ParquetFilesNotFoundError(paths or (self.config.path,))
25
25
  if self.config.lazy:
26
26
  return pl.scan_parquet([str(path) for path in parquet_paths])
27
- return pl.concat([pl.read_parquet(path) for path in parquet_paths])
27
+ return pl.concat([pl.read_parquet(path) for path in parquet_paths], how="diagonal_relaxed")
28
28
 
29
29
  def _iter_parquet_paths(self, paths: tuple[str, ...]) -> list[Path]:
30
30
  parquet_paths: list[Path] = []
@@ -119,6 +119,24 @@ def test_parquet_connector_loads_recursive_paths(tmp_path) -> None:
119
119
  }
120
120
 
121
121
 
122
+ def test_parquet_connector_loads_recursive_paths_with_relaxed_schema(tmp_path) -> None:
123
+ root = tmp_path / "root"
124
+ root.mkdir()
125
+ first = pl.DataFrame({"page_content": ["one"], "metadata": pl.Series([None], dtype=pl.Null)})
126
+ second = pl.DataFrame({"page_content": ["two"], "metadata": ["source"]})
127
+ first.write_parquet(root / "first.parquet")
128
+ second.write_parquet(root / "second.parquet")
129
+
130
+ connector = ParquetDatasetConnector(ParquetDatasetConnectorSettings(module_path="x", path=str(root), lazy=False))
131
+
132
+ loaded = connector.load_recursive_text()
133
+
134
+ assert loaded.polars.to_dict(as_series=False) == {
135
+ "page_content": ["one", "two"],
136
+ "metadata": [None, "source"],
137
+ }
138
+
139
+
122
140
  def test_parquet_connector_raises_when_recursive_path_has_no_parquet(tmp_path) -> None:
123
141
  empty = tmp_path / "empty"
124
142
  empty.mkdir()
@@ -307,6 +325,63 @@ def test_minio_connector_loads_recursive_parquet_prefixes(monkeypatch: pytest.Mo
307
325
  }
308
326
 
309
327
 
328
+ def test_minio_connector_loads_recursive_parquet_prefixes_with_relaxed_schema(
329
+ monkeypatch: pytest.MonkeyPatch,
330
+ ) -> None:
331
+ frames = {
332
+ "a/first.parquet": pl.DataFrame({"page_content": ["one"], "metadata": pl.Series([None], dtype=pl.Null)}),
333
+ "a/second.parquet": pl.DataFrame({"page_content": ["two"], "metadata": ["source"]}),
334
+ }
335
+
336
+ def parquet_payload(df: pl.DataFrame) -> bytes:
337
+ buffer = io.BytesIO()
338
+ df.write_parquet(buffer)
339
+ return buffer.getvalue()
340
+
341
+ class FakeResponse:
342
+ def __init__(self, payload: bytes) -> None:
343
+ self.payload = payload
344
+
345
+ def read(self) -> bytes:
346
+ return self.payload
347
+
348
+ def close(self) -> None:
349
+ pass
350
+
351
+ def release_conn(self) -> None:
352
+ pass
353
+
354
+ class FakeMinio:
355
+ def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
356
+ pass
357
+
358
+ def list_objects(self, bucket: str, prefix: str, recursive: bool) -> list[SimpleNamespace]:
359
+ return [SimpleNamespace(object_name=key) for key in frames if key.startswith(prefix)]
360
+
361
+ def get_object(self, bucket: str, key: str) -> FakeResponse:
362
+ return FakeResponse(parquet_payload(frames[key]))
363
+
364
+ monkeypatch.setattr("retrievalbase.connector.minio.Minio", FakeMinio)
365
+
366
+ connector = MinioDatasetConnector(
367
+ MinioDatasetConnectorSettings(
368
+ module_path="x",
369
+ endpoint="https://minio.local",
370
+ bucket="datasets",
371
+ key="a",
372
+ access_key=SecretStr("access"),
373
+ secret_key=SecretStr("secret"),
374
+ )
375
+ )
376
+
377
+ loaded = connector.load_recursive_text()
378
+
379
+ assert loaded.polars.to_dict(as_series=False) == {
380
+ "page_content": ["one", "two"],
381
+ "metadata": [None, "source"],
382
+ }
383
+
384
+
310
385
  def test_minio_connector_raises_when_prefix_has_no_parquet(monkeypatch: pytest.MonkeyPatch) -> None:
311
386
  class FakeMinio:
312
387
  def __init__(self, endpoint: str, access_key: str, secret_key: str, secure: bool) -> None:
@@ -2584,7 +2584,7 @@ wheels = [
2584
2584
 
2585
2585
  [[package]]
2586
2586
  name = "retrievalbase"
2587
- version = "2.3.0"
2587
+ version = "2.3.1"
2588
2588
  source = { editable = "." }
2589
2589
  dependencies = [
2590
2590
  { name = "faiss-cpu" },
File without changes
File without changes
File without changes
File without changes
File without changes