retrievalbase 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. retrievalbase-1.0.0/.gitignore +38 -0
  2. retrievalbase-1.0.0/.gitlab-ci.yml +110 -0
  3. retrievalbase-1.0.0/.pre-commit-config.yaml +48 -0
  4. retrievalbase-1.0.0/.releaserc.json +28 -0
  5. retrievalbase-1.0.0/CHANGELOG.md +11 -0
  6. retrievalbase-1.0.0/Makefile +87 -0
  7. retrievalbase-1.0.0/PKG-INFO +23 -0
  8. retrievalbase-1.0.0/README.md +0 -0
  9. retrievalbase-1.0.0/codecov.yml +30 -0
  10. retrievalbase-1.0.0/commitlint.config.cjs +22 -0
  11. retrievalbase-1.0.0/pyproject.toml +105 -0
  12. retrievalbase-1.0.0/src/retrievalbase/__init__.py +0 -0
  13. retrievalbase-1.0.0/src/retrievalbase/connector/__init__.py +69 -0
  14. retrievalbase-1.0.0/src/retrievalbase/connector/minio.py +45 -0
  15. retrievalbase-1.0.0/src/retrievalbase/connector/parquet.py +20 -0
  16. retrievalbase-1.0.0/src/retrievalbase/connector/settings.py +22 -0
  17. retrievalbase-1.0.0/src/retrievalbase/constants.py +1 -0
  18. retrievalbase-1.0.0/src/retrievalbase/dataset/__init__.py +146 -0
  19. retrievalbase-1.0.0/src/retrievalbase/dataset/hf.py +49 -0
  20. retrievalbase-1.0.0/src/retrievalbase/dataset/mixins.py +108 -0
  21. retrievalbase-1.0.0/src/retrievalbase/dataset/polars.py +43 -0
  22. retrievalbase-1.0.0/src/retrievalbase/dataset/preprocess/__init__.py +29 -0
  23. retrievalbase-1.0.0/src/retrievalbase/dataset/preprocess/preprocess.py +96 -0
  24. retrievalbase-1.0.0/src/retrievalbase/dataset/preprocess/token_counter.py +41 -0
  25. retrievalbase-1.0.0/src/retrievalbase/dataset/settings.py +63 -0
  26. retrievalbase-1.0.0/src/retrievalbase/enums.py +11 -0
  27. retrievalbase-1.0.0/src/retrievalbase/evaluation/__init__.py +179 -0
  28. retrievalbase-1.0.0/src/retrievalbase/evaluation/async_batcher.py +79 -0
  29. retrievalbase-1.0.0/src/retrievalbase/evaluation/embedders.py +28 -0
  30. retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/__init__.py +37 -0
  31. retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/python/__init__.py +149 -0
  32. retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/python/evaluators.py +71 -0
  33. retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/python/scores.py +118 -0
  34. retrievalbase-1.0.0/src/retrievalbase/evaluation/processors.py +15 -0
  35. retrievalbase-1.0.0/src/retrievalbase/evaluation/rerankers.py +182 -0
  36. retrievalbase-1.0.0/src/retrievalbase/evaluation/retrievers/__init__.py +112 -0
  37. retrievalbase-1.0.0/src/retrievalbase/evaluation/retrievers/dense/__init__.py +56 -0
  38. retrievalbase-1.0.0/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +86 -0
  39. retrievalbase-1.0.0/src/retrievalbase/evaluation/settings.py +204 -0
  40. retrievalbase-1.0.0/src/retrievalbase/evaluation/vector_stores.py +146 -0
  41. retrievalbase-1.0.0/src/retrievalbase/exceptions.py +61 -0
  42. retrievalbase-1.0.0/src/retrievalbase/ingestion/__init__.py +50 -0
  43. retrievalbase-1.0.0/src/retrievalbase/ingestion/settings.py +10 -0
  44. retrievalbase-1.0.0/src/retrievalbase/mixins.py +85 -0
  45. retrievalbase-1.0.0/src/retrievalbase/py.typed +0 -0
  46. retrievalbase-1.0.0/src/retrievalbase/settings.py +33 -0
  47. retrievalbase-1.0.0/src/retrievalbase/types.py +55 -0
  48. retrievalbase-1.0.0/src/retrievalbase/utils.py +107 -0
  49. retrievalbase-1.0.0/tests/__init__.py +1 -0
  50. retrievalbase-1.0.0/tests/conftest.py +15 -0
  51. retrievalbase-1.0.0/tests/fixtures/__init__.py +1 -0
  52. retrievalbase-1.0.0/tests/fixtures/dataframes.py +35 -0
  53. retrievalbase-1.0.0/tests/fixtures/fakes.py +124 -0
  54. retrievalbase-1.0.0/tests/integration/__init__.py +1 -0
  55. retrievalbase-1.0.0/tests/integration/dataset/__init__.py +1 -0
  56. retrievalbase-1.0.0/tests/integration/dataset/test_huggingface_adapter.py +1 -0
  57. retrievalbase-1.0.0/tests/integration/evaluation/__init__.py +1 -0
  58. retrievalbase-1.0.0/tests/integration/evaluation/test_python_evaluator.py +50 -0
  59. retrievalbase-1.0.0/tests/unit/__init__.py +1 -0
  60. retrievalbase-1.0.0/tests/unit/config/__init__.py +1 -0
  61. retrievalbase-1.0.0/tests/unit/config/test_mixins.py +63 -0
  62. retrievalbase-1.0.0/tests/unit/config/test_settings.py +48 -0
  63. retrievalbase-1.0.0/tests/unit/dataset/__init__.py +1 -0
  64. retrievalbase-1.0.0/tests/unit/dataset/test_connectors.py +160 -0
  65. retrievalbase-1.0.0/tests/unit/dataset/test_polars_dataset.py +48 -0
  66. retrievalbase-1.0.0/tests/unit/evaluation/__init__.py +1 -0
  67. retrievalbase-1.0.0/tests/unit/evaluation/test_async_batcher.py +45 -0
  68. retrievalbase-1.0.0/tests/unit/evaluation/test_embedders.py +66 -0
  69. retrievalbase-1.0.0/tests/unit/evaluation/test_processors.py +12 -0
  70. retrievalbase-1.0.0/tests/unit/evaluation/test_rerankers.py +95 -0
  71. retrievalbase-1.0.0/tests/unit/evaluation/test_scores.py +41 -0
  72. retrievalbase-1.0.0/tests/unit/evaluation/test_vector_stores.py +71 -0
  73. retrievalbase-1.0.0/tests/unit/ingestion/__init__.py +1 -0
  74. retrievalbase-1.0.0/tests/unit/ingestion/test_text_ingestion_pipeline.py +47 -0
  75. retrievalbase-1.0.0/tests/unit/preprocess/__init__.py +1 -0
  76. retrievalbase-1.0.0/tests/unit/preprocess/test_filters.py +127 -0
  77. retrievalbase-1.0.0/tests/unit/preprocess/test_token_counters.py +18 -0
  78. retrievalbase-1.0.0/tests/unit/retrievers/__init__.py +1 -0
  79. retrievalbase-1.0.0/tests/unit/retrievers/test_bm25_retriever.py +48 -0
  80. retrievalbase-1.0.0/tests/unit/retrievers/test_dense_retriever.py +52 -0
  81. retrievalbase-1.0.0/tests/unit/retrievers/test_hybrid_retriever.py +33 -0
  82. retrievalbase-1.0.0/tests/unit/retrievers/test_retriever_base.py +40 -0
  83. retrievalbase-1.0.0/tests/unit/utils/__init__.py +1 -0
  84. retrievalbase-1.0.0/tests/unit/utils/test_utils.py +45 -0
  85. retrievalbase-1.0.0/uv.lock +3521 -0
@@ -0,0 +1,38 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # env files
13
+ .env
14
+
15
+ # Pycharm
16
+ .idea/
17
+
18
+ # VSCode
19
+ .vscode/
20
+
21
+ # Testing
22
+ .coverage
23
+ coverage.xml
24
+ htmlcov/
25
+ .pytest_cache/
26
+ .mypy_cache/
27
+
28
+ # Ruff
29
+ .ruff_cache/
30
+
31
+ # Pre-commit
32
+ .pre-commit-cache/
33
+
34
+ # MacOS
35
+ .DS_Store
36
+
37
+ # Claude
38
+ .claude/
@@ -0,0 +1,110 @@
1
+ image: python:3.12
2
+
3
+ workflow:
4
+ rules:
5
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "dev"'
6
+ - if: '$CI_COMMIT_BRANCH == "dev"'
7
+ - when: never
8
+
9
+ stages:
10
+ - checks
11
+ - test
12
+ - deploy
13
+
14
+ variables:
15
+ UV_SYSTEM_PYTHON: "1"
16
+ PIP_DISABLE_PIP_VERSION_CHECK: "1"
17
+ GIT_DEPTH: 0
18
+
19
+
20
+ cache:
21
+ key:
22
+ files:
23
+ - uv.lock
24
+ paths:
25
+ - .venv/
26
+ - ~/.cache/uv
27
+
28
+
29
+ before_script:
30
+ - pip install uv
31
+ - uv sync --group dev --all-extras
32
+
33
+ ruff:
34
+ stage: checks
35
+ script:
36
+ - uv run ruff check .
37
+ - uv run ruff format --check .
38
+
39
+ ty:
40
+ stage: checks
41
+ script:
42
+ - uv run ty check .
43
+
44
+ bandit:
45
+ stage: checks
46
+ script:
47
+ - uv run bandit -c pyproject.toml -r .
48
+
49
+ pytest:
50
+ stage: test
51
+ script:
52
+ - |
53
+ if [ "$CI_PIPELINE_SOURCE" = "merge_request_event" ]; then
54
+ uv run pytest --cov=src --cov-report=xml --cov-report=term --cov-fail-under=80
55
+ else
56
+ uv run pytest
57
+ fi
58
+ artifacts:
59
+ paths:
60
+ - coverage.xml
61
+
62
+ check-package:
63
+ stage: test
64
+ needs: ["pytest"]
65
+ script:
66
+ - uv build
67
+ - pip install dist/*.whl
68
+
69
+ codecov:
70
+ stage: test
71
+ needs: ["pytest"]
72
+ script:
73
+ - curl -Os https://cli.codecov.io/latest/linux/codecov
74
+ - chmod +x codecov
75
+ - ./codecov upload-process -t $CODECOV_TOKEN
76
+ rules:
77
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
78
+
79
+ release:
80
+ stage: deploy
81
+ image: node:22
82
+ needs: ["check-package"]
83
+ before_script:
84
+ - apt-get update && apt-get install -y python3 python3-pip
85
+ - pip install --break-system-packages uv
86
+ rules:
87
+ - if: '$CI_COMMIT_BRANCH == "dev"'
88
+ script:
89
+ - npm install -g semantic-release @semantic-release/gitlab @semantic-release/changelog @semantic-release/git @semantic-release/exec
90
+ - semantic-release
91
+
92
+ conventional-commits:
93
+ stage: checks
94
+ image: node:20
95
+ rules:
96
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
97
+ before_script: []
98
+ script:
99
+ - npm install -g @commitlint/cli @commitlint/config-conventional
100
+ - echo "$CI_MERGE_REQUEST_TITLE" > pr_title.txt
101
+ - |
102
+ npx commitlint --config commitlint.config.cjs --edit pr_title.txt || {
103
+ echo "❌ PR title must follow Conventional Commits format (e.g. feat:, fix:)"
104
+ exit 1
105
+ }
106
+
107
+ default:
108
+ interruptible: true
109
+ retry: 1
110
+ timeout: 10m
@@ -0,0 +1,48 @@
1
+ repos:
2
+ # Ruff for linting and formatting
3
+ - repo: https://github.com/astral-sh/ruff-pre-commit
4
+ rev: v0.13.3
5
+ hooks:
6
+ # Run the linter
7
+ - id: ruff
8
+ args: [--fix]
9
+ # Run the formatter
10
+ - id: ruff-format
11
+
12
+ # Type checking with ty (disabled in pre-commit, enabled in CI)
13
+ # - repo: https://github.com/astral-sh/ty-pre-commit
14
+ # rev: v0.0.17
15
+ # hooks:
16
+ # - id: ty
17
+ # exclude: ^(tests/|alembic/)
18
+
19
+ # Conventional commits enforcement
20
+ - repo: https://github.com/compilerla/conventional-pre-commit
21
+ rev: v3.6.0
22
+ hooks:
23
+ - id: conventional-pre-commit
24
+ stages: [commit-msg]
25
+
26
+ # General file checks
27
+ - repo: https://github.com/pre-commit/pre-commit-hooks
28
+ rev: v5.0.0
29
+ hooks:
30
+ - id: trailing-whitespace
31
+ - id: end-of-file-fixer
32
+ - id: check-yaml
33
+ - id: check-added-large-files
34
+ args: [--maxkb=1000]
35
+ - id: check-json
36
+ - id: check-toml
37
+ - id: detect-private-key
38
+ - id: mixed-line-ending
39
+ - id: check-merge-conflict
40
+
41
+ # Security checks with bandit
42
+ - repo: https://github.com/PyCQA/bandit
43
+ rev: 1.8.0
44
+ hooks:
45
+ - id: bandit
46
+ args: [-c, pyproject.toml]
47
+ additional_dependencies: ["bandit[toml]"]
48
+ exclude: ^tests/
@@ -0,0 +1,28 @@
1
+ {
2
+ "branches": [
3
+ "dev"
4
+ ],
5
+ "plugins": [
6
+ "@semantic-release/commit-analyzer",
7
+ "@semantic-release/release-notes-generator",
8
+ "@semantic-release/changelog",
9
+ [
10
+ "@semantic-release/exec",
11
+ {
12
+ "prepareCmd": "uv version ${nextRelease.version}",
13
+ "publishCmd": "uv build && uv publish --token $PYPI_TOKEN"
14
+ }
15
+ ],
16
+ [
17
+ "@semantic-release/git",
18
+ {
19
+ "assets": [
20
+ "CHANGELOG.md",
21
+ "pyproject.toml"
22
+ ],
23
+ "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
24
+ }
25
+ ],
26
+ "@semantic-release/gitlab"
27
+ ]
28
+ }
@@ -0,0 +1,11 @@
1
+ # 1.0.0 (2026-04-19)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * fix deps in ci ([dc69d67](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/dc69d67cc33794c6cd477c7391576db3cf317800))
7
+
8
+
9
+ ### Features
10
+
11
+ * first commit, setup ci ([47b9e29](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/47b9e29b747b07eef2689a6c332c7107d84abc2d))
@@ -0,0 +1,87 @@
1
+ .PHONY: help install dev-install setup-hooks \
2
+ format lint type-check security \
3
+ test test-cov coverage-diff \
4
+ ci ci-fast clean run
5
+
6
+ # ------------------------
7
+ # 📚 Help
8
+ # ------------------------
9
+ help:
10
+ @echo "Available commands:"
11
+ @echo " make install - Install production dependencies"
12
+ @echo " make dev-install - Install all dependencies + hooks"
13
+ @echo " make setup-hooks - Install pre-commit hooks"
14
+ @echo " make format - Format code (ruff)"
15
+ @echo " make lint - Lint code (ruff)"
16
+ @echo " make type-check - Run type checking (ty)"
17
+ @echo " make security - Run security checks (bandit)"
18
+ @echo " make test - Run tests"
19
+ @echo " make test-cov - Run tests with coverage"
20
+ @echo " make ci - Full CI locally"
21
+ @echo " make ci-fast - CI without slow steps"
22
+ @echo " make clean - Clean cache files"
23
+
24
+ # ------------------------
25
+ # 📦 Install
26
+ # ------------------------
27
+ install:
28
+ uv sync --no-dev
29
+
30
+ dev-install:
31
+ uv sync --group dev --all-extras
32
+ uv run pre-commit install
33
+ uv run pre-commit install --hook-type commit-msg
34
+ @echo "✅ Dev environment ready"
35
+
36
+ setup-hooks:
37
+ uv run pre-commit install
38
+ uv run pre-commit install --hook-type commit-msg
39
+
40
+ # ------------------------
41
+ # 🧹 Code quality
42
+ # ------------------------
43
+ format:
44
+ uv run ruff format .
45
+ uv run ruff check --fix .
46
+
47
+ lint:
48
+ uv run ruff check .
49
+
50
+ type-check:
51
+ uv run ty check .
52
+
53
+ security:
54
+ uv run bandit -c pyproject.toml -r .
55
+
56
+ # ------------------------
57
+ # 🧪 Tests
58
+ # ------------------------
59
+ test:
60
+ uv run pytest
61
+
62
+ test-cov:
63
+ uv run pytest \
64
+ --cov=src \
65
+ --cov-report=term \
66
+ --cov-report=xml:coverage.xml \
67
+ --cov-fail-under=80
68
+
69
+ # ------------------------
70
+ # 🚀 CI equivalent
71
+ # ------------------------
72
+ ci: lint type-check security test-cov
73
+ @echo ""
74
+ @echo "✅ Full CI checks passed"
75
+
76
+ # Faster variant: skips the bandit security scan and the coverage threshold
77
+ ci-fast: lint type-check test
78
+ @echo ""
79
+ @echo "⚡ Fast CI checks passed"
80
+
81
+ # ------------------------
82
+ # 🧹 Cleanup
83
+ # ------------------------
84
+ clean:
85
+ find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
86
+ find . -type f -name "*.pyc" -delete 2>/dev/null || true
87
+ rm -rf .pytest_cache .ruff_cache .ty_cache htmlcov coverage.xml 2>/dev/null || true
@@ -0,0 +1,23 @@
1
+ Metadata-Version: 2.4
2
+ Name: retrievalbase
3
+ Version: 1.0.0
4
+ Author-email: jalal <jalalkhaldi3@gmail.com>
5
+ Requires-Python: <3.13,>=3.11
6
+ Requires-Dist: faiss-cpu<2.0.0,>=1.13.2
7
+ Requires-Dist: langchain<2.0.0,>=1.2.10
8
+ Requires-Dist: minio<8.0.0,>=7.2.20
9
+ Requires-Dist: numpy<3.0.0,>=2.4.2
10
+ Requires-Dist: openai<3.0.0,>=2.21.0
11
+ Requires-Dist: polars<2.0.0,>=1.38.1
12
+ Requires-Dist: pydantic-settings<3.0.0,>=2.13.0
13
+ Requires-Dist: qdrant-client<2.0.0,>=1.16.2
14
+ Requires-Dist: rank-bm25<0.3.0,>=0.2.2
15
+ Provides-Extra: torch
16
+ Requires-Dist: datasets<5.0.0,>=4.5.0; extra == 'torch'
17
+ Requires-Dist: sentence-transformers<6.0.0,>=5.1.2; extra == 'torch'
18
+ Requires-Dist: torch<3.0.0,>=2.10.0; extra == 'torch'
19
+ Requires-Dist: transformers<6.0.0,>=5.3.0; extra == 'torch'
20
+ Provides-Extra: transformers
21
+ Requires-Dist: datasets<5.0.0,>=4.5.0; extra == 'transformers'
22
+ Requires-Dist: sentence-transformers<6.0.0,>=5.1.2; extra == 'transformers'
23
+ Requires-Dist: transformers<6.0.0,>=5.3.0; extra == 'transformers'
File without changes
@@ -0,0 +1,30 @@
1
+ coverage:
2
+ # Controls how the coverage badge is colored
3
+ range: 50..80
4
+ round: down
5
+ precision: 2
6
+
7
+ status:
8
+ project:
9
+ default:
10
+ target: auto
11
+ threshold: 2%
12
+ patch:
13
+ default:
14
+ target: 80%
15
+
16
+ comment:
17
+ # Compact PR comment
18
+ layout: "condensed_header, condensed_files, condensed_footer"
19
+
20
+ # Always comment, even if coverage didn’t change
21
+ require_changes: false
22
+
23
+ # Don’t block comment if baseline is missing
24
+ require_base: false
25
+
26
+ # Ensure we have coverage on the PR commit
27
+ require_head: true
28
+
29
+ # Hide global coverage in PR (focus on patch)
30
+ hide_project_coverage: true
@@ -0,0 +1,22 @@
1
+ module.exports = {
2
+ extends: ['@commitlint/config-conventional'],
3
+ rules: {
4
+ 'type-enum': [
5
+ 2,
6
+ 'always',
7
+ [
8
+ 'feat',
9
+ 'fix',
10
+ 'docs',
11
+ 'style',
12
+ 'refactor',
13
+ 'perf',
14
+ 'test',
15
+ 'chore',
16
+ 'ci',
17
+ 'build',
18
+ ],
19
+ ],
20
+ 'subject-empty': [2, 'never'],
21
+ },
22
+ };
@@ -0,0 +1,105 @@
1
+ [project]
2
+ name = "retrievalbase"
3
+ version = "1.0.0"
4
+ description = ""
5
+ authors = [
6
+ { name = "jalal", email = "jalalkhaldi3@gmail.com" }
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.11,<3.13"
10
+
11
+ dependencies = [
12
+ "polars>=1.38.1,<2.0.0",
13
+ "langchain>=1.2.10,<2.0.0",
14
+ "numpy>=2.4.2,<3.0.0",
15
+ "rank-bm25>=0.2.2,<0.3.0",
16
+ "openai>=2.21.0,<3.0.0",
17
+ "qdrant-client>=1.16.2,<2.0.0",
18
+ "pydantic-settings>=2.13.0,<3.0.0",
19
+ "faiss-cpu>=1.13.2,<2.0.0",
20
+ "minio>=7.2.20,<8.0.0",
21
+ ]
22
+
23
+ [build-system]
24
+ requires = ["hatchling"]
25
+ build-backend = "hatchling.build"
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["src/retrievalbase"]
29
+
30
+ [tool.ruff]
31
+ line-length = 120
32
+ target-version = "py312"
33
+
34
+ [tool.ruff.lint]
35
+ select = [
36
+ "E", # pycodestyle errors
37
+ "W", # pycodestyle warnings
38
+ "F", # pyflakes
39
+ "I", # isort
40
+ "B", # flake8-bugbear
41
+ "C4", # flake8-comprehensions
42
+ "UP", # pyupgrade
43
+ "ARG", # flake8-unused-arguments
44
+ "SIM", # flake8-simplify
45
+ ]
46
+ ignore = [
47
+ "E501", # line too long (handled by formatter)
48
+ "B008", # do not perform function calls in argument defaults
49
+ "B904", # Within an except clause, raise exceptions with `raise ... from err`
50
+ "ARG001", # Unused function argument
51
+ "ARG002", # Unused method argument
52
+ "ARG005", # Unused lambda argument
53
+ "SIM108", # Use ternary operator (less readable in many cases)
54
+ ]
55
+
56
+ [tool.ruff.lint.isort]
57
+ known-first-party = ["retrievalbase"]
58
+
59
+ [tool.pytest.ini_options]
60
+ asyncio_mode = "auto"
61
+ testpaths = ["tests"]
62
+ markers = [
63
+ ]
64
+
65
+ [tool.ty.environment]
66
+ python-version = "3.12"
67
+
68
+ [tool.bandit]
69
+ exclude_dirs = ["tests", ".venv"]
70
+ skips = [
71
+ "B101", # assert statements
72
+ "B404", # import subprocess (needed for CLI)
73
+ "B603", # subprocess without shell=True (needed for CLI)
74
+ "B607", # partial executable path (needed for CLI to run docker, systemctl, etc.)
75
+ ]
76
+
77
+ [dependency-groups]
78
+ dev = [
79
+ "pytest>=9.0.2",
80
+ "pytest-asyncio>=1.3.0",
81
+ "pytest-mock>=3.15.1",
82
+ "pytest-cov>=6.0.0",
83
+ "ruff>=0.13.0",
84
+ "ty>=0.0.17,<0.1.0",
85
+ "bandit>=1.8.0",
86
+ "types-pyyaml>=6.0.12.20250915,<7.0.0.0",
87
+ "types-tqdm>=4.67.3.20260205,<5.0.0.0",
88
+ "jupyter>=1.1.1,<2.0.0",
89
+ "transformers>=5.2.0,<6.0.0",
90
+ "torch>=2.10.0,<3.0.0",
91
+ ]
92
+
93
+ [project.optional-dependencies]
94
+ torch = [
95
+ "transformers>=5.3.0,<6.0.0",
96
+ "datasets>=4.5.0,<5.0.0",
97
+ "sentence-transformers>=5.1.2,<6.0.0",
98
+ "torch>=2.10.0,<3.0.0",
99
+
100
+ ]
101
+ transformers = [
102
+ "transformers>=5.3.0,<6.0.0",
103
+ "datasets>=4.5.0,<5.0.0",
104
+ "sentence-transformers>=5.1.2,<6.0.0",
105
+ ]
File without changes
@@ -0,0 +1,69 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ import polars as pl
6
+
7
+ from retrievalbase.connector.settings import DatasetConnectorSettings
8
+ from retrievalbase.mixins import FromConfigMixin
9
+ from retrievalbase.types import TCDatasetConnector as TCDatasetConnector
10
+
11
+ if TYPE_CHECKING:
12
+ from retrievalbase.dataset import Dataset, TextDataset
13
+
14
+ _logger = logging.getLogger(__name__)
15
+
16
+
17
+ class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
18
+ FromConfigMixin[TCDatasetConnector],
19
+ ABC,
20
+ ):
21
+ def __init__(self, config: TCDatasetConnector):
22
+ super().__init__(config)
23
+
24
+ _logger.info(
25
+ f"Initializing dataset connector | class={self.__class__.__name__} | module={self.__class__.__module__}"
26
+ )
27
+
28
+ @abstractmethod
29
+ def _load(self) -> pl.DataFrame | pl.LazyFrame:
30
+ """
31
+ Load raw data as Polars DataFrame or LazyFrame.
32
+ """
33
+ raise NotImplementedError()
34
+
35
+ @abstractmethod
36
+ def to(self, ds: "Dataset[Any]") -> None:
37
+ raise NotImplementedError()
38
+
39
+ def load(self) -> "Dataset[pl.DataFrame | pl.LazyFrame]":
40
+ from retrievalbase.dataset.polars import PolarsDataset
41
+
42
+ _logger.info(f"Loading dataset | connector={self.__class__.__name__}")
43
+
44
+ df = self._load()
45
+ self._log_polars_info(df)
46
+
47
+ return PolarsDataset.from_polars(df)
48
+
49
+ def load_text(self) -> "TextDataset[pl.DataFrame | pl.LazyFrame]":
50
+ from retrievalbase.dataset.polars import PolarsTextDataset
51
+
52
+ _logger.info(f"Loading text dataset | connector={self.__class__.__name__}")
53
+
54
+ df = self._load()
55
+ self._log_polars_info(df)
56
+
57
+ return PolarsTextDataset.from_polars(df)
58
+
59
+ # ------------------------------------------------------------------
60
+ # Helpers
61
+ # ------------------------------------------------------------------
62
+
63
+ def _log_polars_info(self, df: pl.DataFrame | pl.LazyFrame) -> None:
64
+ """
65
+ Log dataset structure without forcing materialization.
66
+ """
67
+ if isinstance(df, pl.DataFrame):
68
+ schema = df.schema
69
+ _logger.info(f"Loaded DataFrame | columns={len(schema)} | schema={list(schema.items())}")
@@ -0,0 +1,45 @@
1
+ import io
2
+ from typing import TYPE_CHECKING, Any
3
+
4
+ import polars as pl
5
+ from minio import Minio
6
+
7
+ from retrievalbase.connector import DatasetConnector
8
+ from retrievalbase.connector.settings import MinioDatasetConnectorSettings
9
+
10
+ if TYPE_CHECKING:
11
+ from retrievalbase.dataset import Dataset
12
+
13
+
14
class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
    """Connector that reads and writes one Parquet object in a MinIO bucket.

    The bucket and object key come from the settings; ``_load`` downloads and
    parses the object, ``to`` serializes a dataset and uploads it to the same
    bucket/key.
    """

    def __init__(self, config: MinioDatasetConnectorSettings):
        """Create the MinIO client from the configured endpoint and credentials."""
        super().__init__(config)

        endpoint = self.config.endpoint
        # TLS is selected through ``secure``; the client itself is given the
        # endpoint without its scheme. Only a leading scheme is removed, so a
        # scheme-like substring later in the endpoint is left untouched.
        secure = endpoint.startswith("https://")
        host = endpoint.removeprefix("https://").removeprefix("http://")

        self.client = Minio(
            host,
            access_key=self.config.access_key.get_secret_value(),
            secret_key=self.config.secret_key.get_secret_value(),
            secure=secure,
        )

    def _load(self) -> pl.DataFrame | pl.LazyFrame:
        """Download the configured object and parse its bytes as Parquet."""
        response = self.client.get_object(self.config.bucket, self.config.key)
        try:
            payload = response.read()
        finally:
            # Release the HTTP response/connection even when the read fails.
            response.close()
            response.release_conn()
        return pl.read_parquet(io.BytesIO(payload))

    def to(self, ds: "Dataset[Any]") -> None:
        """Serialize ``ds.polars`` to Parquet in memory and upload it to the configured bucket/key."""
        # NOTE(review): assumes ds.polars is an eager DataFrame exposing
        # write_parquet — confirm against the Dataset implementation.
        df = ds.polars
        buffer = io.BytesIO()
        df.write_parquet(buffer)
        # Rewind so the upload starts at the first byte of the serialized data.
        buffer.seek(0)
        self.client.put_object(
            bucket_name=self.config.bucket,
            object_name=self.config.key,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream",
        )
@@ -0,0 +1,20 @@
1
+ from typing import TYPE_CHECKING, Any
2
+
3
+ import polars as pl
4
+
5
+ from retrievalbase.connector import DatasetConnector
6
+ from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
7
+
8
+ if TYPE_CHECKING:
9
+ from retrievalbase.dataset import Dataset
10
+
11
+
12
class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings]):
    """Connector backed by a Parquet file located at ``config.path``."""

    def __init__(self, config: ParquetDatasetConnectorSettings):
        """Forward ``config`` to the base connector."""
        super().__init__(config)

    def _load(self) -> pl.DataFrame | pl.LazyFrame:
        """Scan the file lazily when ``config.lazy`` is truthy, otherwise read it eagerly."""
        if self.config.lazy:
            return pl.scan_parquet(self.config.path)
        return pl.read_parquet(self.config.path)

    def to(self, ds: "Dataset[Any]") -> None:
        """Write ``ds.polars`` as Parquet to ``config.path``."""
        frame = ds.polars
        frame.write_parquet(self.config.path)
@@ -0,0 +1,22 @@
1
+ from pydantic import SecretStr
2
+ from pydantic_settings import SettingsConfigDict
3
+
4
+ from retrievalbase.settings import FromConfigMixinSettings
5
+
6
+
7
class DatasetConnectorSettings(FromConfigMixinSettings):
    """Common base for dataset connector settings; declares no fields of its own."""
9
+
10
+
11
class ParquetDatasetConnectorSettings(DatasetConnectorSettings):
    """Settings for the Parquet file connector."""

    # Location handed to Polars' Parquet read/scan/write calls.
    path: str
    # Truthy: the connector scans the file lazily; falsy: it reads eagerly.
    lazy: bool
14
+
15
+
16
class MinioDatasetConnectorSettings(DatasetConnectorSettings):
    """Settings for the MinIO connector.

    Configured with ``env_prefix="MINIO_"`` and ``extra="ignore"``.
    """

    model_config = SettingsConfigDict(env_prefix="MINIO_", extra="ignore")

    # Endpoint string; the connector treats a leading "https://" as "use TLS".
    endpoint: str
    # Bucket and object key addressed by the connector's get/put calls.
    bucket: str
    key: str
    # Credentials, kept as SecretStr and unwrapped only when building the client.
    access_key: SecretStr
    secret_key: SecretStr
@@ -0,0 +1 @@
1
# Absolute path of the YAML configuration file. Its consumers are not visible
# from this module — presumably the settings loader (TODO confirm).
CONFIG_PATH = "/config/config.yaml"