retrievalbase 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retrievalbase-1.0.0/.gitignore +38 -0
- retrievalbase-1.0.0/.gitlab-ci.yml +110 -0
- retrievalbase-1.0.0/.pre-commit-config.yaml +48 -0
- retrievalbase-1.0.0/.releaserc.json +28 -0
- retrievalbase-1.0.0/CHANGELOG.md +11 -0
- retrievalbase-1.0.0/Makefile +87 -0
- retrievalbase-1.0.0/PKG-INFO +23 -0
- retrievalbase-1.0.0/README.md +0 -0
- retrievalbase-1.0.0/codecov.yml +30 -0
- retrievalbase-1.0.0/commitlint.config.cjs +22 -0
- retrievalbase-1.0.0/pyproject.toml +105 -0
- retrievalbase-1.0.0/src/retrievalbase/__init__.py +0 -0
- retrievalbase-1.0.0/src/retrievalbase/connector/__init__.py +69 -0
- retrievalbase-1.0.0/src/retrievalbase/connector/minio.py +45 -0
- retrievalbase-1.0.0/src/retrievalbase/connector/parquet.py +20 -0
- retrievalbase-1.0.0/src/retrievalbase/connector/settings.py +22 -0
- retrievalbase-1.0.0/src/retrievalbase/constants.py +1 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/__init__.py +146 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/hf.py +49 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/mixins.py +108 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/polars.py +43 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/preprocess/__init__.py +29 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/preprocess/preprocess.py +96 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/preprocess/token_counter.py +41 -0
- retrievalbase-1.0.0/src/retrievalbase/dataset/settings.py +63 -0
- retrievalbase-1.0.0/src/retrievalbase/enums.py +11 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/__init__.py +179 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/async_batcher.py +79 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/embedders.py +28 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/__init__.py +37 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/python/__init__.py +149 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/python/evaluators.py +71 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/evaluators/python/scores.py +118 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/processors.py +15 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/rerankers.py +182 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/retrievers/__init__.py +112 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/retrievers/dense/__init__.py +56 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/retrievers/dense/retrievers.py +86 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/settings.py +204 -0
- retrievalbase-1.0.0/src/retrievalbase/evaluation/vector_stores.py +146 -0
- retrievalbase-1.0.0/src/retrievalbase/exceptions.py +61 -0
- retrievalbase-1.0.0/src/retrievalbase/ingestion/__init__.py +50 -0
- retrievalbase-1.0.0/src/retrievalbase/ingestion/settings.py +10 -0
- retrievalbase-1.0.0/src/retrievalbase/mixins.py +85 -0
- retrievalbase-1.0.0/src/retrievalbase/py.typed +0 -0
- retrievalbase-1.0.0/src/retrievalbase/settings.py +33 -0
- retrievalbase-1.0.0/src/retrievalbase/types.py +55 -0
- retrievalbase-1.0.0/src/retrievalbase/utils.py +107 -0
- retrievalbase-1.0.0/tests/__init__.py +1 -0
- retrievalbase-1.0.0/tests/conftest.py +15 -0
- retrievalbase-1.0.0/tests/fixtures/__init__.py +1 -0
- retrievalbase-1.0.0/tests/fixtures/dataframes.py +35 -0
- retrievalbase-1.0.0/tests/fixtures/fakes.py +124 -0
- retrievalbase-1.0.0/tests/integration/__init__.py +1 -0
- retrievalbase-1.0.0/tests/integration/dataset/__init__.py +1 -0
- retrievalbase-1.0.0/tests/integration/dataset/test_huggingface_adapter.py +1 -0
- retrievalbase-1.0.0/tests/integration/evaluation/__init__.py +1 -0
- retrievalbase-1.0.0/tests/integration/evaluation/test_python_evaluator.py +50 -0
- retrievalbase-1.0.0/tests/unit/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/config/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/config/test_mixins.py +63 -0
- retrievalbase-1.0.0/tests/unit/config/test_settings.py +48 -0
- retrievalbase-1.0.0/tests/unit/dataset/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/dataset/test_connectors.py +160 -0
- retrievalbase-1.0.0/tests/unit/dataset/test_polars_dataset.py +48 -0
- retrievalbase-1.0.0/tests/unit/evaluation/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/evaluation/test_async_batcher.py +45 -0
- retrievalbase-1.0.0/tests/unit/evaluation/test_embedders.py +66 -0
- retrievalbase-1.0.0/tests/unit/evaluation/test_processors.py +12 -0
- retrievalbase-1.0.0/tests/unit/evaluation/test_rerankers.py +95 -0
- retrievalbase-1.0.0/tests/unit/evaluation/test_scores.py +41 -0
- retrievalbase-1.0.0/tests/unit/evaluation/test_vector_stores.py +71 -0
- retrievalbase-1.0.0/tests/unit/ingestion/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/ingestion/test_text_ingestion_pipeline.py +47 -0
- retrievalbase-1.0.0/tests/unit/preprocess/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/preprocess/test_filters.py +127 -0
- retrievalbase-1.0.0/tests/unit/preprocess/test_token_counters.py +18 -0
- retrievalbase-1.0.0/tests/unit/retrievers/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/retrievers/test_bm25_retriever.py +48 -0
- retrievalbase-1.0.0/tests/unit/retrievers/test_dense_retriever.py +52 -0
- retrievalbase-1.0.0/tests/unit/retrievers/test_hybrid_retriever.py +33 -0
- retrievalbase-1.0.0/tests/unit/retrievers/test_retriever_base.py +40 -0
- retrievalbase-1.0.0/tests/unit/utils/__init__.py +1 -0
- retrievalbase-1.0.0/tests/unit/utils/test_utils.py +45 -0
- retrievalbase-1.0.0/uv.lock +3521 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# env files
|
|
13
|
+
.env
|
|
14
|
+
|
|
15
|
+
# Pycharm
|
|
16
|
+
.idea/
|
|
17
|
+
|
|
18
|
+
# VSCode
|
|
19
|
+
.vscode/
|
|
20
|
+
|
|
21
|
+
# Testing
|
|
22
|
+
.coverage
|
|
23
|
+
coverage.xml
|
|
24
|
+
htmlcov/
|
|
25
|
+
.pytest_cache/
|
|
26
|
+
.mypy_cache/
|
|
27
|
+
|
|
28
|
+
# Ruff
|
|
29
|
+
.ruff_cache/
|
|
30
|
+
|
|
31
|
+
# Pre-commit
|
|
32
|
+
.pre-commit-cache/
|
|
33
|
+
|
|
34
|
+
# MacOS
|
|
35
|
+
.DS_Store
|
|
36
|
+
|
|
37
|
+
# Claude
|
|
38
|
+
.claude/
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
image: python:3.12
|
|
2
|
+
|
|
3
|
+
workflow:
|
|
4
|
+
rules:
|
|
5
|
+
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "dev"'
|
|
6
|
+
- if: '$CI_COMMIT_BRANCH == "dev"'
|
|
7
|
+
- when: never
|
|
8
|
+
|
|
9
|
+
stages:
|
|
10
|
+
- checks
|
|
11
|
+
- test
|
|
12
|
+
- deploy
|
|
13
|
+
|
|
14
|
+
variables:
|
|
15
|
+
UV_SYSTEM_PYTHON: "1"
|
|
16
|
+
PIP_DISABLE_PIP_VERSION_CHECK: "1"
|
|
17
|
+
GIT_DEPTH: 0
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
cache:
|
|
21
|
+
key:
|
|
22
|
+
files:
|
|
23
|
+
- uv.lock
|
|
24
|
+
paths:
|
|
25
|
+
- .venv/
|
|
26
|
+
# NOTE(review): GitLab only caches paths located inside the project directory
# ($CI_PROJECT_DIR), so this home-directory entry is likely never cached — confirm,
# and consider pointing UV_CACHE_DIR at a project-relative directory instead.
- ~/.cache/uv
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
before_script:
|
|
30
|
+
- pip install uv
|
|
31
|
+
- uv sync --group dev --all-extras
|
|
32
|
+
|
|
33
|
+
ruff:
|
|
34
|
+
stage: checks
|
|
35
|
+
script:
|
|
36
|
+
- uv run ruff check .
|
|
37
|
+
- uv run ruff format --check .
|
|
38
|
+
|
|
39
|
+
ty:
|
|
40
|
+
stage: checks
|
|
41
|
+
script:
|
|
42
|
+
- uv run ty check .
|
|
43
|
+
|
|
44
|
+
bandit:
|
|
45
|
+
stage: checks
|
|
46
|
+
script:
|
|
47
|
+
- uv run bandit -c pyproject.toml -r .
|
|
48
|
+
|
|
49
|
+
pytest:
|
|
50
|
+
stage: test
|
|
51
|
+
script:
|
|
52
|
+
- |
|
|
53
|
+
if [ "$CI_PIPELINE_SOURCE" = "merge_request_event" ]; then
|
|
54
|
+
uv run pytest --cov=src --cov-report=xml --cov-report=term --cov-fail-under=80
|
|
55
|
+
else
|
|
56
|
+
uv run pytest
|
|
57
|
+
fi
|
|
58
|
+
artifacts:
|
|
59
|
+
paths:
|
|
60
|
+
- coverage.xml
|
|
61
|
+
|
|
62
|
+
check-package:
|
|
63
|
+
stage: test
|
|
64
|
+
needs: ["pytest"]
|
|
65
|
+
script:
|
|
66
|
+
- uv build
|
|
67
|
+
- pip install dist/*.whl
|
|
68
|
+
|
|
69
|
+
codecov:
|
|
70
|
+
stage: test
|
|
71
|
+
needs: ["pytest"]
|
|
72
|
+
script:
|
|
73
|
+
- curl -Os https://cli.codecov.io/latest/linux/codecov
|
|
74
|
+
- chmod +x codecov
|
|
75
|
+
- ./codecov upload-process -t $CODECOV_TOKEN
|
|
76
|
+
rules:
|
|
77
|
+
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
|
|
78
|
+
|
|
79
|
+
release:
|
|
80
|
+
stage: deploy
|
|
81
|
+
image: node:22
|
|
82
|
+
needs: ["check-package"]
|
|
83
|
+
before_script:
|
|
84
|
+
- apt-get update && apt-get install -y python3 python3-pip
|
|
85
|
+
- pip install --break-system-packages uv
|
|
86
|
+
rules:
|
|
87
|
+
- if: '$CI_COMMIT_BRANCH == "dev"'
|
|
88
|
+
script:
|
|
89
|
+
- npm install -g semantic-release @semantic-release/gitlab @semantic-release/changelog @semantic-release/git @semantic-release/exec
|
|
90
|
+
- semantic-release
|
|
91
|
+
|
|
92
|
+
conventional-commits:
|
|
93
|
+
stage: checks
|
|
94
|
+
image: node:20
|
|
95
|
+
rules:
|
|
96
|
+
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
|
|
97
|
+
before_script: []
|
|
98
|
+
script:
|
|
99
|
+
- npm install -g @commitlint/cli @commitlint/config-conventional
|
|
100
|
+
- echo "$CI_MERGE_REQUEST_TITLE" > pr_title.txt
|
|
101
|
+
- |
|
|
102
|
+
npx commitlint --config commitlint.config.cjs --edit pr_title.txt || {
|
|
103
|
+
echo "❌ PR title must follow Conventional Commits format (e.g. feat:, fix:)"
|
|
104
|
+
exit 1
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
default:
|
|
108
|
+
interruptible: true
|
|
109
|
+
retry: 1
|
|
110
|
+
timeout: 10m
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
# Ruff for linting and formatting
|
|
3
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
4
|
+
rev: v0.13.3
|
|
5
|
+
hooks:
|
|
6
|
+
# Run the linter
|
|
7
|
+
- id: ruff
|
|
8
|
+
args: [--fix]
|
|
9
|
+
# Run the formatter
|
|
10
|
+
- id: ruff-format
|
|
11
|
+
|
|
12
|
+
# Type checking with ty (disabled in pre-commit, enabled in CI)
|
|
13
|
+
# - repo: https://github.com/astral-sh/ty-pre-commit
|
|
14
|
+
# rev: v0.0.17
|
|
15
|
+
# hooks:
|
|
16
|
+
# - id: ty
|
|
17
|
+
# exclude: ^(tests/|alembic/)
|
|
18
|
+
|
|
19
|
+
# Conventional commits enforcement
|
|
20
|
+
- repo: https://github.com/compilerla/conventional-pre-commit
|
|
21
|
+
rev: v3.6.0
|
|
22
|
+
hooks:
|
|
23
|
+
- id: conventional-pre-commit
|
|
24
|
+
stages: [commit-msg]
|
|
25
|
+
|
|
26
|
+
# General file checks
|
|
27
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
28
|
+
rev: v5.0.0
|
|
29
|
+
hooks:
|
|
30
|
+
- id: trailing-whitespace
|
|
31
|
+
- id: end-of-file-fixer
|
|
32
|
+
- id: check-yaml
|
|
33
|
+
- id: check-added-large-files
|
|
34
|
+
args: [--maxkb=1000]
|
|
35
|
+
- id: check-json
|
|
36
|
+
- id: check-toml
|
|
37
|
+
- id: detect-private-key
|
|
38
|
+
- id: mixed-line-ending
|
|
39
|
+
- id: check-merge-conflict
|
|
40
|
+
|
|
41
|
+
# Security checks with bandit
|
|
42
|
+
- repo: https://github.com/PyCQA/bandit
|
|
43
|
+
rev: 1.8.0
|
|
44
|
+
hooks:
|
|
45
|
+
- id: bandit
|
|
46
|
+
args: [-c, pyproject.toml]
|
|
47
|
+
additional_dependencies: ["bandit[toml]"]
|
|
48
|
+
exclude: ^tests/
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"branches": [
|
|
3
|
+
"dev"
|
|
4
|
+
],
|
|
5
|
+
"plugins": [
|
|
6
|
+
"@semantic-release/commit-analyzer",
|
|
7
|
+
"@semantic-release/release-notes-generator",
|
|
8
|
+
"@semantic-release/changelog",
|
|
9
|
+
[
|
|
10
|
+
"@semantic-release/exec",
|
|
11
|
+
{
|
|
12
|
+
"prepareCmd": "uv version ${nextRelease.version}",
|
|
13
|
+
"publishCmd": "uv build && uv publish --token $PYPI_TOKEN"
|
|
14
|
+
}
|
|
15
|
+
],
|
|
16
|
+
[
|
|
17
|
+
"@semantic-release/git",
|
|
18
|
+
{
|
|
19
|
+
"assets": [
|
|
20
|
+
"CHANGELOG.md",
|
|
21
|
+
"pyproject.toml"
|
|
22
|
+
],
|
|
23
|
+
"message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"@semantic-release/gitlab"
|
|
27
|
+
]
|
|
28
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# 1.0.0 (2026-04-19)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* fix deps in ci ([dc69d67](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/dc69d67cc33794c6cd477c7391576db3cf317800))
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
### Features
|
|
10
|
+
|
|
11
|
+
* first commit, setup ci ([47b9e29](https://gitlab.com/efysent/agentic-core/retrievalbase/commit/47b9e29b747b07eef2689a6c332c7107d84abc2d))
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
.PHONY: help install dev-install setup-hooks \
|
|
2
|
+
format lint type-check security \
|
|
3
|
+
test test-cov coverage-diff \
|
|
4
|
+
ci ci-fast clean run
|
|
5
|
+
|
|
6
|
+
# ------------------------
|
|
7
|
+
# 📚 Help
|
|
8
|
+
# ------------------------
|
|
9
|
+
help:
|
|
10
|
+
@echo "Available commands:"
|
|
11
|
+
@echo " make install - Install production dependencies"
|
|
12
|
+
@echo " make dev-install - Install all dependencies + hooks"
|
|
13
|
+
@echo " make setup-hooks - Install pre-commit hooks"
|
|
14
|
+
@echo " make format - Format code (ruff)"
|
|
15
|
+
@echo " make lint - Lint code (ruff)"
|
|
16
|
+
@echo " make type-check - Run type checking (ty)"
|
|
17
|
+
@echo " make security - Run security checks (bandit)"
|
|
18
|
+
@echo " make test - Run tests"
|
|
19
|
+
@echo " make test-cov - Run tests with coverage"
|
|
20
|
+
@echo " make ci - Full CI locally"
|
|
21
|
+
@echo " make ci-fast - CI without slow steps"
|
|
22
|
+
@echo " make clean - Clean cache files"
|
|
23
|
+
|
|
24
|
+
# ------------------------
|
|
25
|
+
# 📦 Install
|
|
26
|
+
# ------------------------
|
|
27
|
+
install:
|
|
28
|
+
uv sync --no-dev
|
|
29
|
+
|
|
30
|
+
dev-install:
|
|
31
|
+
uv sync --group dev --all-extras
|
|
32
|
+
uv run pre-commit install
|
|
33
|
+
uv run pre-commit install --hook-type commit-msg
|
|
34
|
+
@echo "✅ Dev environment ready"
|
|
35
|
+
|
|
36
|
+
setup-hooks:
|
|
37
|
+
uv run pre-commit install
|
|
38
|
+
uv run pre-commit install --hook-type commit-msg
|
|
39
|
+
|
|
40
|
+
# ------------------------
|
|
41
|
+
# 🧹 Code quality
|
|
42
|
+
# ------------------------
|
|
43
|
+
format:
|
|
44
|
+
uv run ruff format .
|
|
45
|
+
uv run ruff check --fix .
|
|
46
|
+
|
|
47
|
+
lint:
|
|
48
|
+
uv run ruff check .
|
|
49
|
+
|
|
50
|
+
type-check:
|
|
51
|
+
uv run ty check .
|
|
52
|
+
|
|
53
|
+
security:
|
|
54
|
+
uv run bandit -c pyproject.toml -r .
|
|
55
|
+
|
|
56
|
+
# ------------------------
|
|
57
|
+
# 🧪 Tests
|
|
58
|
+
# ------------------------
|
|
59
|
+
test:
|
|
60
|
+
uv run pytest
|
|
61
|
+
|
|
62
|
+
test-cov:
|
|
63
|
+
uv run pytest \
|
|
64
|
+
--cov=src \
|
|
65
|
+
--cov-report=term \
|
|
66
|
+
--cov-report=xml:coverage.xml \
|
|
67
|
+
--cov-fail-under=80
|
|
68
|
+
|
|
69
|
+
# ------------------------
|
|
70
|
+
# 🚀 CI equivalent
|
|
71
|
+
# ------------------------
|
|
72
|
+
ci: lint type-check security test-cov
|
|
73
|
+
@echo ""
|
|
74
|
+
@echo "✅ Full CI checks passed"
|
|
75
|
+
|
|
76
|
+
# Faster version (skips the bandit security scan and the coverage threshold)
|
|
77
|
+
ci-fast: lint type-check test
|
|
78
|
+
@echo ""
|
|
79
|
+
@echo "⚡ Fast CI checks passed"
|
|
80
|
+
|
|
81
|
+
# ------------------------
|
|
82
|
+
# 🧹 Cleanup
|
|
83
|
+
# ------------------------
|
|
84
|
+
clean:
|
|
85
|
+
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
|
86
|
+
find . -type f -name "*.pyc" -delete 2>/dev/null || true
|
|
87
|
+
rm -rf .pytest_cache .ruff_cache .ty_cache htmlcov coverage.xml 2>/dev/null || true
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: retrievalbase
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Author-email: jalal <jalalkhaldi3@gmail.com>
|
|
5
|
+
Requires-Python: <3.13,>=3.11
|
|
6
|
+
Requires-Dist: faiss-cpu<2.0.0,>=1.13.2
|
|
7
|
+
Requires-Dist: langchain<2.0.0,>=1.2.10
|
|
8
|
+
Requires-Dist: minio<8.0.0,>=7.2.20
|
|
9
|
+
Requires-Dist: numpy<3.0.0,>=2.4.2
|
|
10
|
+
Requires-Dist: openai<3.0.0,>=2.21.0
|
|
11
|
+
Requires-Dist: polars<2.0.0,>=1.38.1
|
|
12
|
+
Requires-Dist: pydantic-settings<3.0.0,>=2.13.0
|
|
13
|
+
Requires-Dist: qdrant-client<2.0.0,>=1.16.2
|
|
14
|
+
Requires-Dist: rank-bm25<0.3.0,>=0.2.2
|
|
15
|
+
Provides-Extra: torch
|
|
16
|
+
Requires-Dist: datasets<5.0.0,>=4.5.0; extra == 'torch'
|
|
17
|
+
Requires-Dist: sentence-transformers<6.0.0,>=5.1.2; extra == 'torch'
|
|
18
|
+
Requires-Dist: torch<3.0.0,>=2.10.0; extra == 'torch'
|
|
19
|
+
Requires-Dist: transformers<6.0.0,>=5.3.0; extra == 'torch'
|
|
20
|
+
Provides-Extra: transformers
|
|
21
|
+
Requires-Dist: datasets<5.0.0,>=4.5.0; extra == 'transformers'
|
|
22
|
+
Requires-Dist: sentence-transformers<6.0.0,>=5.1.2; extra == 'transformers'
|
|
23
|
+
Requires-Dist: transformers<6.0.0,>=5.3.0; extra == 'transformers'
|
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
coverage:
|
|
2
|
+
# Controls how the coverage badge is colored
|
|
3
|
+
range: 50..80
|
|
4
|
+
round: down
|
|
5
|
+
precision: 2
|
|
6
|
+
|
|
7
|
+
status:
|
|
8
|
+
project:
|
|
9
|
+
default:
|
|
10
|
+
target: auto
|
|
11
|
+
threshold: 2%
|
|
12
|
+
patch:
|
|
13
|
+
default:
|
|
14
|
+
target: 80%
|
|
15
|
+
|
|
16
|
+
comment:
|
|
17
|
+
# Compact PR comment
|
|
18
|
+
layout: "condensed_header, condensed_files, condensed_footer"
|
|
19
|
+
|
|
20
|
+
# Always comment, even if coverage didn’t change
|
|
21
|
+
require_changes: false
|
|
22
|
+
|
|
23
|
+
# Don’t block comment if baseline is missing
|
|
24
|
+
require_base: false
|
|
25
|
+
|
|
26
|
+
# Ensure we have coverage on the PR commit
|
|
27
|
+
require_head: true
|
|
28
|
+
|
|
29
|
+
# Hide global coverage in PR (focus on patch)
|
|
30
|
+
hide_project_coverage: true
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module.exports = {
|
|
2
|
+
extends: ['@commitlint/config-conventional'],
|
|
3
|
+
rules: {
|
|
4
|
+
'type-enum': [
|
|
5
|
+
2,
|
|
6
|
+
'always',
|
|
7
|
+
[
|
|
8
|
+
'feat',
|
|
9
|
+
'fix',
|
|
10
|
+
'docs',
|
|
11
|
+
'style',
|
|
12
|
+
'refactor',
|
|
13
|
+
'perf',
|
|
14
|
+
'test',
|
|
15
|
+
'chore',
|
|
16
|
+
'ci',
|
|
17
|
+
'build',
|
|
18
|
+
],
|
|
19
|
+
],
|
|
20
|
+
'subject-empty': [2, 'never'],
|
|
21
|
+
},
|
|
22
|
+
};
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "retrievalbase"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "jalal", email = "jalalkhaldi3@gmail.com" }
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
# The sources use PEP 695 type-parameter syntax (e.g. `class DatasetConnector[T: ...]`),
# which is a SyntaxError before Python 3.12, so 3.11 cannot be supported.
requires-python = ">=3.12,<3.13"
|
|
10
|
+
|
|
11
|
+
dependencies = [
|
|
12
|
+
"polars>=1.38.1,<2.0.0",
|
|
13
|
+
"langchain>=1.2.10,<2.0.0",
|
|
14
|
+
"numpy>=2.4.2,<3.0.0",
|
|
15
|
+
"rank-bm25>=0.2.2,<0.3.0",
|
|
16
|
+
"openai>=2.21.0,<3.0.0",
|
|
17
|
+
"qdrant-client>=1.16.2,<2.0.0",
|
|
18
|
+
"pydantic-settings>=2.13.0,<3.0.0",
|
|
19
|
+
"faiss-cpu>=1.13.2,<2.0.0",
|
|
20
|
+
"minio>=7.2.20,<8.0.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["hatchling"]
|
|
25
|
+
build-backend = "hatchling.build"
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["src/retrievalbase"]
|
|
29
|
+
|
|
30
|
+
[tool.ruff]
|
|
31
|
+
line-length = 120
|
|
32
|
+
target-version = "py312"
|
|
33
|
+
|
|
34
|
+
[tool.ruff.lint]
|
|
35
|
+
select = [
|
|
36
|
+
"E", # pycodestyle errors
|
|
37
|
+
"W", # pycodestyle warnings
|
|
38
|
+
"F", # pyflakes
|
|
39
|
+
"I", # isort
|
|
40
|
+
"B", # flake8-bugbear
|
|
41
|
+
"C4", # flake8-comprehensions
|
|
42
|
+
"UP", # pyupgrade
|
|
43
|
+
"ARG", # flake8-unused-arguments
|
|
44
|
+
"SIM", # flake8-simplify
|
|
45
|
+
]
|
|
46
|
+
ignore = [
|
|
47
|
+
"E501", # line too long (handled by formatter)
|
|
48
|
+
"B008", # do not perform function calls in argument defaults
|
|
49
|
+
"B904", # Within an except clause, raise exceptions with `raise ... from err`
|
|
50
|
+
"ARG001", # Unused function argument
|
|
51
|
+
"ARG002", # Unused method argument
|
|
52
|
+
"ARG005", # Unused lambda argument
|
|
53
|
+
"SIM108", # Use ternary operator (less readable in many cases)
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint.isort]
|
|
57
|
+
known-first-party = ["retrievalbase"]
|
|
58
|
+
|
|
59
|
+
[tool.pytest.ini_options]
|
|
60
|
+
asyncio_mode = "auto"
|
|
61
|
+
testpaths = ["tests"]
|
|
62
|
+
markers = [
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[tool.ty.environment]
|
|
66
|
+
python-version = "3.12"
|
|
67
|
+
|
|
68
|
+
[tool.bandit]
|
|
69
|
+
exclude_dirs = ["tests", ".venv"]
|
|
70
|
+
skips = [
|
|
71
|
+
"B101", # assert statements
|
|
72
|
+
"B404", # import subprocess (needed for CLI)
|
|
73
|
+
"B603", # subprocess without shell=True (needed for CLI)
|
|
74
|
+
"B607", # partial executable path (needed for CLI to run docker, systemctl, etc.)
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
[dependency-groups]
|
|
78
|
+
dev = [
|
|
79
|
+
"pytest>=9.0.2",
|
|
80
|
+
"pytest-asyncio>=1.3.0",
|
|
81
|
+
"pytest-mock>=3.15.1",
|
|
82
|
+
"pytest-cov>=6.0.0",
|
|
83
|
+
"ruff>=0.13.0",
|
|
84
|
+
"ty>=0.0.17,<0.1.0",
|
|
85
|
+
"bandit>=1.8.0",
|
|
86
|
+
"types-pyyaml>=6.0.12.20250915,<7.0.0.0",
|
|
87
|
+
"types-tqdm>=4.67.3.20260205,<5.0.0.0",
|
|
88
|
+
"jupyter>=1.1.1,<2.0.0",
|
|
89
|
+
"transformers>=5.2.0,<6.0.0",
|
|
90
|
+
"torch>=2.10.0,<3.0.0",
|
|
91
|
+
]
|
|
92
|
+
|
|
93
|
+
[project.optional-dependencies]
|
|
94
|
+
torch = [
|
|
95
|
+
"transformers>=5.3.0,<6.0.0",
|
|
96
|
+
"datasets>=4.5.0,<5.0.0",
|
|
97
|
+
"sentence-transformers>=5.1.2,<6.0.0",
|
|
98
|
+
"torch>=2.10.0,<3.0.0",
|
|
99
|
+
|
|
100
|
+
]
|
|
101
|
+
transformers = [
|
|
102
|
+
"transformers>=5.3.0,<6.0.0",
|
|
103
|
+
"datasets>=4.5.0,<5.0.0",
|
|
104
|
+
"sentence-transformers>=5.1.2,<6.0.0",
|
|
105
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
from retrievalbase.connector.settings import DatasetConnectorSettings
|
|
8
|
+
from retrievalbase.mixins import FromConfigMixin
|
|
9
|
+
from retrievalbase.types import TCDatasetConnector as TCDatasetConnector
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from retrievalbase.dataset import Dataset, TextDataset
|
|
13
|
+
|
|
14
|
+
# Module-level logger, named after this module via __name__.
_logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DatasetConnector[TCDatasetConnector: DatasetConnectorSettings](
|
|
18
|
+
FromConfigMixin[TCDatasetConnector],
|
|
19
|
+
ABC,
|
|
20
|
+
):
|
|
21
|
+
def __init__(self, config: TCDatasetConnector):
|
|
22
|
+
super().__init__(config)
|
|
23
|
+
|
|
24
|
+
_logger.info(
|
|
25
|
+
f"Initializing dataset connector | class={self.__class__.__name__} | module={self.__class__.__module__}"
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
@abstractmethod
|
|
29
|
+
def _load(self) -> pl.DataFrame | pl.LazyFrame:
|
|
30
|
+
"""
|
|
31
|
+
Load raw data as Polars DataFrame or LazyFrame.
|
|
32
|
+
"""
|
|
33
|
+
raise NotImplementedError()
|
|
34
|
+
|
|
35
|
+
@abstractmethod
|
|
36
|
+
def to(self, ds: "Dataset[Any]") -> None:
|
|
37
|
+
raise NotImplementedError()
|
|
38
|
+
|
|
39
|
+
def load(self) -> "Dataset[pl.DataFrame | pl.LazyFrame]":
|
|
40
|
+
from retrievalbase.dataset.polars import PolarsDataset
|
|
41
|
+
|
|
42
|
+
_logger.info(f"Loading dataset | connector={self.__class__.__name__}")
|
|
43
|
+
|
|
44
|
+
df = self._load()
|
|
45
|
+
self._log_polars_info(df)
|
|
46
|
+
|
|
47
|
+
return PolarsDataset.from_polars(df)
|
|
48
|
+
|
|
49
|
+
def load_text(self) -> "TextDataset[pl.DataFrame | pl.LazyFrame]":
|
|
50
|
+
from retrievalbase.dataset.polars import PolarsTextDataset
|
|
51
|
+
|
|
52
|
+
_logger.info(f"Loading text dataset | connector={self.__class__.__name__}")
|
|
53
|
+
|
|
54
|
+
df = self._load()
|
|
55
|
+
self._log_polars_info(df)
|
|
56
|
+
|
|
57
|
+
return PolarsTextDataset.from_polars(df)
|
|
58
|
+
|
|
59
|
+
# ------------------------------------------------------------------
|
|
60
|
+
# Helpers
|
|
61
|
+
# ------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
def _log_polars_info(self, df: pl.DataFrame | pl.LazyFrame) -> None:
|
|
64
|
+
"""
|
|
65
|
+
Log dataset structure without forcing materialization.
|
|
66
|
+
"""
|
|
67
|
+
if isinstance(df, pl.DataFrame):
|
|
68
|
+
schema = df.schema
|
|
69
|
+
_logger.info(f"Loaded DataFrame | columns={len(schema)} | schema={list(schema.items())}")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from typing import TYPE_CHECKING, Any
|
|
3
|
+
|
|
4
|
+
import polars as pl
|
|
5
|
+
from minio import Minio
|
|
6
|
+
|
|
7
|
+
from retrievalbase.connector import DatasetConnector
|
|
8
|
+
from retrievalbase.connector.settings import MinioDatasetConnectorSettings
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from retrievalbase.dataset import Dataset
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MinioDatasetConnector(DatasetConnector[MinioDatasetConnectorSettings]):
    """
    Connector that reads and writes a single Parquet object in a MinIO bucket.

    The bucket and object key come from the connector settings; the same
    object is used for both ``_load`` and ``to``.
    """

    def __init__(self, config: MinioDatasetConnectorSettings):
        """
        Build the MinIO client from ``config``.

        The client expects a bare ``host[:port]``, so a leading ``http://`` or
        ``https://`` is removed from the endpoint. TLS is enabled only when
        the configured endpoint starts with ``https://``.
        """
        super().__init__(config)
        endpoint = self.config.endpoint
        self.client = Minio(
            # removeprefix only touches a leading scheme; str.replace would also
            # delete the same substring if it appeared later in the endpoint.
            endpoint.removeprefix("http://").removeprefix("https://"),
            access_key=self.config.access_key.get_secret_value(),
            secret_key=self.config.secret_key.get_secret_value(),
            secure=endpoint.startswith("https://"),
        )

    def _load(self) -> pl.DataFrame | pl.LazyFrame:
        """
        Download the configured object fully into memory and parse it as Parquet.

        Returns:
            The frame produced by ``pl.read_parquet`` on the downloaded bytes.
        """
        response = self.client.get_object(self.config.bucket, self.config.key)
        try:
            buffer = io.BytesIO(response.read())
        finally:
            # Always close the stream and return the connection, even if the read fails.
            response.close()
            response.release_conn()
        df = pl.read_parquet(buffer)
        return df

    def to(self, ds: "Dataset[Any]") -> None:
        """
        Serialize ``ds.polars`` to Parquet in memory and upload it to the configured object.

        NOTE(review): assumes ``ds.polars`` exposes ``write_parquet`` (i.e. is an
        eager DataFrame) — confirm against the Dataset implementation.
        """
        df = ds.polars
        buffer = io.BytesIO()
        df.write_parquet(buffer)
        # Rewind so put_object reads the payload from its first byte.
        buffer.seek(0)
        self.client.put_object(
            bucket_name=self.config.bucket,
            object_name=self.config.key,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream",
        )
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from retrievalbase.connector import DatasetConnector
|
|
6
|
+
from retrievalbase.connector.settings import ParquetDatasetConnectorSettings
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from retrievalbase.dataset import Dataset
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ParquetDatasetConnector(DatasetConnector[ParquetDatasetConnectorSettings]):
    """
    Connector that reads and writes a Parquet file at the configured path.
    """

    def __init__(self, config: ParquetDatasetConnectorSettings):
        """
        Forward ``config`` to the base connector; no extra state is created here.
        """
        super().__init__(config)

    def _load(self) -> pl.DataFrame | pl.LazyFrame:
        """
        Scan the file lazily when ``config.lazy`` is truthy, otherwise read it eagerly.
        """
        source = self.config.path
        if self.config.lazy:
            return pl.scan_parquet(source)
        return pl.read_parquet(source)

    def to(self, ds: "Dataset[Any]") -> None:
        """
        Write ``ds.polars`` as Parquet to the configured path.
        """
        frame = ds.polars
        frame.write_parquet(self.config.path)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from pydantic import SecretStr
|
|
2
|
+
from pydantic_settings import SettingsConfigDict
|
|
3
|
+
|
|
4
|
+
from retrievalbase.settings import FromConfigMixinSettings
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DatasetConnectorSettings(FromConfigMixinSettings):
    """Common base for dataset connector settings; declares no fields of its own."""

    pass
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ParquetDatasetConnectorSettings(DatasetConnectorSettings):
    """Settings for the Parquet file connector."""

    # Location of the Parquet file, used for both reading and writing.
    path: str
    # When true the connector uses pl.scan_parquet (LazyFrame); otherwise pl.read_parquet.
    lazy: bool
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MinioDatasetConnectorSettings(DatasetConnectorSettings):
    """Settings for the MinIO connector."""

    # Server address; the connector strips "http://" / "https://" from it and
    # enables TLS only when it starts with "https://".
    endpoint: str
    # Bucket that holds the Parquet object.
    bucket: str
    # Object name inside the bucket, used for both download and upload.
    key: str
    # Credentials are SecretStr; the connector unwraps them with get_secret_value().
    access_key: SecretStr
    secret_key: SecretStr
    # Fields use the "MINIO_" environment prefix; extra="ignore" is set for unknown inputs.
    model_config = SettingsConfigDict(env_prefix="MINIO_", extra="ignore")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Absolute filesystem path of the YAML configuration file.
# NOTE(review): where this is consumed is not visible in this module.
CONFIG_PATH = "/config/config.yaml"
|