kodit 0.1.14__tar.gz → 0.1.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kodit might be problematic.
- kodit-0.1.16/.github/dependabot.yml +10 -0
- kodit-0.1.16/.github/workflows/pull_request.yaml +35 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/pypi.yaml +2 -2
- {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/test.yaml +18 -17
- kodit-0.1.16/.python-version +1 -0
- {kodit-0.1.14 → kodit-0.1.16}/Dockerfile +4 -4
- {kodit-0.1.14 → kodit-0.1.16}/PKG-INFO +4 -1
- {kodit-0.1.14 → kodit-0.1.16}/docs/_index.md +66 -1
- {kodit-0.1.14 → kodit-0.1.16}/pyproject.toml +3 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/_version.py +2 -2
- kodit-0.1.16/src/kodit/bm25/keyword_search_factory.py +17 -0
- kodit-0.1.16/src/kodit/bm25/keyword_search_service.py +34 -0
- kodit-0.1.14/src/kodit/bm25/bm25.py → kodit-0.1.16/src/kodit/bm25/local_bm25.py +40 -14
- kodit-0.1.16/src/kodit/bm25/vectorchord_bm25.py +193 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/cli.py +114 -25
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/config.py +9 -2
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/database.py +4 -2
- kodit-0.1.16/src/kodit/embedding/embedding_factory.py +44 -0
- kodit-0.1.16/src/kodit/embedding/embedding_provider/__init__.py +1 -0
- kodit-0.1.16/src/kodit/embedding/embedding_provider/embedding_provider.py +60 -0
- kodit-0.1.16/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
- kodit-0.1.16/src/kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
- kodit-0.1.16/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +75 -0
- kodit-0.1.14/src/kodit/search/search_repository.py → kodit-0.1.16/src/kodit/embedding/embedding_repository.py +61 -33
- kodit-0.1.16/src/kodit/embedding/local_vector_search_service.py +50 -0
- kodit-0.1.16/src/kodit/embedding/vector_search_service.py +38 -0
- kodit-0.1.16/src/kodit/embedding/vectorchord_vector_search_service.py +154 -0
- kodit-0.1.16/src/kodit/enrichment/__init__.py +1 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_factory.py +23 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/__init__.py +1 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_service.py +33 -0
- kodit-0.1.16/src/kodit/indexing/fusion.py +67 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/indexing_repository.py +44 -4
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/indexing_service.py +142 -31
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/mcp.py +31 -18
- kodit-0.1.16/src/kodit/snippets/languages/go.scm +26 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/source_service.py +9 -3
- kodit-0.1.16/src/kodit/util/__init__.py +1 -0
- kodit-0.1.16/src/kodit/util/spinner.py +59 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/experiments/embedding.py +3 -3
- kodit-0.1.16/tests/experiments/similarity_test.py +73 -0
- kodit-0.1.16/tests/kodit/bm25/local_bm25_test.py +155 -0
- kodit-0.1.16/tests/kodit/bm25/vectorchord_repository_test.py +182 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/cli_test.py +16 -4
- kodit-0.1.16/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +93 -0
- kodit-0.1.16/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +218 -0
- kodit-0.1.16/tests/kodit/embedding/local_vector_search_service_test.py +143 -0
- kodit-0.1.16/tests/kodit/embedding/vectorchord_vector_search_service_test.py +231 -0
- kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/__init__.py +0 -0
- kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/openai_enrichment_provider_test.py +203 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/indexing/indexing_service_test.py +44 -25
- kodit-0.1.16/tests/kodit/snippets/__init__.py +0 -0
- kodit-0.1.16/tests/kodit/snippets/golang.go +28 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/method_extraction_test.py +38 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/smoke.sh +1 -1
- {kodit-0.1.14 → kodit-0.1.16}/uv.lock +28 -0
- kodit-0.1.14/.python-version +0 -1
- kodit-0.1.14/src/kodit/embedding/embedding.py +0 -203
- kodit-0.1.14/src/kodit/search/__init__.py +0 -1
- kodit-0.1.14/src/kodit/search/search_service.py +0 -147
- kodit-0.1.14/tests/kodit/embedding/embedding_test.py +0 -13
- kodit-0.1.14/tests/kodit/search/__init__.py +0 -1
- kodit-0.1.14/tests/kodit/search/search_repository_test.py +0 -124
- kodit-0.1.14/tests/kodit/search/search_service_test.py +0 -279
- {kodit-0.1.14 → kodit-0.1.16}/.cursor/rules/kodit.mdc +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/CODE_OF_CONDUCT.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/CONTRIBUTING.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/docker.yaml +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/docs.yaml +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/pypi-test.yaml +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.gitignore +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.vscode/launch.json +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/.vscode/settings.json +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/LICENSE +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/README.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/alembic.ini +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/docs/developer/index.md +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/.gitignore +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/app.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/bm25/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/embedding/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/embedding/embedding_models.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/indexing_models.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/log.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/middleware.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/README +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/env.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/script.py.mako +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/versions/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/languages/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/languages/csharp.scm +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/languages/python.scm +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/method_snippets.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/snippets.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/source_models.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/source_repository.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/conftest.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/e2e.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/embedding/__init__.py +0 -0
- {kodit-0.1.14/tests/kodit/snippets → kodit-0.1.16/tests/kodit/enrichment}/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/mcp_test.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/csharp.cs +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/detect_language_test.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/python.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/source/__init__.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/source/source_service_test.py +0 -0
- {kodit-0.1.14 → kodit-0.1.16}/tests/performance/similarity.py +0 -0
```diff
--- /dev/null
+++ kodit-0.1.16/.github/workflows/pull_request.yaml
@@ -0,0 +1,35 @@
+# This workflow will install dependencies, create coverage tests and run Pytest Coverage Comment
+# For more information see: https://github.com/MishaKav/pytest-coverage-comment/
+name: pytest-coverage-comment
+on:
+  pull_request:
+    branches:
+      - "*"
+
+# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
+# `contents` is for permission to the contents of the repository.
+# `pull-requests` is for permission to pull request
+permissions:
+  contents: write
+  checks: write
+  pull-requests: write
+
+jobs:
+  coverage-comment:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+      - uses: astral-sh/setup-uv@v5
+      - run: uv sync --locked --all-extras --dev
+
+      - name: Run tests
+        run: uv run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=src tests/kodit | tee pytest-coverage.txt
+
+      - name: Pytest coverage comment
+        uses: MishaKav/pytest-coverage-comment@main
+        with:
+          pytest-coverage-path: ./pytest-coverage.txt
+          junitxml-path: ./pytest.xml
```
```diff
--- kodit-0.1.14/.github/workflows/pypi.yaml
+++ kodit-0.1.16/.github/workflows/pypi.yaml
@@ -42,10 +42,10 @@ jobs:
           if curl -sfL https://pypi.org/packages/source/${REPO_NAME_FIRST_LETTER}/${REPO_NAME}/${REPO_NAME}-${REPO_TAG}.tar.gz > /dev/null; then
             break
           fi
-          sleep
+          sleep 5
           count=$((count+1))
           if [ $count -ge 60 ]; then
-            echo "Timeout reached after
+            echo "Timeout reached after 300 seconds"
             exit 1
           fi
         done
```
```diff
--- kodit-0.1.14/.github/workflows/test.yaml
+++ kodit-0.1.16/.github/workflows/test.yaml
@@ -9,7 +9,6 @@ on:
 permissions:
   contents: read # Needed to check out code
   checks: write # Needed to report test results
-  pull-requests: write # Needed to add comments/annotations to PRs
 
 jobs:
   test:
@@ -37,12 +36,6 @@ jobs:
       - name: Run tests
         run: uv run pytest -s --cov=src --cov-report=xml tests/kodit
 
-      - name: Pytest coverage comment
-        if: github.event_name == 'pull_request'
-        uses: MishaKav/pytest-coverage-comment@v1.1.54
-        with:
-          pytest-xml-coverage-path: ./coverage.xml
-
   build-package:
     runs-on: ubuntu-latest
     timeout-minutes: 10
@@ -58,23 +51,29 @@ jobs:
       - name: Install uv
        uses: astral-sh/setup-uv@v5
 
-      - run: uv build --
+      - run: uv build --wheel --out-dir test-build
 
       - name: Upload built package
        uses: actions/upload-artifact@v4
        with:
          name: built-package
-          path: test-build/*.
+          path: test-build/*.whl
 
   test-package:
     needs: build-package
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version:
+          - 3.12
+          - 3.13
     timeout-minutes: 10
     steps:
       - uses: actions/checkout@v4
        with:
          sparse-checkout: |
            tests/smoke.sh
+            uv.lock
          sparse-checkout-cone-mode: false
 
       - name: Download built package
@@ -86,16 +85,18 @@ jobs:
       - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
-          python-version:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
 
-      - name: Extract path to
-        id:
-        run: echo "
+      - name: Extract path to wheel
+        id: wheel_path
+        run: echo "wheel_path=$(ls test-build/*.whl)" >> $GITHUB_OUTPUT
 
-      -
-
-
-          packages: "${{ steps.sdist_path.outputs.sdist_path }}"
+      # This is equivalent to `pipx install --include-deps, but faster
+      - name: Install wheel
+        run: uv tool install "${{ steps.wheel_path.outputs.wheel_path }}"
 
       - name: Run simple version command test
         run: kodit version
```
```diff
--- /dev/null
+++ kodit-0.1.16/.python-version
@@ -0,0 +1 @@
+3.13
```
```diff
--- kodit-0.1.14/Dockerfile
+++ kodit-0.1.16/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.9
-FROM python:3.
+FROM python:3.13.3-slim-bookworm AS build
 
 # The following does not work in Podman unless you build in Docker
 # compatibility mode: <https://github.com/containers/podman/issues/8477>
@@ -23,12 +23,12 @@ COPY --from=ghcr.io/astral-sh/uv:0.7.2 /uv /usr/local/bin/uv
 # - Silence uv complaining about not being able to use hard links,
 # - tell uv to byte-compile packages for faster application startups,
 # - prevent uv from accidentally downloading isolated Python builds,
-# - pick a Python (use `/usr/bin/python3.
+# - pick a Python (use `/usr/bin/python3.13` on uv 0.5.0 and later),
 # - and finally declare `/app` as the target for `uv sync`.
 ENV UV_LINK_MODE=copy \
     UV_COMPILE_BYTECODE=1 \
     UV_PYTHON_DOWNLOADS=never \
-    UV_PYTHON=python3.
+    UV_PYTHON=python3.13 \
     UV_PROJECT_ENVIRONMENT=/app
 
 # Synchronize DEPENDENCIES without the application itself.
@@ -60,7 +60,7 @@ RUN --mount=type=cache,target=/root/.cache \
 
 ##########################################################################
 
-FROM python:3.
+FROM python:3.13.3-slim-bookworm
 SHELL ["sh", "-exc"]
 
 ENV PATH=/app/bin:$PATH
```
```diff
--- kodit-0.1.14/PKG-INFO
+++ kodit-0.1.16/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kodit
-Version: 0.1.
+Version: 0.1.16
 Summary: Code indexing for better AI code generation
 Project-URL: Homepage, https://docs.helixml.tech/kodit/
 Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -15,12 +15,14 @@ Keywords: ai,indexing,mcp,rag
 Classifier: Development Status :: 2 - Pre-Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Code Generators
 Requires-Python: >=3.12
 Requires-Dist: aiofiles>=24.1.0
 Requires-Dist: aiosqlite>=0.20.0
 Requires-Dist: alembic>=1.15.2
 Requires-Dist: asgi-correlation-id>=4.3.4
+Requires-Dist: asyncpg>=0.30.0
 Requires-Dist: better-exceptions>=0.3.3
 Requires-Dist: bm25s[core]>=0.2.12
 Requires-Dist: click>=8.1.8
@@ -41,6 +43,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.40
 Requires-Dist: structlog>=25.3.0
 Requires-Dist: tdqm>=0.0.1
 Requires-Dist: tiktoken>=0.9.0
+Requires-Dist: transformers>=4.51.3
 Requires-Dist: tree-sitter-language-pack>=0.7.3
 Requires-Dist: tree-sitter>=0.24.0
 Requires-Dist: uritools>=5.0.0
```
````diff
--- kodit-0.1.14/docs/_index.md
+++ kodit-0.1.16/docs/_index.md
@@ -169,7 +169,7 @@ recreate all indexes.
 
 ### Indexing
 
-#### Default Provider
+#### Default Indexing Provider
 
 By default, Kodit will use small local models for semantic search and enrichment. If you
 are using Kodit in a professional capacity, it is likely that the local model latency is
@@ -188,6 +188,71 @@ DEFAULT_ENDPOINT_BASE_URL=https://api.openai.com/v1
 DEFAULT_ENDPOINT_API_KEY=sk-xxxxxx
 ```
 
+### Database
+
+Out of the box Kodit uses a local sqlite file to make it easier for users to get
+started. But for production use, it's likely you will want to use a database that has
+dedicated semantic and keyword search capabilities for reduced latency.
+
+#### VectorChord Database
+
+[VectorChord](https://github.com/tensorchord/VectorChord) is an optimized PostgreSQL
+extension that provides both vector and BM25 search. (See [Search](#search))
+
+Start a container with:
+
+```sh
+docker run \
+  --name kodit-vectorchord \
+  -e POSTGRES_DB=kodit \
+  -e POSTGRES_PASSWORD=mysecretpassword \
+  -p 5432:5432 \
+  -d tensorchord/vchord-suite:pg17-20250601
+```
+
+{{< warn >}}
+Kodit assumes the database exists. In the above example I'm abusing the POSTGRES_DB
+environmental variable from the [Postgres Docker
+container](https://hub.docker.com/_/postgres/) to create the database for me. In
+production setups, please create a database yourself.
+{{< /warn >}}
+
+Then update your `.env` file to include:
+
+```env
+DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
+```
+
+### Search
+
+#### Default Search Provider
+
+By default, Kodit will use built-in implementations of BM25 and similarity search to
+improve the out of the box experience. If you are using Kodit in a professional
+capacity, it is likely that the search latency is too high to provide a good developer
+experience.
+
+Instead, you should use the features included in your database. The settings provided
+here will cause all search functionality to use this database by default. You can
+override the database used for each search type if you wish. (Coming soon!)
+
+##### VectorChord Search
+
+Configure Kodit to use a [VectorChord database](#vectorchord-database).
+
+Then update your `.env` file to include:
+
+```env
+DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
+DEFAULT_SEARCH_PROVIDER=vectorchord
+```
+
+### Enrichment
+
+#### Default Enrichment Provider
+
+The default enrichment provider is the same as [the default indexing provider](#default-indexing-provider).
+
 ## Managing Kodit
 
 There is limited management functionality at this time. To delete indexes you must
````
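The new documentation above configures Kodit against a VectorChord-enabled PostgreSQL purely through the `DB_URL` setting. For readers who want to sanity-check that URL before pointing Kodit at it, here is a minimal connectivity check; it is not part of kodit itself and only assumes the standard SQLAlchemy asyncio API with the asyncpg driver plus the example credentials from the snippet above.

```python
# Minimal connectivity check for the DB_URL shown above. Not part of kodit;
# it only uses SQLAlchemy's create_async_engine with the asyncpg driver and
# the documentation's example credentials.
import asyncio

from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine

DB_URL = "postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit"


async def main() -> None:
    engine = create_async_engine(DB_URL)
    async with engine.connect() as conn:
        # A trivial round-trip query proves the server and database are reachable.
        result = await conn.execute(text("SELECT version()"))
        print(result.scalar_one())
    await engine.dispose()


if __name__ == "__main__":
    asyncio.run(main())
```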
```diff
--- kodit-0.1.14/pyproject.toml
+++ kodit-0.1.16/pyproject.toml
@@ -18,6 +18,7 @@ classifiers = [
 
     # Specify the Python versions you support here.
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
 ]
 requires-python = ">=3.12"
 dependencies = [
@@ -48,6 +49,8 @@ dependencies = [
     "hf-xet>=1.1.2",
     "openai>=1.82.0",
     "tiktoken>=0.9.0",
+    "asyncpg>=0.30.0",
+    "transformers>=4.51.3",
 ]
 
 [dependency-groups]
```
```diff
--- /dev/null
+++ kodit-0.1.16/src/kodit/bm25/keyword_search_factory.py
@@ -0,0 +1,17 @@
+"""Factory for creating keyword search providers."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from kodit.bm25.keyword_search_service import KeywordSearchProvider
+from kodit.bm25.local_bm25 import BM25Service
+from kodit.bm25.vectorchord_bm25 import VectorChordBM25
+from kodit.config import AppContext
+
+
+def keyword_search_factory(
+    app_context: AppContext, session: AsyncSession
+) -> KeywordSearchProvider:
+    """Create a keyword search provider."""
+    if app_context.default_search.provider == "vectorchord":
+        return VectorChordBM25(session=session)
+    return BM25Service(data_dir=app_context.get_data_dir())
```
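A rough sketch of how a caller might wire this factory up. `AppContext` construction is not shown in this diff, so an already-built instance is passed in here; the engine and session plumbing is ordinary SQLAlchemy asyncio usage rather than anything kodit-specific.

```python
# Hypothetical wiring sketch for keyword_search_factory; AppContext is assumed
# to be constructed elsewhere (its setup is not part of this diff).
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from kodit.bm25.keyword_search_factory import keyword_search_factory
from kodit.bm25.keyword_search_service import BM25Document
from kodit.config import AppContext


async def index_and_search(app_context: AppContext, db_url: str) -> None:
    engine = create_async_engine(db_url)
    session_factory = async_sessionmaker(engine)
    async with session_factory() as session:
        # Returns VectorChordBM25 when app_context.default_search.provider is
        # "vectorchord", otherwise the local bm25s-backed BM25Service.
        provider = keyword_search_factory(app_context, session)
        await provider.index([BM25Document(snippet_id=1, text="def hello(): ...")])
        print(await provider.retrieve("hello", top_k=5))
    await engine.dispose()
```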
```diff
--- /dev/null
+++ kodit-0.1.16/src/kodit/bm25/keyword_search_service.py
@@ -0,0 +1,34 @@
+"""Keyword search service."""
+
+from abc import ABC, abstractmethod
+from typing import NamedTuple
+
+
+class BM25Document(NamedTuple):
+    """BM25 document."""
+
+    snippet_id: int
+    text: str
+
+
+class BM25Result(NamedTuple):
+    """BM25 result."""
+
+    snippet_id: int
+    score: float
+
+
+class KeywordSearchProvider(ABC):
+    """Interface for keyword search providers."""
+
+    @abstractmethod
+    async def index(self, corpus: list[BM25Document]) -> None:
+        """Index a new corpus."""
+
+    @abstractmethod
+    async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
+        """Retrieve from the index."""
+
+    @abstractmethod
+    async def delete(self, snippet_ids: list[int]) -> None:
+        """Delete documents from the index."""
```
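This abstract base class is the contract shared by the local and VectorChord backends. As a purely illustrative sketch (not part of kodit), a toy in-memory provider that satisfies the interface looks like this, using naive substring matching in place of real BM25 scoring:

```python
# Illustrative only: a toy in-memory KeywordSearchProvider that satisfies the
# interface above without any real BM25 scoring.
from kodit.bm25.keyword_search_service import (
    BM25Document,
    BM25Result,
    KeywordSearchProvider,
)


class InMemoryKeywordSearch(KeywordSearchProvider):
    """Naive provider used to illustrate the ABC's contract."""

    def __init__(self) -> None:
        self._docs: dict[int, str] = {}

    async def index(self, corpus: list[BM25Document]) -> None:
        for doc in corpus:
            self._docs[doc.snippet_id] = doc.text

    async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
        hits = [
            BM25Result(snippet_id=snippet_id, score=1.0)
            for snippet_id, text in self._docs.items()
            if query.lower() in text.lower()
        ]
        return hits[:top_k]

    async def delete(self, snippet_ids: list[int]) -> None:
        for snippet_id in snippet_ids:
            self._docs.pop(snippet_id, None)
```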
```diff
--- kodit-0.1.14/src/kodit/bm25/bm25.py
+++ kodit-0.1.16/src/kodit/bm25/local_bm25.py
@@ -1,23 +1,36 @@
-"""BM25 service."""
+"""Locally hosted BM25 service primarily for use with SQLite."""
 
+import json
 from pathlib import Path
 
+import aiofiles
 import bm25s
 import Stemmer
 import structlog
 from bm25s.tokenization import Tokenized
 
+from kodit.bm25.keyword_search_service import (
+    BM25Document,
+    BM25Result,
+    KeywordSearchProvider,
+)
 
-
-
+SNIPPET_IDS_FILE = "snippet_ids.jsonl"
+
+
+class BM25Service(KeywordSearchProvider):
+    """LocalBM25 service."""
 
     def __init__(self, data_dir: Path) -> None:
         """Initialize the BM25 service."""
         self.log = structlog.get_logger(__name__)
         self.index_path = data_dir / "bm25s_index"
+        self.snippet_ids: list[int] = []
         try:
             self.log.debug("Loading BM25 index")
             self.retriever = bm25s.BM25.load(self.index_path, mmap=True)
+            with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
+                self.snippet_ids = json.load(f)
         except FileNotFoundError:
             self.log.debug("BM25 index not found, creating new index")
             self.retriever = bm25s.BM25()
@@ -33,28 +46,34 @@ class BM25Service:
             show_progress=True,
         )
 
-    def index(self, corpus: list[
+    async def index(self, corpus: list[BM25Document]) -> None:
         """Index a new corpus."""
         self.log.debug("Indexing corpus")
-        vocab = self._tokenize(corpus)
+        vocab = self._tokenize([doc.text for doc in corpus])
         self.retriever = bm25s.BM25()
         self.retriever.index(vocab, show_progress=False)
         self.retriever.save(self.index_path)
+        self.snippet_ids = self.snippet_ids + [doc.snippet_id for doc in corpus]
+        async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
+            await f.write(json.dumps(self.snippet_ids))
 
-    def retrieve(
-        self, doc_ids: list[int], query: str, top_k: int = 2
-    ) -> list[tuple[int, float]]:
+    async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
         """Retrieve from the index."""
         if top_k == 0:
             self.log.warning("Top k is 0, returning empty list")
             return []
-
-
+
+        # Get the number of documents in the index
+        num_docs = self.retriever.scores["num_docs"]
+        if num_docs == 0:
             return []
 
-
+        # Adjust top_k to not exceed corpus size
+        top_k = min(top_k, num_docs)
         self.log.debug(
-            "Retrieving from index",
+            "Retrieving from index",
+            query=query,
+            top_k=top_k,
         )
 
         query_tokens = self._tokenize([query])
@@ -62,10 +81,17 @@ class BM25Service:
         self.log.debug("Query tokens", query_tokens=query_tokens)
 
         results, scores = self.retriever.retrieve(
-            query_tokens=query_tokens,
+            query_tokens=query_tokens,
+            corpus=self.snippet_ids,
+            k=top_k,
         )
         self.log.debug("Raw results", results=results, scores=scores)
         return [
-            (int(result), float(score))
+            BM25Result(snippet_id=int(result), score=float(score))
             for result, score in zip(results[0], scores[0], strict=False)
+            if score > 0.0
         ]
+
+    async def delete(self, snippet_ids: list[int]) -> None:  # noqa: ARG002
+        """Delete documents from the index."""
+        self.log.warning("Deletion not supported for local BM25 index")
```
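Putting the reworked local provider together, a hedged usage sketch follows; the paths and snippets are invented, and it assumes kodit's dependencies (bm25s, Stemmer, aiofiles) are installed.

```python
# Rough usage sketch of the local bm25s-backed provider shown above. The data
# directory and snippets are placeholders for illustration only.
import asyncio
from pathlib import Path

from kodit.bm25.keyword_search_service import BM25Document
from kodit.bm25.local_bm25 import BM25Service


async def main() -> None:
    service = BM25Service(data_dir=Path("/tmp/kodit-data"))
    await service.index(
        [
            BM25Document(snippet_id=1, text="def add(a, b): return a + b"),
            BM25Document(snippet_id=2, text="class HttpClient: ..."),
        ]
    )
    # Returns BM25Result(snippet_id, score) tuples; zero-score hits are dropped.
    for result in await service.retrieve("http client", top_k=2):
        print(result.snippet_id, result.score)

    # Deletion is a no-op for the local index; it only logs a warning.
    await service.delete([1])


asyncio.run(main())
```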
```diff
--- /dev/null
+++ kodit-0.1.16/src/kodit/bm25/vectorchord_bm25.py
@@ -0,0 +1,193 @@
+"""VectorChord repository for document operations."""
+
+from typing import Any
+
+from sqlalchemy import Result, TextClause, bindparam, text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from kodit.bm25.keyword_search_service import (
+    BM25Document,
+    BM25Result,
+    KeywordSearchProvider,
+)
+
+TABLE_NAME = "vectorchord_bm25_documents"
+INDEX_NAME = f"{TABLE_NAME}_idx"
+TOKENIZER_NAME = "bert"
+
+# SQL statements
+CREATE_VCHORD_EXTENSION = "CREATE EXTENSION IF NOT EXISTS vchord CASCADE;"
+CREATE_PG_TOKENIZER = "CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE;"
+CREATE_VCHORD_BM25 = "CREATE EXTENSION IF NOT EXISTS vchord_bm25 CASCADE;"
+SET_SEARCH_PATH = """
+SET search_path TO
+    "$user", public, bm25_catalog, pg_catalog, information_schema, tokenizer_catalog;
+"""
+CREATE_BM25_TABLE = f"""
+CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
+    id SERIAL PRIMARY KEY,
+    snippet_id BIGINT NOT NULL,
+    passage TEXT NOT NULL,
+    embedding bm25vector,
+    UNIQUE(snippet_id)
+)
+"""
+
+CREATE_BM25_INDEX = f"""
+CREATE INDEX IF NOT EXISTS {INDEX_NAME}
+ON {TABLE_NAME}
+USING bm25 (embedding bm25_ops)
+"""
+TOKENIZER_NAME_CHECK_QUERY = (
+    f"SELECT 1 FROM tokenizer_catalog.tokenizer WHERE name = '{TOKENIZER_NAME}'"  # noqa: S608
+)
+LOAD_TOKENIZER = """
+SELECT create_tokenizer('bert', $$
+model = "llmlingua2"
+pre_tokenizer = "unicode_segmentation"  # Unicode Standard Annex #29
+[[character_filters]]
+to_lowercase = {}  # convert all characters to lowercase
+[[character_filters]]
+unicode_normalization = "nfkd"  # Unicode Normalization Form KD
+[[token_filters]]
+skip_non_alphanumeric = {}  # remove non-alphanumeric tokens
+[[token_filters]]
+stopwords = "nltk_english"  # remove stopwords using the nltk dictionary
+[[token_filters]]
+stemmer = "english_porter2"  # stem tokens using the English Porter2 stemmer
+$$)
+"""
+INSERT_QUERY = f"""
+INSERT INTO {TABLE_NAME} (snippet_id, passage)
+VALUES (:snippet_id, :passage)
+ON CONFLICT (snippet_id) DO UPDATE
+SET passage = EXCLUDED.passage
+"""  # noqa: S608
+UPDATE_QUERY = f"""
+UPDATE {TABLE_NAME}
+SET embedding = tokenize(passage, '{TOKENIZER_NAME}')
+"""  # noqa: S608
+SEARCH_QUERY = f"""
+SELECT
+    snippet_id,
+    embedding <&>
+        to_bm25query('{INDEX_NAME}', tokenize(:query_text, '{TOKENIZER_NAME}'))
+    AS bm25_score
+FROM {TABLE_NAME}
+ORDER BY bm25_score
+LIMIT :limit
+"""  # noqa: S608
+DELETE_QUERY = f"""
+DELETE FROM {TABLE_NAME}
+WHERE snippet_id IN :snippet_ids
+"""  # noqa: S608
+
+
+class VectorChordBM25(KeywordSearchProvider):
+    """BM25 using VectorChord."""
+
+    def __init__(
+        self,
+        session: AsyncSession,
+    ) -> None:
+        """Initialize the VectorChord BM25."""
+        self.__session = session
+        self._initialized = False
+
+    async def _initialize(self) -> None:
+        """Initialize the VectorChord environment."""
+        try:
+            await self._create_extensions()
+            await self._create_tokenizer_if_not_exists()
+            await self._create_tables()
+            self._initialized = True
+        except Exception as e:
+            msg = f"Failed to initialize VectorChord repository: {e}"
+            raise RuntimeError(msg) from e
+
+    async def _create_extensions(self) -> None:
+        """Create the necessary extensions."""
+        await self.__session.execute(text(CREATE_VCHORD_EXTENSION))
+        await self.__session.execute(text(CREATE_PG_TOKENIZER))
+        await self.__session.execute(text(CREATE_VCHORD_BM25))
+        await self.__session.execute(text(SET_SEARCH_PATH))
+        await self._commit()
+
+    async def _create_tokenizer_if_not_exists(self) -> None:
+        """Create the tokenizer if it doesn't exist."""
+        # Check if tokenizer exists in the catalog
+        result = await self.__session.execute(text(TOKENIZER_NAME_CHECK_QUERY))
+        if result.scalar_one_or_none() is None:
+            # Tokenizer doesn't exist, create it
+            await self.__session.execute(text(LOAD_TOKENIZER))
+            await self._commit()
+
+    async def _create_tables(self) -> None:
+        """Create the necessary tables in the correct order."""
+        await self.__session.execute(text(CREATE_BM25_TABLE))
+        await self.__session.execute(text(CREATE_BM25_INDEX))
+        await self._commit()
+
+    async def _execute(
+        self, query: TextClause, param_list: list[Any] | dict[str, Any] | None = None
+    ) -> Result:
+        """Execute a query."""
+        if not self._initialized:
+            await self._initialize()
+        return await self.__session.execute(query, param_list)
+
+    async def _commit(self) -> None:
+        """Commit the session."""
+        await self.__session.commit()
+
+    async def index(self, corpus: list[BM25Document]) -> None:
+        """Index a new corpus."""
+        # Filter out any documents that don't have a snippet_id or text
+        corpus = [
+            doc
+            for doc in corpus
+            if doc.snippet_id is not None and doc.text is not None and doc.text != ""
+        ]
+
+        if not corpus:
+            return
+
+        # Execute inserts
+        await self._execute(
+            text(INSERT_QUERY),
+            [{"snippet_id": doc.snippet_id, "passage": doc.text} for doc in corpus],
+        )
+
+        # Tokenize the new documents with schema qualification
+        await self._execute(text(UPDATE_QUERY))
+        await self._commit()
+
+    async def delete(self, snippet_ids: list[int]) -> None:
+        """Delete documents from the index."""
+        await self._execute(
+            text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
+            {"snippet_ids": snippet_ids},
+        )
+        await self._commit()
+
+    async def retrieve(
+        self,
+        query: str,
+        top_k: int = 10,
+    ) -> list[BM25Result]:
+        """Search documents using BM25 similarity."""
+        if not query or query == "":
+            return []
+
+        sql = text(SEARCH_QUERY).bindparams(query_text=query, limit=top_k)
+        try:
+            result = await self._execute(sql)
+            rows = result.mappings().all()
+
+            return [
+                BM25Result(snippet_id=row["snippet_id"], score=row["bm25_score"])
+                for row in rows
+            ]
+        except Exception as e:
+            msg = f"Error during BM25 search: {e}"
+            raise RuntimeError(msg) from e
```
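And a similar sketch for the VectorChord-backed provider, run against the container from the docs section above. The extensions, tokenizer, table and index are created lazily on first use via `_execute`, so no separate migration step appears here; the connection values are the documentation's example values, not a recommendation.

```python
# Rough sketch of driving VectorChordBM25 directly against the example
# VectorChord container; credentials are the docs' placeholder values.
import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from kodit.bm25.keyword_search_service import BM25Document
from kodit.bm25.vectorchord_bm25 import VectorChordBM25

DB_URL = "postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit"


async def main() -> None:
    engine = create_async_engine(DB_URL)
    session_factory = async_sessionmaker(engine)
    async with session_factory() as session:
        provider = VectorChordBM25(session=session)
        # First call triggers lazy creation of extensions, tokenizer, table, index.
        await provider.index([BM25Document(snippet_id=1, text="func main() {}")])
        print(await provider.retrieve("main", top_k=10))
        await provider.delete([1])
    await engine.dispose()


asyncio.run(main())
```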