kodit 0.1.14__tar.gz → 0.1.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (125) hide show
  1. kodit-0.1.16/.github/dependabot.yml +10 -0
  2. kodit-0.1.16/.github/workflows/pull_request.yaml +35 -0
  3. {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/pypi.yaml +2 -2
  4. {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/test.yaml +18 -17
  5. kodit-0.1.16/.python-version +1 -0
  6. {kodit-0.1.14 → kodit-0.1.16}/Dockerfile +4 -4
  7. {kodit-0.1.14 → kodit-0.1.16}/PKG-INFO +4 -1
  8. {kodit-0.1.14 → kodit-0.1.16}/docs/_index.md +66 -1
  9. {kodit-0.1.14 → kodit-0.1.16}/pyproject.toml +3 -0
  10. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/_version.py +2 -2
  11. kodit-0.1.16/src/kodit/bm25/keyword_search_factory.py +17 -0
  12. kodit-0.1.16/src/kodit/bm25/keyword_search_service.py +34 -0
  13. kodit-0.1.14/src/kodit/bm25/bm25.py → kodit-0.1.16/src/kodit/bm25/local_bm25.py +40 -14
  14. kodit-0.1.16/src/kodit/bm25/vectorchord_bm25.py +193 -0
  15. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/cli.py +114 -25
  16. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/config.py +9 -2
  17. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/database.py +4 -2
  18. kodit-0.1.16/src/kodit/embedding/embedding_factory.py +44 -0
  19. kodit-0.1.16/src/kodit/embedding/embedding_provider/__init__.py +1 -0
  20. kodit-0.1.16/src/kodit/embedding/embedding_provider/embedding_provider.py +60 -0
  21. kodit-0.1.16/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
  22. kodit-0.1.16/src/kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
  23. kodit-0.1.16/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +75 -0
  24. kodit-0.1.14/src/kodit/search/search_repository.py → kodit-0.1.16/src/kodit/embedding/embedding_repository.py +61 -33
  25. kodit-0.1.16/src/kodit/embedding/local_vector_search_service.py +50 -0
  26. kodit-0.1.16/src/kodit/embedding/vector_search_service.py +38 -0
  27. kodit-0.1.16/src/kodit/embedding/vectorchord_vector_search_service.py +154 -0
  28. kodit-0.1.16/src/kodit/enrichment/__init__.py +1 -0
  29. kodit-0.1.16/src/kodit/enrichment/enrichment_factory.py +23 -0
  30. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/__init__.py +1 -0
  31. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
  32. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
  33. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
  34. kodit-0.1.16/src/kodit/enrichment/enrichment_service.py +33 -0
  35. kodit-0.1.16/src/kodit/indexing/fusion.py +67 -0
  36. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/indexing_repository.py +44 -4
  37. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/indexing_service.py +142 -31
  38. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/mcp.py +31 -18
  39. kodit-0.1.16/src/kodit/snippets/languages/go.scm +26 -0
  40. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/source_service.py +9 -3
  41. kodit-0.1.16/src/kodit/util/__init__.py +1 -0
  42. kodit-0.1.16/src/kodit/util/spinner.py +59 -0
  43. {kodit-0.1.14 → kodit-0.1.16}/tests/experiments/embedding.py +3 -3
  44. kodit-0.1.16/tests/experiments/similarity_test.py +73 -0
  45. kodit-0.1.16/tests/kodit/bm25/local_bm25_test.py +155 -0
  46. kodit-0.1.16/tests/kodit/bm25/vectorchord_repository_test.py +182 -0
  47. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/cli_test.py +16 -4
  48. kodit-0.1.16/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +93 -0
  49. kodit-0.1.16/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +218 -0
  50. kodit-0.1.16/tests/kodit/embedding/local_vector_search_service_test.py +143 -0
  51. kodit-0.1.16/tests/kodit/embedding/vectorchord_vector_search_service_test.py +231 -0
  52. kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/__init__.py +0 -0
  53. kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/openai_enrichment_provider_test.py +203 -0
  54. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/indexing/indexing_service_test.py +44 -25
  55. kodit-0.1.16/tests/kodit/snippets/__init__.py +0 -0
  56. kodit-0.1.16/tests/kodit/snippets/golang.go +28 -0
  57. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/method_extraction_test.py +38 -0
  58. {kodit-0.1.14 → kodit-0.1.16}/tests/smoke.sh +1 -1
  59. {kodit-0.1.14 → kodit-0.1.16}/uv.lock +28 -0
  60. kodit-0.1.14/.python-version +0 -1
  61. kodit-0.1.14/src/kodit/embedding/embedding.py +0 -203
  62. kodit-0.1.14/src/kodit/search/__init__.py +0 -1
  63. kodit-0.1.14/src/kodit/search/search_service.py +0 -147
  64. kodit-0.1.14/tests/kodit/embedding/embedding_test.py +0 -13
  65. kodit-0.1.14/tests/kodit/search/__init__.py +0 -1
  66. kodit-0.1.14/tests/kodit/search/search_repository_test.py +0 -124
  67. kodit-0.1.14/tests/kodit/search/search_service_test.py +0 -279
  68. {kodit-0.1.14 → kodit-0.1.16}/.cursor/rules/kodit.mdc +0 -0
  69. {kodit-0.1.14 → kodit-0.1.16}/.github/CODE_OF_CONDUCT.md +0 -0
  70. {kodit-0.1.14 → kodit-0.1.16}/.github/CONTRIBUTING.md +0 -0
  71. {kodit-0.1.14 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  72. {kodit-0.1.14 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  73. {kodit-0.1.14 → kodit-0.1.16}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  74. {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/docker.yaml +0 -0
  75. {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/docs.yaml +0 -0
  76. {kodit-0.1.14 → kodit-0.1.16}/.github/workflows/pypi-test.yaml +0 -0
  77. {kodit-0.1.14 → kodit-0.1.16}/.gitignore +0 -0
  78. {kodit-0.1.14 → kodit-0.1.16}/.vscode/launch.json +0 -0
  79. {kodit-0.1.14 → kodit-0.1.16}/.vscode/settings.json +0 -0
  80. {kodit-0.1.14 → kodit-0.1.16}/LICENSE +0 -0
  81. {kodit-0.1.14 → kodit-0.1.16}/README.md +0 -0
  82. {kodit-0.1.14 → kodit-0.1.16}/alembic.ini +0 -0
  83. {kodit-0.1.14 → kodit-0.1.16}/docs/developer/index.md +0 -0
  84. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/.gitignore +0 -0
  85. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/__init__.py +0 -0
  86. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/app.py +0 -0
  87. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/bm25/__init__.py +0 -0
  88. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/embedding/__init__.py +0 -0
  89. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/embedding/embedding_models.py +0 -0
  90. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/__init__.py +0 -0
  91. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/indexing/indexing_models.py +0 -0
  92. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/log.py +0 -0
  93. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/middleware.py +0 -0
  94. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/README +0 -0
  95. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/__init__.py +0 -0
  96. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/env.py +0 -0
  97. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/script.py.mako +0 -0
  98. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
  99. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
  100. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/migrations/versions/__init__.py +0 -0
  101. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/__init__.py +0 -0
  102. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/languages/__init__.py +0 -0
  103. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/languages/csharp.scm +0 -0
  104. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/languages/python.scm +0 -0
  105. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/method_snippets.py +0 -0
  106. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/snippets/snippets.py +0 -0
  107. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/__init__.py +0 -0
  108. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/source_models.py +0 -0
  109. {kodit-0.1.14 → kodit-0.1.16}/src/kodit/source/source_repository.py +0 -0
  110. {kodit-0.1.14 → kodit-0.1.16}/tests/__init__.py +0 -0
  111. {kodit-0.1.14 → kodit-0.1.16}/tests/conftest.py +0 -0
  112. {kodit-0.1.14 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +0 -0
  113. {kodit-0.1.14 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +0 -0
  114. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/__init__.py +0 -0
  115. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/e2e.py +0 -0
  116. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/embedding/__init__.py +0 -0
  117. {kodit-0.1.14/tests/kodit/snippets → kodit-0.1.16/tests/kodit/enrichment}/__init__.py +0 -0
  118. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/indexing/__init__.py +0 -0
  119. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/mcp_test.py +0 -0
  120. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/csharp.cs +0 -0
  121. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/detect_language_test.py +0 -0
  122. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/snippets/python.py +0 -0
  123. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/source/__init__.py +0 -0
  124. {kodit-0.1.14 → kodit-0.1.16}/tests/kodit/source/source_service_test.py +0 -0
  125. {kodit-0.1.14 → kodit-0.1.16}/tests/performance/similarity.py +0 -0
@@ -0,0 +1,10 @@
1
# Dependabot configuration: weekly update checks for the "uv" and "docker"
# package ecosystems, both rooted at the repository top level.
version: 2
updates:
  - package-ecosystem: uv
    directory: /
    schedule:
      interval: weekly
  - package-ecosystem: docker
    directory: /
    schedule:
      interval: weekly
@@ -0,0 +1,35 @@
1
# This workflow installs dependencies, runs the tests with coverage, and posts
# the results on the pull request using Pytest Coverage Comment.
# For more information see: https://github.com/MishaKav/pytest-coverage-comment/
name: pytest-coverage-comment
on:
  pull_request:
    branches:
      # "**" also matches branch names that contain "/" (e.g. release/1.0);
      # a single "*" does not.
      - "**"

# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
# `contents` is for permission to the contents of the repository.
# `pull-requests` is for permission to pull requests.
permissions:
  contents: write
  checks: write
  pull-requests: write

jobs:
  coverage-comment:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version-file: ".python-version"
      - uses: astral-sh/setup-uv@v5
      - run: uv sync --locked --all-extras --dev

      - name: Run tests
        # An explicit `shell: bash` makes the runner enable `pipefail`, so a
        # pytest failure is no longer hidden behind the exit status of `tee`.
        shell: bash
        run: uv run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=src tests/kodit | tee pytest-coverage.txt

      - name: Pytest coverage comment
        # Still publish the report when the test step above failed.
        if: ${{ !cancelled() }}
        # Pinned to a release tag instead of the mutable `main` branch because
        # this workflow runs with write permissions.
        uses: MishaKav/pytest-coverage-comment@v1.1.54
        with:
          pytest-coverage-path: ./pytest-coverage.txt
          junitxml-path: ./pytest.xml
@@ -42,10 +42,10 @@ jobs:
42
42
  if curl -sfL https://pypi.org/packages/source/${REPO_NAME_FIRST_LETTER}/${REPO_NAME}/${REPO_NAME}-${REPO_TAG}.tar.gz > /dev/null; then
43
43
  break
44
44
  fi
45
- sleep 1
45
+ sleep 5
46
46
  count=$((count+1))
47
47
  if [ $count -ge 60 ]; then
48
- echo "Timeout reached after 60 seconds"
48
+ echo "Timeout reached after 300 seconds"
49
49
  exit 1
50
50
  fi
51
51
  done
@@ -9,7 +9,6 @@ on:
9
9
  permissions:
10
10
  contents: read # Needed to check out code
11
11
  checks: write # Needed to report test results
12
- pull-requests: write # Needed to add comments/annotations to PRs
13
12
 
14
13
  jobs:
15
14
  test:
@@ -37,12 +36,6 @@ jobs:
37
36
  - name: Run tests
38
37
  run: uv run pytest -s --cov=src --cov-report=xml tests/kodit
39
38
 
40
- - name: Pytest coverage comment
41
- if: github.event_name == 'pull_request'
42
- uses: MishaKav/pytest-coverage-comment@v1.1.54
43
- with:
44
- pytest-xml-coverage-path: ./coverage.xml
45
-
46
39
  build-package:
47
40
  runs-on: ubuntu-latest
48
41
  timeout-minutes: 10
@@ -58,23 +51,29 @@ jobs:
58
51
  - name: Install uv
59
52
  uses: astral-sh/setup-uv@v5
60
53
 
61
- - run: uv build --sdist --out-dir test-build
54
+ - run: uv build --wheel --out-dir test-build
62
55
 
63
56
  - name: Upload built package
64
57
  uses: actions/upload-artifact@v4
65
58
  with:
66
59
  name: built-package
67
- path: test-build/*.tar.gz
60
+ path: test-build/*.whl
68
61
 
69
62
  test-package:
70
63
  needs: build-package
71
64
  runs-on: ubuntu-latest
65
+ strategy:
66
+ matrix:
67
+ python-version:
68
+ - 3.12
69
+ - 3.13
72
70
  timeout-minutes: 10
73
71
  steps:
74
72
  - uses: actions/checkout@v4
75
73
  with:
76
74
  sparse-checkout: |
77
75
  tests/smoke.sh
76
+ uv.lock
78
77
  sparse-checkout-cone-mode: false
79
78
 
80
79
  - name: Download built package
@@ -86,16 +85,18 @@ jobs:
86
85
  - name: "Set up Python"
87
86
  uses: actions/setup-python@v5
88
87
  with:
89
- python-version: 3.12
88
+ python-version: ${{ matrix.python-version }}
89
+
90
+ - name: Install uv
91
+ uses: astral-sh/setup-uv@v5
90
92
 
91
- - name: Extract path to sdist
92
- id: sdist_path
93
- run: echo "sdist_path=$(ls test-build/*.tar.gz)" >> $GITHUB_OUTPUT
93
+ - name: Extract path to wheel
94
+ id: wheel_path
95
+ run: echo "wheel_path=$(ls test-build/*.whl)" >> $GITHUB_OUTPUT
94
96
 
95
- - name: Install sdist
96
- uses: threeal/pipx-install-action@v1.0.0
97
- with:
98
- packages: "${{ steps.sdist_path.outputs.sdist_path }}"
97
+ # This is equivalent to `pipx install --include-deps`, but faster
98
+ - name: Install wheel
99
+ run: uv tool install "${{ steps.wheel_path.outputs.wheel_path }}"
99
100
 
100
101
  - name: Run simple version command test
101
102
  run: kodit version
@@ -0,0 +1 @@
1
+ 3.13
@@ -1,5 +1,5 @@
1
1
  # syntax=docker/dockerfile:1.9
2
- FROM python:3.12.10-slim-bookworm AS build
2
+ FROM python:3.13.3-slim-bookworm AS build
3
3
 
4
4
  # The following does not work in Podman unless you build in Docker
5
5
  # compatibility mode: <https://github.com/containers/podman/issues/8477>
@@ -23,12 +23,12 @@ COPY --from=ghcr.io/astral-sh/uv:0.7.2 /uv /usr/local/bin/uv
23
23
  # - Silence uv complaining about not being able to use hard links,
24
24
  # - tell uv to byte-compile packages for faster application startups,
25
25
  # - prevent uv from accidentally downloading isolated Python builds,
26
- # - pick a Python (use `/usr/bin/python3.12` on uv 0.5.0 and later),
26
+ # - pick a Python (use `/usr/bin/python3.13` on uv 0.5.0 and later),
27
27
  # - and finally declare `/app` as the target for `uv sync`.
28
28
  ENV UV_LINK_MODE=copy \
29
29
  UV_COMPILE_BYTECODE=1 \
30
30
  UV_PYTHON_DOWNLOADS=never \
31
- UV_PYTHON=python3.12 \
31
+ UV_PYTHON=python3.13 \
32
32
  UV_PROJECT_ENVIRONMENT=/app
33
33
 
34
34
  # Synchronize DEPENDENCIES without the application itself.
@@ -60,7 +60,7 @@ RUN --mount=type=cache,target=/root/.cache \
60
60
 
61
61
  ##########################################################################
62
62
 
63
- FROM python:3.12.10-slim-bookworm
63
+ FROM python:3.13.3-slim-bookworm
64
64
  SHELL ["sh", "-exc"]
65
65
 
66
66
  ENV PATH=/app/bin:$PATH
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.1.14
3
+ Version: 0.1.16
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -15,12 +15,14 @@ Keywords: ai,indexing,mcp,rag
15
15
  Classifier: Development Status :: 2 - Pre-Alpha
16
16
  Classifier: Intended Audience :: Developers
17
17
  Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
18
19
  Classifier: Topic :: Software Development :: Code Generators
19
20
  Requires-Python: >=3.12
20
21
  Requires-Dist: aiofiles>=24.1.0
21
22
  Requires-Dist: aiosqlite>=0.20.0
22
23
  Requires-Dist: alembic>=1.15.2
23
24
  Requires-Dist: asgi-correlation-id>=4.3.4
25
+ Requires-Dist: asyncpg>=0.30.0
24
26
  Requires-Dist: better-exceptions>=0.3.3
25
27
  Requires-Dist: bm25s[core]>=0.2.12
26
28
  Requires-Dist: click>=8.1.8
@@ -41,6 +43,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.40
41
43
  Requires-Dist: structlog>=25.3.0
42
44
  Requires-Dist: tdqm>=0.0.1
43
45
  Requires-Dist: tiktoken>=0.9.0
46
+ Requires-Dist: transformers>=4.51.3
44
47
  Requires-Dist: tree-sitter-language-pack>=0.7.3
45
48
  Requires-Dist: tree-sitter>=0.24.0
46
49
  Requires-Dist: uritools>=5.0.0
@@ -169,7 +169,7 @@ recreate all indexes.
169
169
 
170
170
  ### Indexing
171
171
 
172
- #### Default Provider
172
+ #### Default Indexing Provider
173
173
 
174
174
  By default, Kodit will use small local models for semantic search and enrichment. If you
175
175
  are using Kodit in a professional capacity, it is likely that the local model latency is
@@ -188,6 +188,71 @@ DEFAULT_ENDPOINT_BASE_URL=https://api.openai.com/v1
188
188
  DEFAULT_ENDPOINT_API_KEY=sk-xxxxxx
189
189
  ```
190
190
 
191
+ ### Database
192
+
193
+ Out of the box, Kodit uses a local SQLite file to make it easier for users to get
194
+ started. But for production use, it's likely you will want to use a database that has
195
+ dedicated semantic and keyword search capabilities for reduced latency.
196
+
197
+ #### VectorChord Database
198
+
199
+ [VectorChord](https://github.com/tensorchord/VectorChord) is an optimized PostgreSQL
200
+ extension that provides both vector and BM25 search. (See [Search](#search))
201
+
202
+ Start a container with:
203
+
204
+ ```sh
205
+ docker run \
206
+ --name kodit-vectorchord \
207
+ -e POSTGRES_DB=kodit \
208
+ -e POSTGRES_PASSWORD=mysecretpassword \
209
+ -p 5432:5432 \
210
+ -d tensorchord/vchord-suite:pg17-20250601
211
+ ```
212
+
213
+ {{< warn >}}
214
+ Kodit assumes the database exists. In the above example I'm abusing the POSTGRES_DB
215
+ environment variable from the [Postgres Docker
216
+ container](https://hub.docker.com/_/postgres/) to create the database for me. In
217
+ production setups, please create a database yourself.
218
+ {{< /warn >}}
219
+
220
+ Then update your `.env` file to include:
221
+
222
+ ```env
223
+ DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
224
+ ```
225
+
226
+ ### Search
227
+
228
+ #### Default Search Provider
229
+
230
+ By default, Kodit will use built-in implementations of BM25 and similarity search to
231
+ improve the out of the box experience. If you are using Kodit in a professional
232
+ capacity, it is likely that the search latency is too high to provide a good developer
233
+ experience.
234
+
235
+ Instead, you should use the features included in your database. The settings provided
236
+ here will cause all search functionality to use this database by default. You can
237
+ override the database used for each search type if you wish. (Coming soon!)
238
+
239
+ ##### VectorChord Search
240
+
241
+ Configure Kodit to use a [VectorChord database](#vectorchord-database).
242
+
243
+ Then update your `.env` file to include:
244
+
245
+ ```env
246
+ DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
247
+ DEFAULT_SEARCH_PROVIDER=vectorchord
248
+ ```
249
+
250
+ ### Enrichment
251
+
252
+ #### Default Enrichment Provider
253
+
254
+ The default enrichment provider is the same as [the default indexing provider](#default-indexing-provider).
255
+
191
256
  ## Managing Kodit
192
257
 
193
258
  There is limited management functionality at this time. To delete indexes you must
@@ -18,6 +18,7 @@ classifiers = [
18
18
 
19
19
  # Specify the Python versions you support here.
20
20
  "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
21
22
  ]
22
23
  requires-python = ">=3.12"
23
24
  dependencies = [
@@ -48,6 +49,8 @@ dependencies = [
48
49
  "hf-xet>=1.1.2",
49
50
  "openai>=1.82.0",
50
51
  "tiktoken>=0.9.0",
52
+ "asyncpg>=0.30.0",
53
+ "transformers>=4.51.3",
51
54
  ]
52
55
 
53
56
  [dependency-groups]
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.14'
21
- __version_tuple__ = version_tuple = (0, 1, 14)
20
+ __version__ = version = '0.1.16'
21
+ __version_tuple__ = version_tuple = (0, 1, 16)
@@ -0,0 +1,17 @@
1
+ """Factory for creating keyword search providers."""
2
+
3
+ from sqlalchemy.ext.asyncio import AsyncSession
4
+
5
+ from kodit.bm25.keyword_search_service import KeywordSearchProvider
6
+ from kodit.bm25.local_bm25 import BM25Service
7
+ from kodit.bm25.vectorchord_bm25 import VectorChordBM25
8
+ from kodit.config import AppContext
9
+
10
+
11
def keyword_search_factory(
    app_context: AppContext, session: AsyncSession
) -> KeywordSearchProvider:
    """Build the keyword search provider selected by the application config.

    Args:
        app_context: Application context; ``default_search.provider`` picks the
            backend and ``get_data_dir()`` supplies the directory handed to the
            local service.
        session: Async SQLAlchemy session passed to the VectorChord provider.

    Returns:
        A ``VectorChordBM25`` when the configured provider is ``"vectorchord"``,
        otherwise a ``BM25Service``.

    """
    wants_vectorchord = app_context.default_search.provider == "vectorchord"
    if not wants_vectorchord:
        # Any other provider value falls back to the local implementation.
        return BM25Service(data_dir=app_context.get_data_dir())
    return VectorChordBM25(session=session)
@@ -0,0 +1,34 @@
1
+ """Keyword search service."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import NamedTuple
5
+
6
+
7
class BM25Document(NamedTuple):
    """A single document handed to a keyword search provider for indexing."""

    # Identifier of the snippet the text belongs to.
    snippet_id: int
    # Text content to be indexed.
    text: str
12
+
13
+
14
class BM25Result(NamedTuple):
    """A single hit returned by a keyword search provider."""

    # Identifier of the matching snippet.
    snippet_id: int
    # Score the provider assigned to the match.
    score: float
19
+
20
+
21
class KeywordSearchProvider(ABC):
    """Abstract contract that every keyword search backend implements."""

    @abstractmethod
    async def index(self, corpus: list[BM25Document]) -> None:
        """Add the given documents to the keyword index.

        Args:
            corpus: Documents (snippet id plus text) to index.

        """

    @abstractmethod
    async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
        """Search the index for a query.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to return.

        Returns:
            The matching snippet ids together with their scores.

        """

    @abstractmethod
    async def delete(self, snippet_ids: list[int]) -> None:
        """Remove the documents with the given snippet ids from the index.

        Args:
            snippet_ids: Identifiers of the snippets to remove.

        """
@@ -1,23 +1,36 @@
1
- """BM25 service."""
1
+ """Locally hosted BM25 service primarily for use with SQLite."""
2
2
 
3
+ import json
3
4
  from pathlib import Path
4
5
 
6
+ import aiofiles
5
7
  import bm25s
6
8
  import Stemmer
7
9
  import structlog
8
10
  from bm25s.tokenization import Tokenized
9
11
 
12
+ from kodit.bm25.keyword_search_service import (
13
+ BM25Document,
14
+ BM25Result,
15
+ KeywordSearchProvider,
16
+ )
10
17
 
11
- class BM25Service:
12
- """Service for BM25."""
18
+ SNIPPET_IDS_FILE = "snippet_ids.jsonl"
19
+
20
+
21
+ class BM25Service(KeywordSearchProvider):
22
+ """LocalBM25 service."""
13
23
 
14
24
  def __init__(self, data_dir: Path) -> None:
15
25
  """Initialize the BM25 service."""
16
26
  self.log = structlog.get_logger(__name__)
17
27
  self.index_path = data_dir / "bm25s_index"
28
+ self.snippet_ids: list[int] = []
18
29
  try:
19
30
  self.log.debug("Loading BM25 index")
20
31
  self.retriever = bm25s.BM25.load(self.index_path, mmap=True)
32
+ with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
33
+ self.snippet_ids = json.load(f)
21
34
  except FileNotFoundError:
22
35
  self.log.debug("BM25 index not found, creating new index")
23
36
  self.retriever = bm25s.BM25()
@@ -33,28 +46,34 @@ class BM25Service:
33
46
  show_progress=True,
34
47
  )
35
48
 
36
- def index(self, corpus: list[str]) -> None:
49
async def index(self, corpus: list[BM25Document]) -> None:
    """Build a fresh BM25 index from ``corpus`` and persist it.

    A new retriever is created on every call, so the resulting index contains
    only the documents passed in here. The snippet id list is written next to
    the saved index so that positions in the index can be mapped back to
    snippet ids when retrieving.

    Args:
        corpus: Documents (snippet id plus text) to index.

    """
    self.log.debug("Indexing corpus")
    vocab = self._tokenize([doc.text for doc in corpus])
    self.retriever = bm25s.BM25()
    self.retriever.index(vocab, show_progress=False)
    self.retriever.save(self.index_path)
    # The retriever was rebuilt from `corpus` alone, so the id list has to be
    # replaced (not extended): retrieve() passes it as the positional corpus,
    # and stale ids from an earlier call would shift every lookup.
    self.snippet_ids = [doc.snippet_id for doc in corpus]
    async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
        await f.write(json.dumps(self.snippet_ids))
43
59
 
44
- def retrieve(
45
- self, doc_ids: list[int], query: str, top_k: int = 2
46
- ) -> list[tuple[int, float]]:
60
+ async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
47
61
  """Retrieve from the index."""
48
62
  if top_k == 0:
49
63
  self.log.warning("Top k is 0, returning empty list")
50
64
  return []
51
- if len(doc_ids) == 0:
52
- self.log.warning("No documents to retrieve from, returning empty list")
65
+
66
+ # Get the number of documents in the index
67
+ num_docs = self.retriever.scores["num_docs"]
68
+ if num_docs == 0:
53
69
  return []
54
70
 
55
- top_k = min(top_k, len(self.retriever.scores))
71
+ # Adjust top_k to not exceed corpus size
72
+ top_k = min(top_k, num_docs)
56
73
  self.log.debug(
57
- "Retrieving from index", query=query, top_k=top_k, num_docs=len(doc_ids)
74
+ "Retrieving from index",
75
+ query=query,
76
+ top_k=top_k,
58
77
  )
59
78
 
60
79
  query_tokens = self._tokenize([query])
@@ -62,10 +81,17 @@ class BM25Service:
62
81
  self.log.debug("Query tokens", query_tokens=query_tokens)
63
82
 
64
83
  results, scores = self.retriever.retrieve(
65
- query_tokens=query_tokens, corpus=doc_ids, k=top_k
84
+ query_tokens=query_tokens,
85
+ corpus=self.snippet_ids,
86
+ k=top_k,
66
87
  )
67
88
  self.log.debug("Raw results", results=results, scores=scores)
68
89
  return [
69
- (int(result), float(score))
90
+ BM25Result(snippet_id=int(result), score=float(score))
70
91
  for result, score in zip(results[0], scores[0], strict=False)
92
+ if score > 0.0
71
93
  ]
94
+
95
async def delete(self, snippet_ids: list[int]) -> None:  # noqa: ARG002
    """Log that deletion is unavailable; the local index is left untouched.

    Args:
        snippet_ids: Ignored; accepted only to satisfy the provider interface.

    """
    unsupported_msg = "Deletion not supported for local BM25 index"
    self.log.warning(unsupported_msg)
@@ -0,0 +1,193 @@
1
+ """VectorChord repository for document operations."""
2
+
3
+ from typing import Any
4
+
5
+ from sqlalchemy import Result, TextClause, bindparam, text
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from kodit.bm25.keyword_search_service import (
9
+ BM25Document,
10
+ BM25Result,
11
+ KeywordSearchProvider,
12
+ )
13
+
14
# Names of the database objects managed by this module.
TABLE_NAME = "vectorchord_bm25_documents"
INDEX_NAME = f"{TABLE_NAME}_idx"
TOKENIZER_NAME = "bert"

# SQL statements
CREATE_VCHORD_EXTENSION = "CREATE EXTENSION IF NOT EXISTS vchord CASCADE;"
CREATE_PG_TOKENIZER = "CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE;"
CREATE_VCHORD_BM25 = "CREATE EXTENSION IF NOT EXISTS vchord_bm25 CASCADE;"
SET_SEARCH_PATH = """
SET search_path TO
    "$user", public, bm25_catalog, pg_catalog, information_schema, tokenizer_catalog;
"""
CREATE_BM25_TABLE = f"""
CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
    id SERIAL PRIMARY KEY,
    snippet_id BIGINT NOT NULL,
    passage TEXT NOT NULL,
    embedding bm25vector,
    UNIQUE(snippet_id)
)
"""

CREATE_BM25_INDEX = f"""
CREATE INDEX IF NOT EXISTS {INDEX_NAME}
ON {TABLE_NAME}
USING bm25 (embedding bm25_ops)
"""
TOKENIZER_NAME_CHECK_QUERY = (
    f"SELECT 1 FROM tokenizer_catalog.tokenizer WHERE name = '{TOKENIZER_NAME}'"  # noqa: S608
)
# The tokenizer is created under TOKENIZER_NAME so that it always matches the
# name used by the existence check above and by the tokenize() calls below.
# Literal braces in the TOML body are doubled because this is an f-string.
LOAD_TOKENIZER = f"""
SELECT create_tokenizer('{TOKENIZER_NAME}', $$
model = "llmlingua2"
pre_tokenizer = "unicode_segmentation"  # Unicode Standard Annex #29
[[character_filters]]
to_lowercase = {{}}                       # convert all characters to lowercase
[[character_filters]]
unicode_normalization = "nfkd"          # Unicode Normalization Form KD
[[token_filters]]
skip_non_alphanumeric = {{}}              # remove non-alphanumeric tokens
[[token_filters]]
stopwords = "nltk_english"              # remove stopwords using the nltk dictionary
[[token_filters]]
stemmer = "english_porter2"             # stem tokens using the English Porter2 stemmer
$$)
"""  # noqa: S608
# Upsert keyed on snippet_id: an existing row gets its passage overwritten.
INSERT_QUERY = f"""
INSERT INTO {TABLE_NAME} (snippet_id, passage)
VALUES (:snippet_id, :passage)
ON CONFLICT (snippet_id) DO UPDATE
SET passage = EXCLUDED.passage
"""  # noqa: S608
# No WHERE clause: every row's embedding is recomputed from its passage.
UPDATE_QUERY = f"""
UPDATE {TABLE_NAME}
SET embedding = tokenize(passage, '{TOKENIZER_NAME}')
"""  # noqa: S608
SEARCH_QUERY = f"""
SELECT
    snippet_id,
    embedding <&>
        to_bm25query('{INDEX_NAME}', tokenize(:query_text, '{TOKENIZER_NAME}'))
    AS bm25_score
FROM {TABLE_NAME}
ORDER BY bm25_score
LIMIT :limit
"""  # noqa: S608
# :snippet_ids is expected to be bound as an expanding (list) parameter.
DELETE_QUERY = f"""
DELETE FROM {TABLE_NAME}
WHERE snippet_id IN :snippet_ids
"""  # noqa: S608
84
+
85
+
86
class VectorChordBM25(KeywordSearchProvider):
    """BM25 keyword search stored in PostgreSQL through VectorChord.

    The extensions, tokenizer, table and index are created lazily the first
    time a statement is sent through ``_execute``.
    """

    def __init__(
        self,
        session: AsyncSession,
    ) -> None:
        """Store the session; no database work happens until first use.

        Args:
            session: Async SQLAlchemy session used for every statement and
                commit issued by this provider.

        """
        self.__session = session
        self._initialized = False

    async def _initialize(self) -> None:
        """Create extensions, tokenizer and tables, then mark as initialized.

        Raises:
            RuntimeError: If any setup step fails; the original exception is
                chained as the cause.

        """
        try:
            await self._create_extensions()
            await self._create_tokenizer_if_not_exists()
            await self._create_tables()
            self._initialized = True
        except Exception as e:
            msg = f"Failed to initialize VectorChord repository: {e}"
            raise RuntimeError(msg) from e

    async def _create_extensions(self) -> None:
        """Create the required extensions and set the search path."""
        await self.__session.execute(text(CREATE_VCHORD_EXTENSION))
        await self.__session.execute(text(CREATE_PG_TOKENIZER))
        await self.__session.execute(text(CREATE_VCHORD_BM25))
        await self.__session.execute(text(SET_SEARCH_PATH))
        await self._commit()

    async def _create_tokenizer_if_not_exists(self) -> None:
        """Create the tokenizer unless the catalog already lists it."""
        result = await self.__session.execute(text(TOKENIZER_NAME_CHECK_QUERY))
        if result.scalar_one_or_none() is None:
            # Tokenizer doesn't exist yet, create it.
            await self.__session.execute(text(LOAD_TOKENIZER))
            await self._commit()

    async def _create_tables(self) -> None:
        """Create the document table first, then the BM25 index on it."""
        await self.__session.execute(text(CREATE_BM25_TABLE))
        await self.__session.execute(text(CREATE_BM25_INDEX))
        await self._commit()

    async def _execute(
        self, query: TextClause, param_list: list[Any] | dict[str, Any] | None = None
    ) -> Result:
        """Execute a statement, running the one-time setup first if needed.

        Args:
            query: Statement to execute.
            param_list: Optional bind parameters; a list of dicts runs the
                statement once per entry.

        Returns:
            The result object returned by the session.

        """
        if not self._initialized:
            await self._initialize()
        return await self.__session.execute(query, param_list)

    async def _commit(self) -> None:
        """Commit the session."""
        await self.__session.commit()

    async def index(self, corpus: list[BM25Document]) -> None:
        """Upsert the documents and recompute the BM25 embeddings.

        Documents without a snippet id or with empty text are skipped. When
        nothing is left, the method returns without touching the database.

        Args:
            corpus: Documents (snippet id plus text) to index.

        """
        # Filter out any documents that don't have a snippet_id or text
        corpus = [
            doc
            for doc in corpus
            if doc.snippet_id is not None and doc.text is not None and doc.text != ""
        ]

        if not corpus:
            return

        # Execute inserts
        await self._execute(
            text(INSERT_QUERY),
            [{"snippet_id": doc.snippet_id, "passage": doc.text} for doc in corpus],
        )

        # UPDATE_QUERY has no WHERE clause, so this re-tokenizes every stored
        # passage, not only the rows inserted above.
        await self._execute(text(UPDATE_QUERY))
        await self._commit()

    async def delete(self, snippet_ids: list[int]) -> None:
        """Delete the rows belonging to the given snippet ids.

        Args:
            snippet_ids: Identifiers of the snippets to remove.

        """
        await self._execute(
            # The id list is bound as an expanding parameter for the IN clause.
            text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
            {"snippet_ids": snippet_ids},
        )
        await self._commit()

    async def retrieve(
        self,
        query: str,
        top_k: int = 10,
    ) -> list[BM25Result]:
        """Search documents using BM25 similarity.

        Args:
            query: Free-text search query; an empty query yields no results.
            top_k: Maximum number of results; a non-positive value yields no
                results without querying the database.

        Returns:
            Results in the order produced by the query's
            ``ORDER BY bm25_score``.

        Raises:
            RuntimeError: If the search statement fails; the original
                exception is chained as the cause.

        """
        if not query:
            return []
        if top_k <= 0:
            # Nothing can be returned, and a negative value must not reach LIMIT.
            return []

        # NOTE(review): VectorChord-bm25 documents `<&>` as yielding the negative
        # BM25 score, which would explain the ascending sort - confirm; callers
        # receive that raw value as `score`.
        sql = text(SEARCH_QUERY).bindparams(query_text=query, limit=top_k)
        try:
            result = await self._execute(sql)
            rows = result.mappings().all()

            return [
                BM25Result(snippet_id=row["snippet_id"], score=row["bm25_score"])
                for row in rows
            ]
        except Exception as e:
            msg = f"Error during BM25 search: {e}"
            raise RuntimeError(msg) from e