kodit 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit-0.1.15/.cursor/rules/kodit.mdc +8 -0
- kodit-0.1.15/.github/dependabot.yml +10 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/pypi.yaml +5 -5
- {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/test.yaml +12 -10
- {kodit-0.1.13 → kodit-0.1.15}/PKG-INFO +2 -1
- kodit-0.1.15/docs/_index.md +253 -0
- {kodit-0.1.13 → kodit-0.1.15}/pyproject.toml +1 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/_version.py +2 -2
- kodit-0.1.15/src/kodit/bm25/keyword_search_factory.py +17 -0
- kodit-0.1.15/src/kodit/bm25/keyword_search_service.py +34 -0
- kodit-0.1.13/src/kodit/bm25/bm25.py → kodit-0.1.15/src/kodit/bm25/local_bm25.py +40 -14
- kodit-0.1.15/src/kodit/bm25/vectorchord_bm25.py +193 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/cli.py +14 -11
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/config.py +9 -2
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/database.py +4 -2
- kodit-0.1.15/src/kodit/embedding/embedding_factory.py +44 -0
- kodit-0.1.15/src/kodit/embedding/embedding_provider/__init__.py +1 -0
- kodit-0.1.15/src/kodit/embedding/embedding_provider/embedding_provider.py +53 -0
- kodit-0.1.15/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
- kodit-0.1.15/src/kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
- kodit-0.1.15/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +63 -0
- kodit-0.1.13/src/kodit/search/search_repository.py → kodit-0.1.15/src/kodit/embedding/embedding_repository.py +61 -33
- kodit-0.1.15/src/kodit/embedding/local_vector_search_service.py +50 -0
- kodit-0.1.15/src/kodit/embedding/vector_search_service.py +38 -0
- kodit-0.1.15/src/kodit/embedding/vectorchord_vector_search_service.py +145 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/indexing_repository.py +24 -4
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/indexing_service.py +25 -30
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/mcp.py +28 -7
- kodit-0.1.15/src/kodit/search/search_repository.py +57 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/search/search_service.py +12 -24
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/source_service.py +9 -3
- kodit-0.1.15/src/kodit/util/__init__.py +1 -0
- kodit-0.1.15/src/kodit/util/spinner.py +59 -0
- kodit-0.1.15/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +631 -0
- kodit-0.1.15/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +141 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/experiments/embedding.py +3 -3
- kodit-0.1.15/tests/kodit/bm25/local_bm25_test.py +155 -0
- kodit-0.1.15/tests/kodit/bm25/vectorchord_repository_test.py +182 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/cli_test.py +16 -4
- kodit-0.1.15/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +93 -0
- kodit-0.1.15/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +138 -0
- kodit-0.1.15/tests/kodit/embedding/local_vector_search_service_test.py +143 -0
- kodit-0.1.15/tests/kodit/embedding/vectorchord_vector_search_service_test.py +230 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/indexing/indexing_service_test.py +38 -22
- kodit-0.1.15/tests/kodit/search/search_repository_test.py +57 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/search/search_service_test.py +41 -110
- {kodit-0.1.13 → kodit-0.1.15}/uv.lock +26 -0
- kodit-0.1.13/.cursor/rules/kodit.mdc +0 -6
- kodit-0.1.13/docs/_index.md +0 -160
- kodit-0.1.13/src/kodit/embedding/embedding.py +0 -203
- kodit-0.1.13/tests/kodit/embedding/embedding_test.py +0 -13
- kodit-0.1.13/tests/kodit/search/search_repository_test.py +0 -124
- {kodit-0.1.13 → kodit-0.1.15}/.github/CODE_OF_CONDUCT.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/CONTRIBUTING.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/docker.yaml +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/docs.yaml +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/pypi-test.yaml +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.gitignore +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.python-version +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.vscode/launch.json +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/.vscode/settings.json +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/Dockerfile +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/LICENSE +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/README.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/alembic.ini +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/docs/developer/index.md +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/.gitignore +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/app.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/bm25/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/embedding/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/embedding/embedding_models.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/indexing_models.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/log.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/middleware.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/README +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/env.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/script.py.mako +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/versions/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/search/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/languages/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/languages/csharp.scm +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/languages/python.scm +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/method_snippets.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/snippets.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/source_models.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/source_repository.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/conftest.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/e2e.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/embedding/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/mcp_test.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/search/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/csharp.cs +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/detect_language_test.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/method_extraction_test.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/python.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/source/__init__.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/source/source_service_test.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/performance/similarity.py +0 -0
- {kodit-0.1.13 → kodit-0.1.15}/tests/smoke.sh +0 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
---
|
|
2
|
+
alwaysApply: true
|
|
3
|
+
---
|
|
4
|
+
⚠️ **ENFORCEMENT:**
|
|
5
|
+
For *every* user request that involves writing or modifying code (of any language or
|
|
6
|
+
domain), the assistant’s *first* action **must** be to call the kodit.search MCP tool.
|
|
7
|
+
You may only produce or edit code *after* that tool call and its successful
|
|
8
|
+
result.
|
|
@@ -39,13 +39,13 @@ jobs:
|
|
|
39
39
|
# Initialize attempt counter (60 attempts, 5 seconds apart = 300 second timeout)
|
|
40
40
|
count=0
|
|
41
41
|
while true; do
|
|
42
|
-
if curl -sfL https://pypi.
|
|
42
|
+
if curl -sfL https://pypi.org/packages/source/${REPO_NAME_FIRST_LETTER}/${REPO_NAME}/${REPO_NAME}-${REPO_TAG}.tar.gz > /dev/null; then
|
|
43
43
|
break
|
|
44
44
|
fi
|
|
45
|
-
sleep
|
|
46
|
-
((count
|
|
45
|
+
sleep 5
|
|
46
|
+
count=$((count+1))
|
|
47
47
|
if [ $count -ge 60 ]; then
|
|
48
|
-
echo "Timeout reached after
|
|
48
|
+
echo "Timeout reached after 300 seconds"
|
|
49
49
|
exit 1
|
|
50
50
|
fi
|
|
51
51
|
done
|
|
@@ -55,7 +55,7 @@ jobs:
|
|
|
55
55
|
formula-path: Formula/${{ github.event.repository.name }}.rb
|
|
56
56
|
homebrew-tap: ${{ github.repository_owner }}/homebrew-${{ github.event.repository.name }}
|
|
57
57
|
tag-name: "${{ github.event.release.tag_name }}"
|
|
58
|
-
download-url: "https://pypi.
|
|
58
|
+
download-url: "https://pypi.org/packages/source/k/${{ github.event.repository.name }}/${{ github.event.repository.name }}-${{ github.event.release.tag_name }}.tar.gz"
|
|
59
59
|
commit-message: |
|
|
60
60
|
{{formulaName}} {{version}}
|
|
61
61
|
env:
|
|
@@ -35,7 +35,7 @@ jobs:
|
|
|
35
35
|
run: uv run ruff check
|
|
36
36
|
|
|
37
37
|
- name: Run tests
|
|
38
|
-
run: uv run pytest -s --cov=src --cov-report=xml
|
|
38
|
+
run: uv run pytest -s --cov=src --cov-report=xml tests/kodit
|
|
39
39
|
|
|
40
40
|
- name: Pytest coverage comment
|
|
41
41
|
if: github.event_name == 'pull_request'
|
|
@@ -58,13 +58,13 @@ jobs:
|
|
|
58
58
|
- name: Install uv
|
|
59
59
|
uses: astral-sh/setup-uv@v5
|
|
60
60
|
|
|
61
|
-
- run: uv build --
|
|
61
|
+
- run: uv build --wheel --out-dir test-build
|
|
62
62
|
|
|
63
63
|
- name: Upload built package
|
|
64
64
|
uses: actions/upload-artifact@v4
|
|
65
65
|
with:
|
|
66
66
|
name: built-package
|
|
67
|
-
path: test-build/*.
|
|
67
|
+
path: test-build/*.whl
|
|
68
68
|
|
|
69
69
|
test-package:
|
|
70
70
|
needs: build-package
|
|
@@ -88,14 +88,16 @@ jobs:
|
|
|
88
88
|
with:
|
|
89
89
|
python-version: 3.12
|
|
90
90
|
|
|
91
|
-
- name:
|
|
92
|
-
|
|
93
|
-
run: echo "sdist_path=$(ls test-build/*.tar.gz)" >> $GITHUB_OUTPUT
|
|
91
|
+
- name: Install uv
|
|
92
|
+
uses: astral-sh/setup-uv@v5
|
|
94
93
|
|
|
95
|
-
- name:
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
94
|
+
- name: Extract path to wheel
|
|
95
|
+
id: wheel_path
|
|
96
|
+
run: echo "wheel_path=$(ls test-build/*.whl)" >> $GITHUB_OUTPUT
|
|
97
|
+
|
|
98
|
+
# This is equivalent to `pipx install --include-deps`, but faster
|
|
99
|
+
- name: Install wheel
|
|
100
|
+
run: uv tool install "${{ steps.wheel_path.outputs.wheel_path }}"
|
|
99
101
|
|
|
100
102
|
- name: Run simple version command test
|
|
101
103
|
run: kodit version
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kodit
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.15
|
|
4
4
|
Summary: Code indexing for better AI code generation
|
|
5
5
|
Project-URL: Homepage, https://docs.helixml.tech/kodit/
|
|
6
6
|
Project-URL: Documentation, https://docs.helixml.tech/kodit/
|
|
@@ -21,6 +21,7 @@ Requires-Dist: aiofiles>=24.1.0
|
|
|
21
21
|
Requires-Dist: aiosqlite>=0.20.0
|
|
22
22
|
Requires-Dist: alembic>=1.15.2
|
|
23
23
|
Requires-Dist: asgi-correlation-id>=4.3.4
|
|
24
|
+
Requires-Dist: asyncpg>=0.30.0
|
|
24
25
|
Requires-Dist: better-exceptions>=0.3.3
|
|
25
26
|
Requires-Dist: bm25s[core]>=0.2.12
|
|
26
27
|
Requires-Dist: click>=8.1.8
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "kodit: Code Indexing MCP Server"
|
|
3
|
+
linkTitle: kodit Docs
|
|
4
|
+
cascade:
|
|
5
|
+
type: docs
|
|
6
|
+
menu:
|
|
7
|
+
main:
|
|
8
|
+
name: kodit Docs
|
|
9
|
+
weight: 3
|
|
10
|
+
# next: /helix/getting-started
|
|
11
|
+
weight: 1
|
|
12
|
+
aliases:
|
|
13
|
+
- /coda
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Please choose your preferred installation method. They all ultimately install the kodit
|
|
19
|
+
cli, which contains the kodit MCP server and other tools to manage your data sources.
|
|
20
|
+
|
|
21
|
+
### Docker
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
docker run -it --rm registry.helix.ml/helix/kodit:latest
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Always replace `latest` with a specific version.
|
|
28
|
+
|
|
29
|
+
### pipx
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
pipx install kodit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### homebrew
|
|
36
|
+
|
|
37
|
+
```sh
|
|
38
|
+
brew install helixml/kodit/kodit
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### uv
|
|
42
|
+
|
|
43
|
+
```sh
|
|
44
|
+
uv tool install kodit
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### pip
|
|
48
|
+
|
|
49
|
+
Use this if you want to use kodit as a python library:
|
|
50
|
+
|
|
51
|
+
```sh
|
|
52
|
+
pip install kodit
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
Kodit has two key parts. A configuration CLI to manage what gets indexed and an MCP
|
|
58
|
+
server to expose your code to an AI coding assistant.
|
|
59
|
+
|
|
60
|
+
1. Index a source:
|
|
61
|
+
1. a local path: `kodit index /path/to/your/code`
|
|
62
|
+
2. or index a public git repository: `kodit index https://github.com/pydantic/pydantic-ai`
|
|
63
|
+
2. Manually search your index:
|
|
64
|
+
1. with a keyword: `kodit search keyword "test"`
|
|
65
|
+
2. or with code: `kodit search code "def main()"`
|
|
66
|
+
3. or via hybrid search: `kodit search hybrid --keywords "main" --code "def main()"`
|
|
67
|
+
3. Start an MCP server: `kodit serve`
|
|
68
|
+
|
|
69
|
+
Now add the Kodit MCP server to your AI coding assistant.
|
|
70
|
+
|
|
71
|
+
### Integrating Kodit with Coding Assistants
|
|
72
|
+
|
|
73
|
+
#### Integration with Cursor
|
|
74
|
+
|
|
75
|
+
Add the following to `$HOME/.cursor/mcp.json`:
|
|
76
|
+
|
|
77
|
+
```json
|
|
78
|
+
{
|
|
79
|
+
"mcpServers": {
|
|
80
|
+
"kodit": {
|
|
81
|
+
"url": "http://localhost:8080/sse"
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Or find this configuration in `Cursor Settings` -> `MCP`.
|
|
88
|
+
|
|
89
|
+
#### Integration with Cline
|
|
90
|
+
|
|
91
|
+
1. Open Cline from the side menu
|
|
92
|
+
2. Click the `MCP Servers` button at the top right of the Cline window (the icon looks
|
|
93
|
+
like a server)
|
|
94
|
+
3. Click the `Remote Servers` tab.
|
|
95
|
+
4. Click `Edit Configuration`
|
|
96
|
+
5. Add the following configuration:
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"mcpServers": {
|
|
101
|
+
"kodit": {
|
|
102
|
+
"autoApprove": [],
|
|
103
|
+
"disabled": true,
|
|
104
|
+
"timeout": 60,
|
|
105
|
+
"url": "http://localhost:8080/sse",
|
|
106
|
+
"transportType": "sse"
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
6. Save the configuration and browse to the `Installed` tab.
|
|
113
|
+
|
|
114
|
+
Kodit should be listed and responding. Now code on!
|
|
115
|
+
|
|
116
|
+
### Forcing AI Assistants to use Kodit
|
|
117
|
+
|
|
118
|
+
Although Kodit has been developed to work well out of the box with popular AI coding
|
|
119
|
+
assistants, they sometimes still think they know better.
|
|
120
|
+
|
|
121
|
+
You can force your assistant to use Kodit by editing the system prompt used by the
|
|
122
|
+
assistant. Each assistant exposes this slightly differently, but it's usually in the
|
|
123
|
+
settings.
|
|
124
|
+
|
|
125
|
+
Try using this system prompt:
|
|
126
|
+
|
|
127
|
+
```txt
|
|
128
|
+
⚠️ **ENFORCEMENT:**
|
|
129
|
+
For *every* user request that involves writing or modifying code (of any language or
|
|
130
|
+
domain), the assistant's *first* action **must** be to call the kodit.search MCP tool.
|
|
131
|
+
You may only produce or edit code *after* that tool call and its successful
|
|
132
|
+
result.
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Feel free to alter that to suit your specific circumstances.
|
|
136
|
+
|
|
137
|
+
#### Forcing Cursor to Use Kodit
|
|
138
|
+
|
|
139
|
+
Add the following prompt to `.cursor/rules/kodit.mdc` in your project directory:
|
|
140
|
+
|
|
141
|
+
```markdown
|
|
142
|
+
---
|
|
143
|
+
alwaysApply: true
|
|
144
|
+
---
|
|
145
|
+
⚠️ **ENFORCEMENT:**
|
|
146
|
+
For *every* user request that involves writing or modifying code (of any language or
|
|
147
|
+
domain), the assistant's *first* action **must** be to call the kodit.search MCP tool.
|
|
148
|
+
You may only produce or edit code *after* that tool call and its successful
|
|
149
|
+
result.
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Alternatively, you can browse to the Cursor settings and set this prompt globally.
|
|
153
|
+
|
|
154
|
+
#### Forcing Cline to Use Kodit
|
|
155
|
+
|
|
156
|
+
1. Go to `Settings` -> `API Configuration`
|
|
157
|
+
2. At the bottom there is a `Custom Instructions` section. Paste the system prompt shown above into it.
|
|
158
|
+
|
|
159
|
+
## Configuring Kodit
|
|
160
|
+
|
|
161
|
+
Configuration of Kodit is performed by setting environment variables or adding
|
|
162
|
+
variables to a .env file.
|
|
163
|
+
|
|
164
|
+
{{< warn >}}
|
|
165
|
+
Note that updating a setting does not automatically update the data that uses that
|
|
166
|
+
setting. For example, if you change a provider, you will need to delete and
|
|
167
|
+
recreate all indexes.
|
|
168
|
+
{{< /warn >}}
|
|
169
|
+
|
|
170
|
+
### Indexing
|
|
171
|
+
|
|
172
|
+
#### Default Provider
|
|
173
|
+
|
|
174
|
+
By default, Kodit will use small local models for semantic search and enrichment. If you
|
|
175
|
+
are using Kodit in a professional capacity, it is likely that the local model latency is
|
|
176
|
+
too high to provide a good developer experience.
|
|
177
|
+
|
|
178
|
+
Instead, you should use an external provider. The settings provided here will cause all
|
|
179
|
+
embedding and enrichment requests to be sent to this provider by default. You can
|
|
180
|
+
override the provider used for each task if you wish. (Coming soon!)
|
|
181
|
+
|
|
182
|
+
##### OpenAI
|
|
183
|
+
|
|
184
|
+
Add the following settings to your .env file, or export them as environment variables:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
DEFAULT_ENDPOINT_BASE_URL=https://api.openai.com/v1
|
|
188
|
+
DEFAULT_ENDPOINT_API_KEY=sk-xxxxxx
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Database
|
|
192
|
+
|
|
193
|
+
Out of the box Kodit uses a local sqlite file to make it easier for users to get
|
|
194
|
+
started. But for production use, it's likely you will want to use a database that has
|
|
195
|
+
dedicated semantic and keyword search capabilities for reduced latency.
|
|
196
|
+
|
|
197
|
+
#### VectorChord Database
|
|
198
|
+
|
|
199
|
+
[VectorChord](https://github.com/tensorchord/VectorChord) is an optimized PostgreSQL
|
|
200
|
+
extension that provides both vector and BM25 search. (See [Search](#search))
|
|
201
|
+
|
|
202
|
+
Start a container with:
|
|
203
|
+
|
|
204
|
+
```sh
|
|
205
|
+
docker run \
|
|
206
|
+
--name kodit-vectorchord \
|
|
207
|
+
-e POSTGRES_DB=kodit \
|
|
208
|
+
-e POSTGRES_PASSWORD=mysecretpassword \
|
|
209
|
+
-p 5432:5432 \
|
|
210
|
+
-d tensorchord/vchord-suite:pg17-20250601
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
{{< warn >}}
|
|
214
|
+
Kodit assumes the database exists. In the above example I'm abusing the POSTGRES_DB
|
|
215
|
+
environment variable from the [Postgres Docker
|
|
216
|
+
container](https://hub.docker.com/_/postgres/) to create the database for me. In
|
|
217
|
+
production setups, please create a database yourself.
|
|
218
|
+
{{< /warn >}}
|
|
219
|
+
|
|
220
|
+
Then update your `.env` file to include:
|
|
221
|
+
|
|
222
|
+
```env
|
|
223
|
+
DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Search
|
|
227
|
+
|
|
228
|
+
#### Default Search Provider
|
|
229
|
+
|
|
230
|
+
By default, Kodit will use built-in implementations of BM25 and similarity search to
|
|
231
|
+
improve the out of the box experience. If you are using Kodit in a professional
|
|
232
|
+
capacity, it is likely that the search latency is too high to provide a good developer
|
|
233
|
+
experience.
|
|
234
|
+
|
|
235
|
+
Instead, you should use the features included in your database. The settings provided
|
|
236
|
+
here will cause all search functionality to use this database by default. You can
|
|
237
|
+
override the database used for each search type if you wish. (Coming soon!)
|
|
238
|
+
|
|
239
|
+
##### VectorChord Search
|
|
240
|
+
|
|
241
|
+
Configure Kodit to use a [VectorChord database](#vectorchord-database).
|
|
242
|
+
|
|
243
|
+
Then update your `.env` file to include:
|
|
244
|
+
|
|
245
|
+
```env
|
|
246
|
+
DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
|
|
247
|
+
DEFAULT_SEARCH_PROVIDER=vectorchord
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Managing Kodit
|
|
251
|
+
|
|
252
|
+
There is limited management functionality at this time. To delete indexes you must
|
|
253
|
+
delete the database and/or tables.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Factory for creating keyword search providers."""
|
|
2
|
+
|
|
3
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
4
|
+
|
|
5
|
+
from kodit.bm25.keyword_search_service import KeywordSearchProvider
|
|
6
|
+
from kodit.bm25.local_bm25 import BM25Service
|
|
7
|
+
from kodit.bm25.vectorchord_bm25 import VectorChordBM25
|
|
8
|
+
from kodit.config import AppContext
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def keyword_search_factory(
|
|
12
|
+
app_context: AppContext, session: AsyncSession
|
|
13
|
+
) -> KeywordSearchProvider:
|
|
14
|
+
"""Create a keyword search provider."""
|
|
15
|
+
if app_context.default_search.provider == "vectorchord":
|
|
16
|
+
return VectorChordBM25(session=session)
|
|
17
|
+
return BM25Service(data_dir=app_context.get_data_dir())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Keyword search service."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import NamedTuple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BM25Document(NamedTuple):
|
|
8
|
+
"""BM25 document."""
|
|
9
|
+
|
|
10
|
+
snippet_id: int
|
|
11
|
+
text: str
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BM25Result(NamedTuple):
|
|
15
|
+
"""BM25 result."""
|
|
16
|
+
|
|
17
|
+
snippet_id: int
|
|
18
|
+
score: float
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class KeywordSearchProvider(ABC):
|
|
22
|
+
"""Interface for keyword search providers."""
|
|
23
|
+
|
|
24
|
+
@abstractmethod
|
|
25
|
+
async def index(self, corpus: list[BM25Document]) -> None:
|
|
26
|
+
"""Index a new corpus."""
|
|
27
|
+
|
|
28
|
+
@abstractmethod
|
|
29
|
+
async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
|
|
30
|
+
"""Retrieve from the index."""
|
|
31
|
+
|
|
32
|
+
@abstractmethod
|
|
33
|
+
async def delete(self, snippet_ids: list[int]) -> None:
|
|
34
|
+
"""Delete documents from the index."""
|
|
@@ -1,23 +1,36 @@
|
|
|
1
|
-
"""BM25 service."""
|
|
1
|
+
"""Locally hosted BM25 service primarily for use with SQLite."""
|
|
2
2
|
|
|
3
|
+
import json
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
6
|
+
import aiofiles
|
|
5
7
|
import bm25s
|
|
6
8
|
import Stemmer
|
|
7
9
|
import structlog
|
|
8
10
|
from bm25s.tokenization import Tokenized
|
|
9
11
|
|
|
12
|
+
from kodit.bm25.keyword_search_service import (
|
|
13
|
+
BM25Document,
|
|
14
|
+
BM25Result,
|
|
15
|
+
KeywordSearchProvider,
|
|
16
|
+
)
|
|
10
17
|
|
|
11
|
-
|
|
12
|
-
|
|
18
|
+
SNIPPET_IDS_FILE = "snippet_ids.jsonl"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BM25Service(KeywordSearchProvider):
|
|
22
|
+
"""LocalBM25 service."""
|
|
13
23
|
|
|
14
24
|
def __init__(self, data_dir: Path) -> None:
|
|
15
25
|
"""Initialize the BM25 service."""
|
|
16
26
|
self.log = structlog.get_logger(__name__)
|
|
17
27
|
self.index_path = data_dir / "bm25s_index"
|
|
28
|
+
self.snippet_ids: list[int] = []
|
|
18
29
|
try:
|
|
19
30
|
self.log.debug("Loading BM25 index")
|
|
20
31
|
self.retriever = bm25s.BM25.load(self.index_path, mmap=True)
|
|
32
|
+
with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
|
|
33
|
+
self.snippet_ids = json.load(f)
|
|
21
34
|
except FileNotFoundError:
|
|
22
35
|
self.log.debug("BM25 index not found, creating new index")
|
|
23
36
|
self.retriever = bm25s.BM25()
|
|
@@ -33,28 +46,34 @@ class BM25Service:
|
|
|
33
46
|
show_progress=True,
|
|
34
47
|
)
|
|
35
48
|
|
|
36
|
-
def index(self, corpus: list[
|
|
49
|
+
async def index(self, corpus: list[BM25Document]) -> None:
|
|
37
50
|
"""Index a new corpus."""
|
|
38
51
|
self.log.debug("Indexing corpus")
|
|
39
|
-
vocab = self._tokenize(corpus)
|
|
52
|
+
vocab = self._tokenize([doc.text for doc in corpus])
|
|
40
53
|
self.retriever = bm25s.BM25()
|
|
41
54
|
self.retriever.index(vocab, show_progress=False)
|
|
42
55
|
self.retriever.save(self.index_path)
|
|
56
|
+
self.snippet_ids = self.snippet_ids + [doc.snippet_id for doc in corpus]
|
|
57
|
+
async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
|
|
58
|
+
await f.write(json.dumps(self.snippet_ids))
|
|
43
59
|
|
|
44
|
-
def retrieve(
|
|
45
|
-
self, doc_ids: list[int], query: str, top_k: int = 2
|
|
46
|
-
) -> list[tuple[int, float]]:
|
|
60
|
+
async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
|
|
47
61
|
"""Retrieve from the index."""
|
|
48
62
|
if top_k == 0:
|
|
49
63
|
self.log.warning("Top k is 0, returning empty list")
|
|
50
64
|
return []
|
|
51
|
-
|
|
52
|
-
|
|
65
|
+
|
|
66
|
+
# Get the number of documents in the index
|
|
67
|
+
num_docs = self.retriever.scores["num_docs"]
|
|
68
|
+
if num_docs == 0:
|
|
53
69
|
return []
|
|
54
70
|
|
|
55
|
-
|
|
71
|
+
# Adjust top_k to not exceed corpus size
|
|
72
|
+
top_k = min(top_k, num_docs)
|
|
56
73
|
self.log.debug(
|
|
57
|
-
"Retrieving from index",
|
|
74
|
+
"Retrieving from index",
|
|
75
|
+
query=query,
|
|
76
|
+
top_k=top_k,
|
|
58
77
|
)
|
|
59
78
|
|
|
60
79
|
query_tokens = self._tokenize([query])
|
|
@@ -62,10 +81,17 @@ class BM25Service:
|
|
|
62
81
|
self.log.debug("Query tokens", query_tokens=query_tokens)
|
|
63
82
|
|
|
64
83
|
results, scores = self.retriever.retrieve(
|
|
65
|
-
query_tokens=query_tokens,
|
|
84
|
+
query_tokens=query_tokens,
|
|
85
|
+
corpus=self.snippet_ids,
|
|
86
|
+
k=top_k,
|
|
66
87
|
)
|
|
67
88
|
self.log.debug("Raw results", results=results, scores=scores)
|
|
68
89
|
return [
|
|
69
|
-
(int(result), float(score))
|
|
90
|
+
BM25Result(snippet_id=int(result), score=float(score))
|
|
70
91
|
for result, score in zip(results[0], scores[0], strict=False)
|
|
92
|
+
if score > 0.0
|
|
71
93
|
]
|
|
94
|
+
|
|
95
|
+
async def delete(self, snippet_ids: list[int]) -> None: # noqa: ARG002
|
|
96
|
+
"""Delete documents from the index."""
|
|
97
|
+
self.log.warning("Deletion not supported for local BM25 index")
|