kodit 0.1.13__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (113) hide show
  1. kodit-0.1.15/.cursor/rules/kodit.mdc +8 -0
  2. kodit-0.1.15/.github/dependabot.yml +10 -0
  3. {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/pypi.yaml +5 -5
  4. {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/test.yaml +12 -10
  5. {kodit-0.1.13 → kodit-0.1.15}/PKG-INFO +2 -1
  6. kodit-0.1.15/docs/_index.md +253 -0
  7. {kodit-0.1.13 → kodit-0.1.15}/pyproject.toml +1 -0
  8. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/_version.py +2 -2
  9. kodit-0.1.15/src/kodit/bm25/keyword_search_factory.py +17 -0
  10. kodit-0.1.15/src/kodit/bm25/keyword_search_service.py +34 -0
  11. kodit-0.1.13/src/kodit/bm25/bm25.py → kodit-0.1.15/src/kodit/bm25/local_bm25.py +40 -14
  12. kodit-0.1.15/src/kodit/bm25/vectorchord_bm25.py +193 -0
  13. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/cli.py +14 -11
  14. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/config.py +9 -2
  15. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/database.py +4 -2
  16. kodit-0.1.15/src/kodit/embedding/embedding_factory.py +44 -0
  17. kodit-0.1.15/src/kodit/embedding/embedding_provider/__init__.py +1 -0
  18. kodit-0.1.15/src/kodit/embedding/embedding_provider/embedding_provider.py +53 -0
  19. kodit-0.1.15/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
  20. kodit-0.1.15/src/kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
  21. kodit-0.1.15/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +63 -0
  22. kodit-0.1.13/src/kodit/search/search_repository.py → kodit-0.1.15/src/kodit/embedding/embedding_repository.py +61 -33
  23. kodit-0.1.15/src/kodit/embedding/local_vector_search_service.py +50 -0
  24. kodit-0.1.15/src/kodit/embedding/vector_search_service.py +38 -0
  25. kodit-0.1.15/src/kodit/embedding/vectorchord_vector_search_service.py +145 -0
  26. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/indexing_repository.py +24 -4
  27. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/indexing_service.py +25 -30
  28. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/mcp.py +28 -7
  29. kodit-0.1.15/src/kodit/search/search_repository.py +57 -0
  30. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/search/search_service.py +12 -24
  31. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/source_service.py +9 -3
  32. kodit-0.1.15/src/kodit/util/__init__.py +1 -0
  33. kodit-0.1.15/src/kodit/util/spinner.py +59 -0
  34. kodit-0.1.15/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +631 -0
  35. kodit-0.1.15/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +141 -0
  36. {kodit-0.1.13 → kodit-0.1.15}/tests/experiments/embedding.py +3 -3
  37. kodit-0.1.15/tests/kodit/bm25/local_bm25_test.py +155 -0
  38. kodit-0.1.15/tests/kodit/bm25/vectorchord_repository_test.py +182 -0
  39. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/cli_test.py +16 -4
  40. kodit-0.1.15/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +93 -0
  41. kodit-0.1.15/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +138 -0
  42. kodit-0.1.15/tests/kodit/embedding/local_vector_search_service_test.py +143 -0
  43. kodit-0.1.15/tests/kodit/embedding/vectorchord_vector_search_service_test.py +230 -0
  44. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/indexing/indexing_service_test.py +38 -22
  45. kodit-0.1.15/tests/kodit/search/search_repository_test.py +57 -0
  46. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/search/search_service_test.py +41 -110
  47. {kodit-0.1.13 → kodit-0.1.15}/uv.lock +26 -0
  48. kodit-0.1.13/.cursor/rules/kodit.mdc +0 -6
  49. kodit-0.1.13/docs/_index.md +0 -160
  50. kodit-0.1.13/src/kodit/embedding/embedding.py +0 -203
  51. kodit-0.1.13/tests/kodit/embedding/embedding_test.py +0 -13
  52. kodit-0.1.13/tests/kodit/search/search_repository_test.py +0 -124
  53. {kodit-0.1.13 → kodit-0.1.15}/.github/CODE_OF_CONDUCT.md +0 -0
  54. {kodit-0.1.13 → kodit-0.1.15}/.github/CONTRIBUTING.md +0 -0
  55. {kodit-0.1.13 → kodit-0.1.15}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  56. {kodit-0.1.13 → kodit-0.1.15}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  57. {kodit-0.1.13 → kodit-0.1.15}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  58. {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/docker.yaml +0 -0
  59. {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/docs.yaml +0 -0
  60. {kodit-0.1.13 → kodit-0.1.15}/.github/workflows/pypi-test.yaml +0 -0
  61. {kodit-0.1.13 → kodit-0.1.15}/.gitignore +0 -0
  62. {kodit-0.1.13 → kodit-0.1.15}/.python-version +0 -0
  63. {kodit-0.1.13 → kodit-0.1.15}/.vscode/launch.json +0 -0
  64. {kodit-0.1.13 → kodit-0.1.15}/.vscode/settings.json +0 -0
  65. {kodit-0.1.13 → kodit-0.1.15}/Dockerfile +0 -0
  66. {kodit-0.1.13 → kodit-0.1.15}/LICENSE +0 -0
  67. {kodit-0.1.13 → kodit-0.1.15}/README.md +0 -0
  68. {kodit-0.1.13 → kodit-0.1.15}/alembic.ini +0 -0
  69. {kodit-0.1.13 → kodit-0.1.15}/docs/developer/index.md +0 -0
  70. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/.gitignore +0 -0
  71. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/__init__.py +0 -0
  72. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/app.py +0 -0
  73. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/bm25/__init__.py +0 -0
  74. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/embedding/__init__.py +0 -0
  75. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/embedding/embedding_models.py +0 -0
  76. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/__init__.py +0 -0
  77. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/indexing/indexing_models.py +0 -0
  78. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/log.py +0 -0
  79. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/middleware.py +0 -0
  80. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/README +0 -0
  81. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/__init__.py +0 -0
  82. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/env.py +0 -0
  83. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/script.py.mako +0 -0
  84. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
  85. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
  86. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/migrations/versions/__init__.py +0 -0
  87. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/search/__init__.py +0 -0
  88. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/__init__.py +0 -0
  89. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/languages/__init__.py +0 -0
  90. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/languages/csharp.scm +0 -0
  91. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/languages/python.scm +0 -0
  92. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/method_snippets.py +0 -0
  93. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/snippets/snippets.py +0 -0
  94. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/__init__.py +0 -0
  95. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/source_models.py +0 -0
  96. {kodit-0.1.13 → kodit-0.1.15}/src/kodit/source/source_repository.py +0 -0
  97. {kodit-0.1.13 → kodit-0.1.15}/tests/__init__.py +0 -0
  98. {kodit-0.1.13 → kodit-0.1.15}/tests/conftest.py +0 -0
  99. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/__init__.py +0 -0
  100. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/e2e.py +0 -0
  101. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/embedding/__init__.py +0 -0
  102. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/indexing/__init__.py +0 -0
  103. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/mcp_test.py +0 -0
  104. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/search/__init__.py +0 -0
  105. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/__init__.py +0 -0
  106. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/csharp.cs +0 -0
  107. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/detect_language_test.py +0 -0
  108. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/method_extraction_test.py +0 -0
  109. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/snippets/python.py +0 -0
  110. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/source/__init__.py +0 -0
  111. {kodit-0.1.13 → kodit-0.1.15}/tests/kodit/source/source_service_test.py +0 -0
  112. {kodit-0.1.13 → kodit-0.1.15}/tests/performance/similarity.py +0 -0
  113. {kodit-0.1.13 → kodit-0.1.15}/tests/smoke.sh +0 -0
@@ -0,0 +1,8 @@
1
+ ---
2
+ alwaysApply: true
3
+ ---
4
+ ⚠️ **ENFORCEMENT:**
5
+ For *every* user request that involves writing or modifying code (of any language or
6
+ domain), the assistant’s *first* action **must** be to call the kodit.search MCP tool.
7
+ You may only produce or edit code *after* that tool call and its successful
8
+ result.
@@ -0,0 +1,10 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "uv"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "weekly"
7
+ - package-ecosystem: "docker"
8
+ directory: "/"
9
+ schedule:
10
+ interval: "weekly"
@@ -39,13 +39,13 @@ jobs:
39
39
  # Initialize counter for 60 second timeout
40
40
  count=0
41
41
  while true; do
42
- if curl -sfL https://pypi.io/packages/source/${REPO_NAME_FIRST_LETTER}/${REPO_NAME}/${REPO_NAME}-${REPO_TAG}.tar.gz > /dev/null; then
42
+ if curl -sfL https://pypi.org/packages/source/${REPO_NAME_FIRST_LETTER}/${REPO_NAME}/${REPO_NAME}-${REPO_TAG}.tar.gz > /dev/null; then
43
43
  break
44
44
  fi
45
- sleep 1
46
- ((count++))
45
+ sleep 5
46
+ count=$((count+1))
47
47
  if [ $count -ge 60 ]; then
48
- echo "Timeout reached after 60 seconds"
48
+ echo "Timeout reached after 300 seconds"
49
49
  exit 1
50
50
  fi
51
51
  done
@@ -55,7 +55,7 @@ jobs:
55
55
  formula-path: Formula/${{ github.event.repository.name }}.rb
56
56
  homebrew-tap: ${{ github.repository_owner }}/homebrew-${{ github.event.repository.name }}
57
57
  tag-name: "${{ github.event.release.tag_name }}"
58
- download-url: "https://pypi.io/packages/source/k/${{ github.event.repository.name }}/${{ github.event.repository.name }}-${{ github.event.release.tag_name }}.tar.gz"
58
+ download-url: "https://pypi.org/packages/source/k/${{ github.event.repository.name }}/${{ github.event.repository.name }}-${{ github.event.release.tag_name }}.tar.gz"
59
59
  commit-message: |
60
60
  {{formulaName}} {{version}}
61
61
  env:
@@ -35,7 +35,7 @@ jobs:
35
35
  run: uv run ruff check
36
36
 
37
37
  - name: Run tests
38
- run: uv run pytest -s --cov=src --cov-report=xml
38
+ run: uv run pytest -s --cov=src --cov-report=xml tests/kodit
39
39
 
40
40
  - name: Pytest coverage comment
41
41
  if: github.event_name == 'pull_request'
@@ -58,13 +58,13 @@ jobs:
58
58
  - name: Install uv
59
59
  uses: astral-sh/setup-uv@v5
60
60
 
61
- - run: uv build --sdist --out-dir test-build
61
+ - run: uv build --wheel --out-dir test-build
62
62
 
63
63
  - name: Upload built package
64
64
  uses: actions/upload-artifact@v4
65
65
  with:
66
66
  name: built-package
67
- path: test-build/*.tar.gz
67
+ path: test-build/*.whl
68
68
 
69
69
  test-package:
70
70
  needs: build-package
@@ -88,14 +88,16 @@ jobs:
88
88
  with:
89
89
  python-version: 3.12
90
90
 
91
- - name: Extract path to sdist
92
- id: sdist_path
93
- run: echo "sdist_path=$(ls test-build/*.tar.gz)" >> $GITHUB_OUTPUT
91
+ - name: Install uv
92
+ uses: astral-sh/setup-uv@v5
94
93
 
95
- - name: Install sdist
96
- uses: threeal/pipx-install-action@v1.0.0
97
- with:
98
- packages: "${{ steps.sdist_path.outputs.sdist_path }}"
94
+ - name: Extract path to wheel
95
+ id: wheel_path
96
+ run: echo "wheel_path=$(ls test-build/*.whl)" >> $GITHUB_OUTPUT
97
+
98
+ # This is equivalent to `pipx install --include-deps`, but faster
99
+ - name: Install wheel
100
+ run: uv tool install "${{ steps.wheel_path.outputs.wheel_path }}"
99
101
 
100
102
  - name: Run simple version command test
101
103
  run: kodit version
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -21,6 +21,7 @@ Requires-Dist: aiofiles>=24.1.0
21
21
  Requires-Dist: aiosqlite>=0.20.0
22
22
  Requires-Dist: alembic>=1.15.2
23
23
  Requires-Dist: asgi-correlation-id>=4.3.4
24
+ Requires-Dist: asyncpg>=0.30.0
24
25
  Requires-Dist: better-exceptions>=0.3.3
25
26
  Requires-Dist: bm25s[core]>=0.2.12
26
27
  Requires-Dist: click>=8.1.8
@@ -0,0 +1,253 @@
1
+ ---
2
+ title: "kodit: Code Indexing MCP Server"
3
+ linkTitle: kodit Docs
4
+ cascade:
5
+ type: docs
6
+ menu:
7
+ main:
8
+ name: kodit Docs
9
+ weight: 3
10
+ # next: /helix/getting-started
11
+ weight: 1
12
+ aliases:
13
+ - /coda
14
+ ---
15
+
16
+ ## Installation
17
+
18
+ Please choose your preferred installation method. They all ultimately install the kodit
19
+ cli, which contains the kodit MCP server and other tools to manage your data sources.
20
+
21
+ ### Docker
22
+
23
+ ```sh
24
+ docker run -it --rm registry.helix.ml/helix/kodit:latest
25
+ ```
26
+
27
+ Always replace `latest` with a specific version tag.
28
+
29
+ ### pipx
30
+
31
+ ```sh
32
+ pipx install kodit
33
+ ```
34
+
35
+ ### homebrew
36
+
37
+ ```sh
38
+ brew install helixml/kodit/kodit
39
+ ```
40
+
41
+ ### uv
42
+
43
+ ```sh
44
+ uv tool install kodit
45
+ ```
46
+
47
+ ### pip
48
+
49
+ Use this if you want to use kodit as a python library:
50
+
51
+ ```sh
52
+ pip install kodit
53
+ ```
54
+
55
+ ## Quick Start
56
+
57
+ Kodit has two key parts. A configuration CLI to manage what gets indexed and an MCP
58
+ server to expose your code to an AI coding assistant.
59
+
60
+ 1. Index a source:
61
+ 1. a local path: `kodit index /path/to/your/code`
62
+ 2. or index a public git repository: `kodit index https://github.com/pydantic/pydantic-ai`
63
+ 2. Manually search your index:
64
+ 1. with a keyword: `kodit search keyword "test"`
65
+ 2. or with code: `kodit search code "def main()"`
66
+ 3. or via hybrid search: `kodit search hybrid --keywords "main" --code "def main()"`
67
+ 3. Start an MCP server: `kodit serve`
68
+
69
+ Now add the Kodit MCP server to your AI coding assistant.
70
+
71
+ ### Integrating Kodit with Coding Assistants
72
+
73
+ #### Integration with Cursor
74
+
75
+ Add the following to `$HOME/.cursor/mcp.json`:
76
+
77
+ ```json
78
+ {
79
+ "mcpServers": {
80
+ "kodit": {
81
+ "url": "http://localhost:8080/sse"
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ Or find this configuration in `Cursor Settings` -> `MCP`.
88
+
89
+ #### Integration with Cline
90
+
91
+ 1. Open Cline from the side menu
92
+ 2. Click the `MCP Servers` button at the top right of the Cline window (the icon looks
93
+ like a server)
94
+ 3. Click the `Remote Servers` tab.
95
+ 4. Click `Edit Configuration`
96
+ 5. Add the following configuration:
97
+
98
+ ```json
99
+ {
100
+ "mcpServers": {
101
+ "kodit": {
102
+ "autoApprove": [],
103
+ "disabled": true,
104
+ "timeout": 60,
105
+ "url": "http://localhost:8080/sse",
106
+ "transportType": "sse"
107
+ }
108
+ }
109
+ }
110
+ ```
111
+
112
+ 6. Save the configuration and browse to the `Installed` tab.
113
+
114
+ Kodit should be listed and responding. Now code on!
115
+
116
+ ### Forcing AI Assistants to use Kodit
117
+
118
+ Although Kodit has been developed to work well out of the box with popular AI coding
119
+ assistants, they sometimes still think they know better.
120
+
121
+ You can force your assistant to use Kodit by editing the system prompt used by the
122
+ assistant. Each assistant exposes this slightly differently, but it's usually in the
123
+ settings.
124
+
125
+ Try using this system prompt:
126
+
127
+ ```txt
128
+ ⚠️ **ENFORCEMENT:**
129
+ For *every* user request that involves writing or modifying code (of any language or
130
+ domain), the assistant's *first* action **must** be to call the kodit.search MCP tool.
131
+ You may only produce or edit code *after* that tool call and its successful
132
+ result.
133
+ ```
134
+
135
+ Feel free to alter that to suit your specific circumstances.
136
+
137
+ #### Forcing Cursor to Use Kodit
138
+
139
+ Add the following prompt to `.cursor/rules/kodit.mdc` in your project directory:
140
+
141
+ ```markdown
142
+ ---
143
+ alwaysApply: true
144
+ ---
145
+ ⚠️ **ENFORCEMENT:**
146
+ For *every* user request that involves writing or modifying code (of any language or
147
+ domain), the assistant's *first* action **must** be to call the kodit.search MCP tool.
148
+ You may only produce or edit code *after* that tool call and its successful
149
+ result.
150
+ ```
151
+
152
+ Alternatively, you can browse to the Cursor settings and set this prompt globally.
153
+
154
+ #### Forcing Cline to Use Kodit
155
+
156
+ 1. Go to `Settings` -> `API Configuration`
157
+ 2. At the bottom there is a `Custom Instructions` section. Paste the system prompt shown above into it.
158
+
159
+ ## Configuring Kodit
160
+
161
+ Configuration of Kodit is performed by setting environment variables or adding
162
+ variables to a .env file.
163
+
164
+ {{< warn >}}
165
+ Note that updating a setting does not automatically update the data that uses that
166
+ setting. For example, if you change a provider, you will need to delete and
167
+ recreate all indexes.
168
+ {{< /warn >}}
169
+
170
+ ### Indexing
171
+
172
+ #### Default Provider
173
+
174
+ By default, Kodit will use small local models for semantic search and enrichment. If you
175
+ are using Kodit in a professional capacity, it is likely that the local model latency is
176
+ too high to provide a good developer experience.
177
+
178
+ Instead, you should use an external provider. The settings provided here will cause all
179
+ embedding and enrichment requests to be sent to this provider by default. You can
180
+ override the provider used for each task if you wish. (Coming soon!)
181
+
182
+ ##### OpenAI
183
+
184
+ Add the following settings to your .env file, or export them as environment variables:
185
+
186
+ ```bash
187
+ DEFAULT_ENDPOINT_BASE_URL=https://api.openai.com/v1
188
+ DEFAULT_ENDPOINT_API_KEY=sk-xxxxxx
189
+ ```
190
+
191
+ ### Database
192
+
193
+ Out of the box Kodit uses a local sqlite file to make it easier for users to get
194
+ started. But for production use, it's likely you will want to use a database that has
195
+ dedicated semantic and keyword search capabilities for reduced latency.
196
+
197
+ #### VectorChord Database
198
+
199
+ [VectorChord](https://github.com/tensorchord/VectorChord) is an optimized PostgreSQL
200
+ extension that provides both vector and BM25 search. (See [Search](#search))
201
+
202
+ Start a container with:
203
+
204
+ ```sh
205
+ docker run \
206
+ --name kodit-vectorchord \
207
+ -e POSTGRES_DB=kodit \
208
+ -e POSTGRES_PASSWORD=mysecretpassword \
209
+ -p 5432:5432 \
210
+ -d tensorchord/vchord-suite:pg17-20250601
211
+ ```
212
+
213
+ {{< warn >}}
214
+ Kodit assumes the database exists. In the above example I'm abusing the POSTGRES_DB
215
+ environmental variable from the [Postgres Docker
216
+ container](https://hub.docker.com/_/postgres/) to create the database for me. In
217
+ production setups, please create a database yourself.
218
+ {{< /warn >}}
219
+
220
+ Then update your `.env` file to include:
221
+
222
+ ```env
223
+ DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
224
+ ```
225
+
226
+ ### Search
227
+
228
+ #### Default Search Provider
229
+
230
+ By default, Kodit will use built-in implementations of BM25 and similarity search to
231
+ improve the out of the box experience. If you are using Kodit in a professional
232
+ capacity, it is likely that the search latency is too high to provide a good developer
233
+ experience.
234
+
235
+ Instead, you should use the features included in your database. The settings provided
236
+ here will cause all search functionality to use this database by default. You can
237
+ override the database used for each search type if you wish. (Coming soon!)
238
+
239
+ ##### VectorChord Search
240
+
241
+ Configure Kodit to use a [VectorChord database](#vectorchord-database).
242
+
243
+ Then update your `.env` file to include:
244
+
245
+ ```env
246
+ DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
247
+ DEFAULT_SEARCH_PROVIDER=vectorchord
248
+ ```
249
+
250
+ ## Managing Kodit
251
+
252
+ There is limited management functionality at this time. To delete indexes you must
253
+ delete the database and/or tables.
@@ -48,6 +48,7 @@ dependencies = [
48
48
  "hf-xet>=1.1.2",
49
49
  "openai>=1.82.0",
50
50
  "tiktoken>=0.9.0",
51
+ "asyncpg>=0.30.0",
51
52
  ]
52
53
 
53
54
  [dependency-groups]
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.13'
21
- __version_tuple__ = version_tuple = (0, 1, 13)
20
+ __version__ = version = '0.1.15'
21
+ __version_tuple__ = version_tuple = (0, 1, 15)
@@ -0,0 +1,17 @@
1
+ """Factory for creating keyword search providers."""
2
+
3
+ from sqlalchemy.ext.asyncio import AsyncSession
4
+
5
+ from kodit.bm25.keyword_search_service import KeywordSearchProvider
6
+ from kodit.bm25.local_bm25 import BM25Service
7
+ from kodit.bm25.vectorchord_bm25 import VectorChordBM25
8
+ from kodit.config import AppContext
9
+
10
+
11
+ def keyword_search_factory(
12
+ app_context: AppContext, session: AsyncSession
13
+ ) -> KeywordSearchProvider:
14
+ """Create a keyword search provider."""
15
+ if app_context.default_search.provider == "vectorchord":
16
+ return VectorChordBM25(session=session)
17
+ return BM25Service(data_dir=app_context.get_data_dir())
@@ -0,0 +1,34 @@
1
+ """Keyword search service."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import NamedTuple
5
+
6
+
7
+ class BM25Document(NamedTuple):
8
+ """BM25 document."""
9
+
10
+ snippet_id: int
11
+ text: str
12
+
13
+
14
+ class BM25Result(NamedTuple):
15
+ """BM25 result."""
16
+
17
+ snippet_id: int
18
+ score: float
19
+
20
+
21
+ class KeywordSearchProvider(ABC):
22
+ """Interface for keyword search providers."""
23
+
24
+ @abstractmethod
25
+ async def index(self, corpus: list[BM25Document]) -> None:
26
+ """Index a new corpus."""
27
+
28
+ @abstractmethod
29
+ async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
30
+ """Retrieve from the index."""
31
+
32
+ @abstractmethod
33
+ async def delete(self, snippet_ids: list[int]) -> None:
34
+ """Delete documents from the index."""
@@ -1,23 +1,36 @@
1
- """BM25 service."""
1
+ """Locally hosted BM25 service primarily for use with SQLite."""
2
2
 
3
+ import json
3
4
  from pathlib import Path
4
5
 
6
+ import aiofiles
5
7
  import bm25s
6
8
  import Stemmer
7
9
  import structlog
8
10
  from bm25s.tokenization import Tokenized
9
11
 
12
+ from kodit.bm25.keyword_search_service import (
13
+ BM25Document,
14
+ BM25Result,
15
+ KeywordSearchProvider,
16
+ )
10
17
 
11
- class BM25Service:
12
- """Service for BM25."""
18
+ SNIPPET_IDS_FILE = "snippet_ids.jsonl"
19
+
20
+
21
+ class BM25Service(KeywordSearchProvider):
22
+ """LocalBM25 service."""
13
23
 
14
24
  def __init__(self, data_dir: Path) -> None:
15
25
  """Initialize the BM25 service."""
16
26
  self.log = structlog.get_logger(__name__)
17
27
  self.index_path = data_dir / "bm25s_index"
28
+ self.snippet_ids: list[int] = []
18
29
  try:
19
30
  self.log.debug("Loading BM25 index")
20
31
  self.retriever = bm25s.BM25.load(self.index_path, mmap=True)
32
+ with Path(self.index_path / SNIPPET_IDS_FILE).open() as f:
33
+ self.snippet_ids = json.load(f)
21
34
  except FileNotFoundError:
22
35
  self.log.debug("BM25 index not found, creating new index")
23
36
  self.retriever = bm25s.BM25()
@@ -33,28 +46,34 @@ class BM25Service:
33
46
  show_progress=True,
34
47
  )
35
48
 
36
- def index(self, corpus: list[str]) -> None:
49
+ async def index(self, corpus: list[BM25Document]) -> None:
37
50
  """Index a new corpus."""
38
51
  self.log.debug("Indexing corpus")
39
- vocab = self._tokenize(corpus)
52
+ vocab = self._tokenize([doc.text for doc in corpus])
40
53
  self.retriever = bm25s.BM25()
41
54
  self.retriever.index(vocab, show_progress=False)
42
55
  self.retriever.save(self.index_path)
56
+ self.snippet_ids = self.snippet_ids + [doc.snippet_id for doc in corpus]
57
+ async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
58
+ await f.write(json.dumps(self.snippet_ids))
43
59
 
44
- def retrieve(
45
- self, doc_ids: list[int], query: str, top_k: int = 2
46
- ) -> list[tuple[int, float]]:
60
+ async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
47
61
  """Retrieve from the index."""
48
62
  if top_k == 0:
49
63
  self.log.warning("Top k is 0, returning empty list")
50
64
  return []
51
- if len(doc_ids) == 0:
52
- self.log.warning("No documents to retrieve from, returning empty list")
65
+
66
+ # Get the number of documents in the index
67
+ num_docs = self.retriever.scores["num_docs"]
68
+ if num_docs == 0:
53
69
  return []
54
70
 
55
- top_k = min(top_k, len(self.retriever.scores))
71
+ # Adjust top_k to not exceed corpus size
72
+ top_k = min(top_k, num_docs)
56
73
  self.log.debug(
57
- "Retrieving from index", query=query, top_k=top_k, num_docs=len(doc_ids)
74
+ "Retrieving from index",
75
+ query=query,
76
+ top_k=top_k,
58
77
  )
59
78
 
60
79
  query_tokens = self._tokenize([query])
@@ -62,10 +81,17 @@ class BM25Service:
62
81
  self.log.debug("Query tokens", query_tokens=query_tokens)
63
82
 
64
83
  results, scores = self.retriever.retrieve(
65
- query_tokens=query_tokens, corpus=doc_ids, k=top_k
84
+ query_tokens=query_tokens,
85
+ corpus=self.snippet_ids,
86
+ k=top_k,
66
87
  )
67
88
  self.log.debug("Raw results", results=results, scores=scores)
68
89
  return [
69
- (int(result), float(score))
90
+ BM25Result(snippet_id=int(result), score=float(score))
70
91
  for result, score in zip(results[0], scores[0], strict=False)
92
+ if score > 0.0
71
93
  ]
94
+
95
+ async def delete(self, snippet_ids: list[int]) -> None: # noqa: ARG002
96
+ """Delete documents from the index."""
97
+ self.log.warning("Deletion not supported for local BM25 index")