kodit 0.1.15__tar.gz → 0.1.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit-0.1.16/.github/workflows/pull_request.yaml +35 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/test.yaml +7 -8
- kodit-0.1.16/.python-version +1 -0
- {kodit-0.1.15 → kodit-0.1.16}/Dockerfile +4 -4
- {kodit-0.1.15 → kodit-0.1.16}/PKG-INFO +3 -1
- {kodit-0.1.15 → kodit-0.1.16}/docs/_index.md +7 -1
- {kodit-0.1.15 → kodit-0.1.16}/pyproject.toml +2 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/_version.py +2 -2
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/cli.py +105 -19
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_factory.py +2 -2
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/embedding_provider.py +9 -2
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +19 -7
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/vectorchord_vector_search_service.py +24 -15
- kodit-0.1.16/src/kodit/enrichment/__init__.py +1 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_factory.py +23 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/__init__.py +1 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
- kodit-0.1.16/src/kodit/enrichment/enrichment_service.py +33 -0
- kodit-0.1.16/src/kodit/indexing/fusion.py +67 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/indexing_repository.py +20 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/indexing_service.py +120 -4
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/mcp.py +25 -16
- kodit-0.1.16/src/kodit/snippets/languages/go.scm +26 -0
- kodit-0.1.16/tests/experiments/similarity_test.py +73 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +87 -7
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/vectorchord_vector_search_service_test.py +1 -0
- kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/__init__.py +0 -0
- kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/openai_enrichment_provider_test.py +203 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/indexing/indexing_service_test.py +8 -5
- kodit-0.1.16/tests/kodit/snippets/__init__.py +0 -0
- kodit-0.1.16/tests/kodit/snippets/golang.go +28 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/method_extraction_test.py +38 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/smoke.sh +1 -1
- {kodit-0.1.15 → kodit-0.1.16}/uv.lock +2 -0
- kodit-0.1.15/.python-version +0 -1
- kodit-0.1.15/src/kodit/search/__init__.py +0 -1
- kodit-0.1.15/src/kodit/search/search_repository.py +0 -57
- kodit-0.1.15/src/kodit/search/search_service.py +0 -135
- kodit-0.1.15/tests/kodit/search/__init__.py +0 -1
- kodit-0.1.15/tests/kodit/search/search_repository_test.py +0 -57
- kodit-0.1.15/tests/kodit/search/search_service_test.py +0 -210
- {kodit-0.1.15 → kodit-0.1.16}/.cursor/rules/kodit.mdc +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/CODE_OF_CONDUCT.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/CONTRIBUTING.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/dependabot.yml +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/docker.yaml +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/docs.yaml +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/pypi-test.yaml +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/pypi.yaml +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.gitignore +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.vscode/launch.json +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/.vscode/settings.json +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/LICENSE +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/README.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/alembic.ini +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/docs/developer/index.md +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/.gitignore +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/app.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/keyword_search_factory.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/keyword_search_service.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/local_bm25.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/vectorchord_bm25.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/config.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/database.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_models.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/local_embedding_provider.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_repository.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/local_vector_search_service.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/vector_search_service.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/indexing_models.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/log.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/middleware.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/README +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/env.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/script.py.mako +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/versions/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/languages/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/languages/csharp.scm +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/languages/python.scm +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/method_snippets.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/snippets.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/source_models.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/source_repository.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/source_service.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/util/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/src/kodit/util/spinner.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/conftest.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/experiments/embedding.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/bm25/local_bm25_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/bm25/vectorchord_repository_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/cli_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/e2e.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/local_vector_search_service_test.py +0 -0
- {kodit-0.1.15/tests/kodit/snippets → kodit-0.1.16/tests/kodit/enrichment}/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/mcp_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/csharp.cs +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/detect_language_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/python.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/source/__init__.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/source/source_service_test.py +0 -0
- {kodit-0.1.15 → kodit-0.1.16}/tests/performance/similarity.py +0 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# This workflow will install dependencies, create coverage tests and run Pytest Coverage Comment
|
|
2
|
+
# For more information see: https://github.com/MishaKav/pytest-coverage-comment/
|
|
3
|
+
name: pytest-coverage-comment
|
|
4
|
+
on:
|
|
5
|
+
pull_request:
|
|
6
|
+
branches:
|
|
7
|
+
- "*"
|
|
8
|
+
|
|
9
|
+
# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
|
|
10
|
+
# `contents` is for permission to the contents of the repository.
|
|
11
|
+
# `pull-requests` is for permission to pull request
|
|
12
|
+
permissions:
|
|
13
|
+
contents: write
|
|
14
|
+
checks: write
|
|
15
|
+
pull-requests: write
|
|
16
|
+
|
|
17
|
+
jobs:
|
|
18
|
+
coverage-comment:
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version-file: ".python-version"
|
|
25
|
+
- uses: astral-sh/setup-uv@v5
|
|
26
|
+
- run: uv sync --locked --all-extras --dev
|
|
27
|
+
|
|
28
|
+
- name: Run tests
|
|
29
|
+
run: uv run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=src tests/kodit | tee pytest-coverage.txt
|
|
30
|
+
|
|
31
|
+
- name: Pytest coverage comment
|
|
32
|
+
uses: MishaKav/pytest-coverage-comment@main
|
|
33
|
+
with:
|
|
34
|
+
pytest-coverage-path: ./pytest-coverage.txt
|
|
35
|
+
junitxml-path: ./pytest.xml
|
|
@@ -9,7 +9,6 @@ on:
|
|
|
9
9
|
permissions:
|
|
10
10
|
contents: read # Needed to check out code
|
|
11
11
|
checks: write # Needed to report test results
|
|
12
|
-
pull-requests: write # Needed to add comments/annotations to PRs
|
|
13
12
|
|
|
14
13
|
jobs:
|
|
15
14
|
test:
|
|
@@ -37,12 +36,6 @@ jobs:
|
|
|
37
36
|
- name: Run tests
|
|
38
37
|
run: uv run pytest -s --cov=src --cov-report=xml tests/kodit
|
|
39
38
|
|
|
40
|
-
- name: Pytest coverage comment
|
|
41
|
-
if: github.event_name == 'pull_request'
|
|
42
|
-
uses: MishaKav/pytest-coverage-comment@v1.1.54
|
|
43
|
-
with:
|
|
44
|
-
pytest-xml-coverage-path: ./coverage.xml
|
|
45
|
-
|
|
46
39
|
build-package:
|
|
47
40
|
runs-on: ubuntu-latest
|
|
48
41
|
timeout-minutes: 10
|
|
@@ -69,12 +62,18 @@ jobs:
|
|
|
69
62
|
test-package:
|
|
70
63
|
needs: build-package
|
|
71
64
|
runs-on: ubuntu-latest
|
|
65
|
+
strategy:
|
|
66
|
+
matrix:
|
|
67
|
+
python-version:
|
|
68
|
+
- 3.12
|
|
69
|
+
- 3.13
|
|
72
70
|
timeout-minutes: 10
|
|
73
71
|
steps:
|
|
74
72
|
- uses: actions/checkout@v4
|
|
75
73
|
with:
|
|
76
74
|
sparse-checkout: |
|
|
77
75
|
tests/smoke.sh
|
|
76
|
+
uv.lock
|
|
78
77
|
sparse-checkout-cone-mode: false
|
|
79
78
|
|
|
80
79
|
- name: Download built package
|
|
@@ -86,7 +85,7 @@ jobs:
|
|
|
86
85
|
- name: "Set up Python"
|
|
87
86
|
uses: actions/setup-python@v5
|
|
88
87
|
with:
|
|
89
|
-
python-version:
|
|
88
|
+
python-version: ${{ matrix.python-version }}
|
|
90
89
|
|
|
91
90
|
- name: Install uv
|
|
92
91
|
uses: astral-sh/setup-uv@v5
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# syntax=docker/dockerfile:1.9
|
|
2
|
-
FROM python:3.
|
|
2
|
+
FROM python:3.13.3-slim-bookworm AS build
|
|
3
3
|
|
|
4
4
|
# The following does not work in Podman unless you build in Docker
|
|
5
5
|
# compatibility mode: <https://github.com/containers/podman/issues/8477>
|
|
@@ -23,12 +23,12 @@ COPY --from=ghcr.io/astral-sh/uv:0.7.2 /uv /usr/local/bin/uv
|
|
|
23
23
|
# - Silence uv complaining about not being able to use hard links,
|
|
24
24
|
# - tell uv to byte-compile packages for faster application startups,
|
|
25
25
|
# - prevent uv from accidentally downloading isolated Python builds,
|
|
26
|
-
# - pick a Python (use `/usr/bin/python3.
|
|
26
|
+
# - pick a Python (use `/usr/bin/python3.13` on uv 0.5.0 and later),
|
|
27
27
|
# - and finally declare `/app` as the target for `uv sync`.
|
|
28
28
|
ENV UV_LINK_MODE=copy \
|
|
29
29
|
UV_COMPILE_BYTECODE=1 \
|
|
30
30
|
UV_PYTHON_DOWNLOADS=never \
|
|
31
|
-
UV_PYTHON=python3.
|
|
31
|
+
UV_PYTHON=python3.13 \
|
|
32
32
|
UV_PROJECT_ENVIRONMENT=/app
|
|
33
33
|
|
|
34
34
|
# Synchronize DEPENDENCIES without the application itself.
|
|
@@ -60,7 +60,7 @@ RUN --mount=type=cache,target=/root/.cache \
|
|
|
60
60
|
|
|
61
61
|
##########################################################################
|
|
62
62
|
|
|
63
|
-
FROM python:3.
|
|
63
|
+
FROM python:3.13.3-slim-bookworm
|
|
64
64
|
SHELL ["sh", "-exc"]
|
|
65
65
|
|
|
66
66
|
ENV PATH=/app/bin:$PATH
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kodit
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.16
|
|
4
4
|
Summary: Code indexing for better AI code generation
|
|
5
5
|
Project-URL: Homepage, https://docs.helixml.tech/kodit/
|
|
6
6
|
Project-URL: Documentation, https://docs.helixml.tech/kodit/
|
|
@@ -15,6 +15,7 @@ Keywords: ai,indexing,mcp,rag
|
|
|
15
15
|
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
16
|
Classifier: Intended Audience :: Developers
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
19
|
Classifier: Topic :: Software Development :: Code Generators
|
|
19
20
|
Requires-Python: >=3.12
|
|
20
21
|
Requires-Dist: aiofiles>=24.1.0
|
|
@@ -42,6 +43,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.40
|
|
|
42
43
|
Requires-Dist: structlog>=25.3.0
|
|
43
44
|
Requires-Dist: tdqm>=0.0.1
|
|
44
45
|
Requires-Dist: tiktoken>=0.9.0
|
|
46
|
+
Requires-Dist: transformers>=4.51.3
|
|
45
47
|
Requires-Dist: tree-sitter-language-pack>=0.7.3
|
|
46
48
|
Requires-Dist: tree-sitter>=0.24.0
|
|
47
49
|
Requires-Dist: uritools>=5.0.0
|
|
@@ -169,7 +169,7 @@ recreate all indexes.
|
|
|
169
169
|
|
|
170
170
|
### Indexing
|
|
171
171
|
|
|
172
|
-
#### Default Provider
|
|
172
|
+
#### Default Indexing Provider
|
|
173
173
|
|
|
174
174
|
By default, Kodit will use small local models for semantic search and enrichment. If you
|
|
175
175
|
are using Kodit in a professional capacity, it is likely that the local model latency is
|
|
@@ -247,6 +247,12 @@ DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
|
|
|
247
247
|
DEFAULT_SEARCH_PROVIDER=vectorchord
|
|
248
248
|
```
|
|
249
249
|
|
|
250
|
+
### Enrichment
|
|
251
|
+
|
|
252
|
+
#### Default Enrichment Provider
|
|
253
|
+
|
|
254
|
+
The default enrichment provider is the same as [the default indexing provider](#default-indexing-provider).
|
|
255
|
+
|
|
250
256
|
## Managing Kodit
|
|
251
257
|
|
|
252
258
|
There is limited management functionality at this time. To delete indexes you must
|
|
@@ -18,6 +18,7 @@ classifiers = [
|
|
|
18
18
|
|
|
19
19
|
# Specify the Python versions you support here.
|
|
20
20
|
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
21
22
|
]
|
|
22
23
|
requires-python = ">=3.12"
|
|
23
24
|
dependencies = [
|
|
@@ -49,6 +50,7 @@ dependencies = [
|
|
|
49
50
|
"openai>=1.82.0",
|
|
50
51
|
"tiktoken>=0.9.0",
|
|
51
52
|
"asyncpg>=0.30.0",
|
|
53
|
+
"transformers>=4.51.3",
|
|
52
54
|
]
|
|
53
55
|
|
|
54
56
|
[dependency-groups]
|
|
@@ -17,11 +17,10 @@ from kodit.config import (
|
|
|
17
17
|
with_session,
|
|
18
18
|
)
|
|
19
19
|
from kodit.embedding.embedding_factory import embedding_factory
|
|
20
|
+
from kodit.enrichment.enrichment_factory import enrichment_factory
|
|
20
21
|
from kodit.indexing.indexing_repository import IndexRepository
|
|
21
|
-
from kodit.indexing.indexing_service import IndexService
|
|
22
|
+
from kodit.indexing.indexing_service import IndexService, SearchRequest
|
|
22
23
|
from kodit.log import configure_logging, configure_telemetry, log_event
|
|
23
|
-
from kodit.search.search_repository import SearchRepository
|
|
24
|
-
from kodit.search.search_service import SearchRequest, SearchService
|
|
25
24
|
from kodit.source.source_repository import SourceRepository
|
|
26
25
|
from kodit.source.source_service import SourceService
|
|
27
26
|
|
|
@@ -72,9 +71,13 @@ async def index(
|
|
|
72
71
|
repository=repository,
|
|
73
72
|
source_service=source_service,
|
|
74
73
|
keyword_search_provider=keyword_search_factory(app_context, session),
|
|
75
|
-
|
|
76
|
-
app_context=app_context, session=session
|
|
74
|
+
code_search_service=embedding_factory(
|
|
75
|
+
task_name="code", app_context=app_context, session=session
|
|
77
76
|
),
|
|
77
|
+
text_search_service=embedding_factory(
|
|
78
|
+
task_name="text", app_context=app_context, session=session
|
|
79
|
+
),
|
|
80
|
+
enrichment_service=enrichment_factory(app_context),
|
|
78
81
|
)
|
|
79
82
|
|
|
80
83
|
if not sources:
|
|
@@ -131,11 +134,20 @@ async def code(
|
|
|
131
134
|
|
|
132
135
|
This works best if your query is code.
|
|
133
136
|
"""
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
+
source_repository = SourceRepository(session)
|
|
138
|
+
source_service = SourceService(app_context.get_clone_dir(), source_repository)
|
|
139
|
+
repository = IndexRepository(session)
|
|
140
|
+
service = IndexService(
|
|
141
|
+
repository=repository,
|
|
142
|
+
source_service=source_service,
|
|
137
143
|
keyword_search_provider=keyword_search_factory(app_context, session),
|
|
138
|
-
|
|
144
|
+
code_search_service=embedding_factory(
|
|
145
|
+
task_name="code", app_context=app_context, session=session
|
|
146
|
+
),
|
|
147
|
+
text_search_service=embedding_factory(
|
|
148
|
+
task_name="text", app_context=app_context, session=session
|
|
149
|
+
),
|
|
150
|
+
enrichment_service=enrichment_factory(app_context),
|
|
139
151
|
)
|
|
140
152
|
|
|
141
153
|
snippets = await service.search(SearchRequest(code_query=query, top_k=top_k))
|
|
@@ -147,6 +159,7 @@ async def code(
|
|
|
147
159
|
for snippet in snippets:
|
|
148
160
|
click.echo("-" * 80)
|
|
149
161
|
click.echo(f"{snippet.uri}")
|
|
162
|
+
click.echo(f"Original scores: {snippet.original_scores}")
|
|
150
163
|
click.echo(snippet.content)
|
|
151
164
|
click.echo("-" * 80)
|
|
152
165
|
click.echo()
|
|
@@ -164,11 +177,20 @@ async def keyword(
|
|
|
164
177
|
top_k: int,
|
|
165
178
|
) -> None:
|
|
166
179
|
"""Search for snippets using keyword search."""
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
180
|
+
source_repository = SourceRepository(session)
|
|
181
|
+
source_service = SourceService(app_context.get_clone_dir(), source_repository)
|
|
182
|
+
repository = IndexRepository(session)
|
|
183
|
+
service = IndexService(
|
|
184
|
+
repository=repository,
|
|
185
|
+
source_service=source_service,
|
|
170
186
|
keyword_search_provider=keyword_search_factory(app_context, session),
|
|
171
|
-
|
|
187
|
+
code_search_service=embedding_factory(
|
|
188
|
+
task_name="code", app_context=app_context, session=session
|
|
189
|
+
),
|
|
190
|
+
text_search_service=embedding_factory(
|
|
191
|
+
task_name="text", app_context=app_context, session=session
|
|
192
|
+
),
|
|
193
|
+
enrichment_service=enrichment_factory(app_context),
|
|
172
194
|
)
|
|
173
195
|
|
|
174
196
|
snippets = await service.search(SearchRequest(keywords=keywords, top_k=top_k))
|
|
@@ -180,6 +202,53 @@ async def keyword(
|
|
|
180
202
|
for snippet in snippets:
|
|
181
203
|
click.echo("-" * 80)
|
|
182
204
|
click.echo(f"{snippet.uri}")
|
|
205
|
+
click.echo(f"Original scores: {snippet.original_scores}")
|
|
206
|
+
click.echo(snippet.content)
|
|
207
|
+
click.echo("-" * 80)
|
|
208
|
+
click.echo()
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@search.command()
|
|
212
|
+
@click.argument("query")
|
|
213
|
+
@click.option("--top-k", default=10, help="Number of snippets to retrieve")
|
|
214
|
+
@with_app_context
|
|
215
|
+
@with_session
|
|
216
|
+
async def text(
|
|
217
|
+
session: AsyncSession,
|
|
218
|
+
app_context: AppContext,
|
|
219
|
+
query: str,
|
|
220
|
+
top_k: int,
|
|
221
|
+
) -> None:
|
|
222
|
+
"""Search for snippets using semantic text search.
|
|
223
|
+
|
|
224
|
+
This works best if your query is text.
|
|
225
|
+
"""
|
|
226
|
+
source_repository = SourceRepository(session)
|
|
227
|
+
source_service = SourceService(app_context.get_clone_dir(), source_repository)
|
|
228
|
+
repository = IndexRepository(session)
|
|
229
|
+
service = IndexService(
|
|
230
|
+
repository=repository,
|
|
231
|
+
source_service=source_service,
|
|
232
|
+
keyword_search_provider=keyword_search_factory(app_context, session),
|
|
233
|
+
code_search_service=embedding_factory(
|
|
234
|
+
task_name="code", app_context=app_context, session=session
|
|
235
|
+
),
|
|
236
|
+
text_search_service=embedding_factory(
|
|
237
|
+
task_name="text", app_context=app_context, session=session
|
|
238
|
+
),
|
|
239
|
+
enrichment_service=enrichment_factory(app_context),
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
snippets = await service.search(SearchRequest(text_query=query, top_k=top_k))
|
|
243
|
+
|
|
244
|
+
if len(snippets) == 0:
|
|
245
|
+
click.echo("No snippets found")
|
|
246
|
+
return
|
|
247
|
+
|
|
248
|
+
for snippet in snippets:
|
|
249
|
+
click.echo("-" * 80)
|
|
250
|
+
click.echo(f"{snippet.uri}")
|
|
251
|
+
click.echo(f"Original scores: {snippet.original_scores}")
|
|
183
252
|
click.echo(snippet.content)
|
|
184
253
|
click.echo("-" * 80)
|
|
185
254
|
click.echo()
|
|
@@ -189,28 +258,44 @@ async def keyword(
|
|
|
189
258
|
@click.option("--top-k", default=10, help="Number of snippets to retrieve")
|
|
190
259
|
@click.option("--keywords", required=True, help="Comma separated list of keywords")
|
|
191
260
|
@click.option("--code", required=True, help="Semantic code search query")
|
|
261
|
+
@click.option("--text", required=True, help="Semantic text search query")
|
|
192
262
|
@with_app_context
|
|
193
263
|
@with_session
|
|
194
|
-
async def hybrid(
|
|
264
|
+
async def hybrid( # noqa: PLR0913
|
|
195
265
|
session: AsyncSession,
|
|
196
266
|
app_context: AppContext,
|
|
197
267
|
top_k: int,
|
|
198
268
|
keywords: str,
|
|
199
269
|
code: str,
|
|
270
|
+
text: str,
|
|
200
271
|
) -> None:
|
|
201
272
|
"""Search for snippets using hybrid search."""
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
273
|
+
source_repository = SourceRepository(session)
|
|
274
|
+
source_service = SourceService(app_context.get_clone_dir(), source_repository)
|
|
275
|
+
repository = IndexRepository(session)
|
|
276
|
+
service = IndexService(
|
|
277
|
+
repository=repository,
|
|
278
|
+
source_service=source_service,
|
|
205
279
|
keyword_search_provider=keyword_search_factory(app_context, session),
|
|
206
|
-
|
|
280
|
+
code_search_service=embedding_factory(
|
|
281
|
+
task_name="code", app_context=app_context, session=session
|
|
282
|
+
),
|
|
283
|
+
text_search_service=embedding_factory(
|
|
284
|
+
task_name="text", app_context=app_context, session=session
|
|
285
|
+
),
|
|
286
|
+
enrichment_service=enrichment_factory(app_context),
|
|
207
287
|
)
|
|
208
288
|
|
|
209
289
|
# Parse keywords into a list of strings
|
|
210
290
|
keywords_list = [k.strip().lower() for k in keywords.split(",")]
|
|
211
291
|
|
|
212
292
|
snippets = await service.search(
|
|
213
|
-
SearchRequest(
|
|
293
|
+
SearchRequest(
|
|
294
|
+
text_query=text,
|
|
295
|
+
keywords=keywords_list,
|
|
296
|
+
code_query=code,
|
|
297
|
+
top_k=top_k,
|
|
298
|
+
)
|
|
214
299
|
)
|
|
215
300
|
|
|
216
301
|
if len(snippets) == 0:
|
|
@@ -220,6 +305,7 @@ async def hybrid(
|
|
|
220
305
|
for snippet in snippets:
|
|
221
306
|
click.echo("-" * 80)
|
|
222
307
|
click.echo(f"{snippet.uri}")
|
|
308
|
+
click.echo(f"Original scores: {snippet.original_scores}")
|
|
223
309
|
click.echo(snippet.content)
|
|
224
310
|
click.echo("-" * 80)
|
|
225
311
|
click.echo()
|
|
@@ -21,7 +21,7 @@ from kodit.embedding.vectorchord_vector_search_service import (
|
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def embedding_factory(
|
|
24
|
-
app_context: AppContext, session: AsyncSession
|
|
24
|
+
task_name: str, app_context: AppContext, session: AsyncSession
|
|
25
25
|
) -> VectorSearchService:
|
|
26
26
|
"""Create an embedding service."""
|
|
27
27
|
embedding_repository = EmbeddingRepository(session=session)
|
|
@@ -33,7 +33,7 @@ def embedding_factory(
|
|
|
33
33
|
embedding_provider = LocalEmbeddingProvider(CODE)
|
|
34
34
|
|
|
35
35
|
if app_context.default_search.provider == "vectorchord":
|
|
36
|
-
return VectorChordVectorSearchService(session, embedding_provider)
|
|
36
|
+
return VectorChordVectorSearchService(task_name, session, embedding_provider)
|
|
37
37
|
if app_context.default_search.provider == "sqlite":
|
|
38
38
|
return LocalVectorSearchService(
|
|
39
39
|
embedding_repository=embedding_repository,
|
|
@@ -38,8 +38,15 @@ def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list
|
|
|
38
38
|
item_tokens = len(encoding.encode(next_item))
|
|
39
39
|
|
|
40
40
|
if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
# Loop around trying to truncate the snippet until it fits in the max
|
|
42
|
+
# embedding size
|
|
43
|
+
while item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
|
|
44
|
+
next_item = next_item[:-1]
|
|
45
|
+
item_tokens = len(encoding.encode(next_item))
|
|
46
|
+
|
|
47
|
+
data_to_process[0] = next_item
|
|
48
|
+
|
|
49
|
+
log.warning("Truncated snippet", snippet=next_item)
|
|
43
50
|
|
|
44
51
|
if current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
|
|
45
52
|
break
|
{kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/openai_embedding_provider.py
RENAMED
|
@@ -38,26 +38,38 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
|
|
|
38
38
|
# Process batches in parallel with a semaphore to limit concurrent requests
|
|
39
39
|
sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
|
|
40
40
|
|
|
41
|
-
|
|
41
|
+
# Create a list of tuples with a temporary id for each batch
|
|
42
|
+
# We need to do this so that we can return the results in the same order as the
|
|
43
|
+
# input data
|
|
44
|
+
input_data = [(i, batch) for i, batch in enumerate(batched_data)]
|
|
45
|
+
|
|
46
|
+
async def process_batch(
|
|
47
|
+
data: tuple[int, list[str]],
|
|
48
|
+
) -> tuple[int, list[Vector]]:
|
|
49
|
+
batch_id, batch = data
|
|
42
50
|
async with sem:
|
|
43
51
|
try:
|
|
44
52
|
response = await self.openai_client.embeddings.create(
|
|
45
53
|
model=self.model_name,
|
|
46
54
|
input=batch,
|
|
47
55
|
)
|
|
48
|
-
return [
|
|
56
|
+
return batch_id, [
|
|
49
57
|
[float(x) for x in embedding.embedding]
|
|
50
58
|
for embedding in response.data
|
|
51
59
|
]
|
|
52
60
|
except Exception as e:
|
|
53
61
|
self.log.exception("Error embedding batch", error=str(e))
|
|
54
|
-
return []
|
|
62
|
+
return batch_id, []
|
|
55
63
|
|
|
56
64
|
# Create tasks for all batches
|
|
57
|
-
tasks = [process_batch(batch) for batch in
|
|
65
|
+
tasks = [process_batch(batch) for batch in input_data]
|
|
58
66
|
|
|
59
67
|
# Process all batches and yield results as they complete
|
|
60
|
-
results: list[Vector] = []
|
|
68
|
+
results: list[tuple[int, list[Vector]]] = []
|
|
61
69
|
for task in asyncio.as_completed(tasks):
|
|
62
|
-
|
|
63
|
-
|
|
70
|
+
result = await task
|
|
71
|
+
results.append(result)
|
|
72
|
+
|
|
73
|
+
# Output in the same order as the input data
|
|
74
|
+
ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
|
|
75
|
+
return [item for sublist in ordered_results for item in sublist]
|
|
@@ -12,23 +12,20 @@ from kodit.embedding.vector_search_service import (
|
|
|
12
12
|
VectorSearchService,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
-
TABLE_NAME = "vectorchord_embeddings"
|
|
16
|
-
INDEX_NAME = f"{TABLE_NAME}_idx"
|
|
17
|
-
|
|
18
15
|
# SQL Queries
|
|
19
16
|
CREATE_VCHORD_EXTENSION = """
|
|
20
17
|
CREATE EXTENSION IF NOT EXISTS vchord CASCADE;
|
|
21
18
|
"""
|
|
22
19
|
|
|
23
|
-
CHECK_VCHORD_EMBEDDING_DIMENSION =
|
|
20
|
+
CHECK_VCHORD_EMBEDDING_DIMENSION = """
|
|
24
21
|
SELECT a.atttypmod as dimension
|
|
25
22
|
FROM pg_attribute a
|
|
26
23
|
JOIN pg_class c ON a.attrelid = c.oid
|
|
27
24
|
WHERE c.relname = '{TABLE_NAME}'
|
|
28
25
|
AND a.attname = 'embedding';
|
|
29
|
-
"""
|
|
26
|
+
"""
|
|
30
27
|
|
|
31
|
-
CREATE_VCHORD_INDEX =
|
|
28
|
+
CREATE_VCHORD_INDEX = """
|
|
32
29
|
CREATE INDEX IF NOT EXISTS {INDEX_NAME}
|
|
33
30
|
ON {TABLE_NAME}
|
|
34
31
|
USING vchordrq (embedding vector_l2_ops) WITH (options = $$
|
|
@@ -38,21 +35,21 @@ lists = []
|
|
|
38
35
|
$$);
|
|
39
36
|
"""
|
|
40
37
|
|
|
41
|
-
INSERT_QUERY =
|
|
38
|
+
INSERT_QUERY = """
|
|
42
39
|
INSERT INTO {TABLE_NAME} (snippet_id, embedding)
|
|
43
40
|
VALUES (:snippet_id, :embedding)
|
|
44
41
|
ON CONFLICT (snippet_id) DO UPDATE
|
|
45
42
|
SET embedding = EXCLUDED.embedding
|
|
46
|
-
"""
|
|
43
|
+
"""
|
|
47
44
|
|
|
48
45
|
# Note that <=> in vectorchord is cosine distance
|
|
49
46
|
# So scores go from 0 (similar) to 2 (opposite)
|
|
50
|
-
SEARCH_QUERY =
|
|
47
|
+
SEARCH_QUERY = """
|
|
51
48
|
SELECT snippet_id, embedding <=> :query as score
|
|
52
49
|
FROM {TABLE_NAME}
|
|
53
50
|
ORDER BY score ASC
|
|
54
51
|
LIMIT :top_k;
|
|
55
|
-
"""
|
|
52
|
+
"""
|
|
56
53
|
|
|
57
54
|
|
|
58
55
|
class VectorChordVectorSearchService(VectorSearchService):
|
|
@@ -60,6 +57,7 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
60
57
|
|
|
61
58
|
def __init__(
|
|
62
59
|
self,
|
|
60
|
+
task_name: str,
|
|
63
61
|
session: AsyncSession,
|
|
64
62
|
embedding_provider: EmbeddingProvider,
|
|
65
63
|
) -> None:
|
|
@@ -67,6 +65,8 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
67
65
|
self.embedding_provider = embedding_provider
|
|
68
66
|
self._session = session
|
|
69
67
|
self._initialized = False
|
|
68
|
+
self.table_name = f"vectorchord_{task_name}_embeddings"
|
|
69
|
+
self.index_name = f"{self.table_name}_idx"
|
|
70
70
|
|
|
71
71
|
async def _initialize(self) -> None:
|
|
72
72
|
"""Initialize the VectorChord environment."""
|
|
@@ -88,15 +88,23 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
88
88
|
vector_dim = (await self.embedding_provider.embed(["dimension"]))[0]
|
|
89
89
|
await self._session.execute(
|
|
90
90
|
text(
|
|
91
|
-
f"""CREATE TABLE IF NOT EXISTS {
|
|
91
|
+
f"""CREATE TABLE IF NOT EXISTS {self.table_name} (
|
|
92
92
|
id SERIAL PRIMARY KEY,
|
|
93
93
|
snippet_id INT NOT NULL UNIQUE,
|
|
94
94
|
embedding VECTOR({len(vector_dim)}) NOT NULL
|
|
95
95
|
);"""
|
|
96
96
|
)
|
|
97
97
|
)
|
|
98
|
-
await self._session.execute(
|
|
99
|
-
|
|
98
|
+
await self._session.execute(
|
|
99
|
+
text(
|
|
100
|
+
CREATE_VCHORD_INDEX.format(
|
|
101
|
+
TABLE_NAME=self.table_name, INDEX_NAME=self.index_name
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
)
|
|
105
|
+
result = await self._session.execute(
|
|
106
|
+
text(CHECK_VCHORD_EMBEDDING_DIMENSION.format(TABLE_NAME=self.table_name))
|
|
107
|
+
)
|
|
100
108
|
vector_dim_from_db = result.scalar_one()
|
|
101
109
|
if vector_dim_from_db != len(vector_dim):
|
|
102
110
|
msg = (
|
|
@@ -123,7 +131,7 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
123
131
|
embeddings = await self.embedding_provider.embed([doc.text for doc in data])
|
|
124
132
|
# Execute inserts
|
|
125
133
|
await self._execute(
|
|
126
|
-
text(INSERT_QUERY),
|
|
134
|
+
text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
|
|
127
135
|
[
|
|
128
136
|
{"snippet_id": doc.snippet_id, "embedding": str(embedding)}
|
|
129
137
|
for doc, embedding in zip(data, embeddings, strict=True)
|
|
@@ -135,7 +143,8 @@ class VectorChordVectorSearchService(VectorSearchService):
|
|
|
135
143
|
"""Query the embedding model."""
|
|
136
144
|
embedding = await self.embedding_provider.embed([query])
|
|
137
145
|
result = await self._execute(
|
|
138
|
-
text(SEARCH_QUERY
|
|
146
|
+
text(SEARCH_QUERY.format(TABLE_NAME=self.table_name)),
|
|
147
|
+
{"query": str(embedding[0]), "top_k": top_k},
|
|
139
148
|
)
|
|
140
149
|
rows = result.mappings().all()
|
|
141
150
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Enrichment."""
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Embedding service."""
|
|
2
|
+
|
|
3
|
+
from kodit.config import AppContext
|
|
4
|
+
from kodit.enrichment.enrichment_provider.local_enrichment_provider import (
|
|
5
|
+
LocalEnrichmentProvider,
|
|
6
|
+
)
|
|
7
|
+
from kodit.enrichment.enrichment_provider.openai_enrichment_provider import (
|
|
8
|
+
OpenAIEnrichmentProvider,
|
|
9
|
+
)
|
|
10
|
+
from kodit.enrichment.enrichment_service import (
|
|
11
|
+
EnrichmentService,
|
|
12
|
+
LLMEnrichmentService,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def enrichment_factory(app_context: AppContext) -> EnrichmentService:
|
|
17
|
+
"""Create an embedding service."""
|
|
18
|
+
openai_client = app_context.get_default_openai_client()
|
|
19
|
+
if openai_client is not None:
|
|
20
|
+
enrichment_provider = OpenAIEnrichmentProvider(openai_client=openai_client)
|
|
21
|
+
return LLMEnrichmentService(enrichment_provider)
|
|
22
|
+
|
|
23
|
+
return LLMEnrichmentService(LocalEnrichmentProvider())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Enrichment provider."""
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Enrichment provider."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
ENRICHMENT_SYSTEM_PROMPT = """
|
|
6
|
+
You are a professional software developer. You will be given a snippet of code.
|
|
7
|
+
Please provide a concise explanation of the code.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EnrichmentProvider(ABC):
|
|
12
|
+
"""Enrichment provider."""
|
|
13
|
+
|
|
14
|
+
@abstractmethod
|
|
15
|
+
async def enrich(self, data: list[str]) -> list[str]:
|
|
16
|
+
"""Enrich a list of strings."""
|