kodit 0.1.15__tar.gz → 0.1.16__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic.

Files changed (124)
  1. kodit-0.1.16/.github/workflows/pull_request.yaml +35 -0
  2. {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/test.yaml +7 -8
  3. kodit-0.1.16/.python-version +1 -0
  4. {kodit-0.1.15 → kodit-0.1.16}/Dockerfile +4 -4
  5. {kodit-0.1.15 → kodit-0.1.16}/PKG-INFO +3 -1
  6. {kodit-0.1.15 → kodit-0.1.16}/docs/_index.md +7 -1
  7. {kodit-0.1.15 → kodit-0.1.16}/pyproject.toml +2 -0
  8. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/_version.py +2 -2
  9. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/cli.py +105 -19
  10. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_factory.py +2 -2
  11. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/embedding_provider.py +9 -2
  12. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/openai_embedding_provider.py +19 -7
  13. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/vectorchord_vector_search_service.py +24 -15
  14. kodit-0.1.16/src/kodit/enrichment/__init__.py +1 -0
  15. kodit-0.1.16/src/kodit/enrichment/enrichment_factory.py +23 -0
  16. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/__init__.py +1 -0
  17. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
  18. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
  19. kodit-0.1.16/src/kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
  20. kodit-0.1.16/src/kodit/enrichment/enrichment_service.py +33 -0
  21. kodit-0.1.16/src/kodit/indexing/fusion.py +67 -0
  22. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/indexing_repository.py +20 -0
  23. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/indexing_service.py +120 -4
  24. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/mcp.py +25 -16
  25. kodit-0.1.16/src/kodit/snippets/languages/go.scm +26 -0
  26. kodit-0.1.16/tests/experiments/similarity_test.py +73 -0
  27. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/embedding_provider/openai_embedding_provider_test.py +87 -7
  28. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/vectorchord_vector_search_service_test.py +1 -0
  29. kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/__init__.py +0 -0
  30. kodit-0.1.16/tests/kodit/enrichment/enrichment_provider/openai_enrichment_provider_test.py +203 -0
  31. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/indexing/indexing_service_test.py +8 -5
  32. kodit-0.1.16/tests/kodit/snippets/__init__.py +0 -0
  33. kodit-0.1.16/tests/kodit/snippets/golang.go +28 -0
  34. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/method_extraction_test.py +38 -0
  35. {kodit-0.1.15 → kodit-0.1.16}/tests/smoke.sh +1 -1
  36. {kodit-0.1.15 → kodit-0.1.16}/uv.lock +2 -0
  37. kodit-0.1.15/.python-version +0 -1
  38. kodit-0.1.15/src/kodit/search/__init__.py +0 -1
  39. kodit-0.1.15/src/kodit/search/search_repository.py +0 -57
  40. kodit-0.1.15/src/kodit/search/search_service.py +0 -135
  41. kodit-0.1.15/tests/kodit/search/__init__.py +0 -1
  42. kodit-0.1.15/tests/kodit/search/search_repository_test.py +0 -57
  43. kodit-0.1.15/tests/kodit/search/search_service_test.py +0 -210
  44. {kodit-0.1.15 → kodit-0.1.16}/.cursor/rules/kodit.mdc +0 -0
  45. {kodit-0.1.15 → kodit-0.1.16}/.github/CODE_OF_CONDUCT.md +0 -0
  46. {kodit-0.1.15 → kodit-0.1.16}/.github/CONTRIBUTING.md +0 -0
  47. {kodit-0.1.15 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  48. {kodit-0.1.15 → kodit-0.1.16}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  49. {kodit-0.1.15 → kodit-0.1.16}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  50. {kodit-0.1.15 → kodit-0.1.16}/.github/dependabot.yml +0 -0
  51. {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/docker.yaml +0 -0
  52. {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/docs.yaml +0 -0
  53. {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/pypi-test.yaml +0 -0
  54. {kodit-0.1.15 → kodit-0.1.16}/.github/workflows/pypi.yaml +0 -0
  55. {kodit-0.1.15 → kodit-0.1.16}/.gitignore +0 -0
  56. {kodit-0.1.15 → kodit-0.1.16}/.vscode/launch.json +0 -0
  57. {kodit-0.1.15 → kodit-0.1.16}/.vscode/settings.json +0 -0
  58. {kodit-0.1.15 → kodit-0.1.16}/LICENSE +0 -0
  59. {kodit-0.1.15 → kodit-0.1.16}/README.md +0 -0
  60. {kodit-0.1.15 → kodit-0.1.16}/alembic.ini +0 -0
  61. {kodit-0.1.15 → kodit-0.1.16}/docs/developer/index.md +0 -0
  62. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/.gitignore +0 -0
  63. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/__init__.py +0 -0
  64. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/app.py +0 -0
  65. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/__init__.py +0 -0
  66. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/keyword_search_factory.py +0 -0
  67. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/keyword_search_service.py +0 -0
  68. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/local_bm25.py +0 -0
  69. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/bm25/vectorchord_bm25.py +0 -0
  70. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/config.py +0 -0
  71. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/database.py +0 -0
  72. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/__init__.py +0 -0
  73. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_models.py +0 -0
  74. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/__init__.py +0 -0
  75. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -0
  76. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_provider/local_embedding_provider.py +0 -0
  77. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/embedding_repository.py +0 -0
  78. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/local_vector_search_service.py +0 -0
  79. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/embedding/vector_search_service.py +0 -0
  80. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/__init__.py +0 -0
  81. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/indexing/indexing_models.py +0 -0
  82. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/log.py +0 -0
  83. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/middleware.py +0 -0
  84. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/README +0 -0
  85. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/__init__.py +0 -0
  86. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/env.py +0 -0
  87. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/script.py.mako +0 -0
  88. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +0 -0
  89. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/versions/85155663351e_initial.py +0 -0
  90. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/migrations/versions/__init__.py +0 -0
  91. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/__init__.py +0 -0
  92. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/languages/__init__.py +0 -0
  93. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/languages/csharp.scm +0 -0
  94. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/languages/python.scm +0 -0
  95. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/method_snippets.py +0 -0
  96. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/snippets/snippets.py +0 -0
  97. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/__init__.py +0 -0
  98. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/source_models.py +0 -0
  99. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/source_repository.py +0 -0
  100. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/source/source_service.py +0 -0
  101. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/util/__init__.py +0 -0
  102. {kodit-0.1.15 → kodit-0.1.16}/src/kodit/util/spinner.py +0 -0
  103. {kodit-0.1.15 → kodit-0.1.16}/tests/__init__.py +0 -0
  104. {kodit-0.1.15 → kodit-0.1.16}/tests/conftest.py +0 -0
  105. {kodit-0.1.15 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt.txt +0 -0
  106. {kodit-0.1.15 → kodit-0.1.16}/tests/experiments/cline-prompt-regression-tests/cline_prompt_test.py +0 -0
  107. {kodit-0.1.15 → kodit-0.1.16}/tests/experiments/embedding.py +0 -0
  108. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/__init__.py +0 -0
  109. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/bm25/local_bm25_test.py +0 -0
  110. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/bm25/vectorchord_repository_test.py +0 -0
  111. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/cli_test.py +0 -0
  112. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/e2e.py +0 -0
  113. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/__init__.py +0 -0
  114. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/embedding_provider/local_embedding_provider_test.py +0 -0
  115. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/embedding/local_vector_search_service_test.py +0 -0
  116. {kodit-0.1.15/tests/kodit/snippets → kodit-0.1.16/tests/kodit/enrichment}/__init__.py +0 -0
  117. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/indexing/__init__.py +0 -0
  118. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/mcp_test.py +0 -0
  119. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/csharp.cs +0 -0
  120. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/detect_language_test.py +0 -0
  121. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/snippets/python.py +0 -0
  122. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/source/__init__.py +0 -0
  123. {kodit-0.1.15 → kodit-0.1.16}/tests/kodit/source/source_service_test.py +0 -0
  124. {kodit-0.1.15 → kodit-0.1.16}/tests/performance/similarity.py +0 -0
@@ -0,0 +1,35 @@
1
+ # This workflow will install dependencies, create coverage tests and run Pytest Coverage Comment
2
+ # For more information see: https://github.com/MishaKav/pytest-coverage-comment/
3
+ name: pytest-coverage-comment
4
+ on:
5
+ pull_request:
6
+ branches:
7
+ - "*"
8
+
9
+ # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
10
+ # `contents` is for permission to the contents of the repository.
11
+ # `pull-requests` is for permission to pull request
12
+ permissions:
13
+ contents: write
14
+ checks: write
15
+ pull-requests: write
16
+
17
+ jobs:
18
+ coverage-comment:
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version-file: ".python-version"
25
+ - uses: astral-sh/setup-uv@v5
26
+ - run: uv sync --locked --all-extras --dev
27
+
28
+ - name: Run tests
29
+ run: uv run pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=src tests/kodit | tee pytest-coverage.txt
30
+
31
+ - name: Pytest coverage comment
32
+ uses: MishaKav/pytest-coverage-comment@main
33
+ with:
34
+ pytest-coverage-path: ./pytest-coverage.txt
35
+ junitxml-path: ./pytest.xml
@@ -9,7 +9,6 @@ on:
9
9
  permissions:
10
10
  contents: read # Needed to check out code
11
11
  checks: write # Needed to report test results
12
- pull-requests: write # Needed to add comments/annotations to PRs
13
12
 
14
13
  jobs:
15
14
  test:
@@ -37,12 +36,6 @@ jobs:
37
36
  - name: Run tests
38
37
  run: uv run pytest -s --cov=src --cov-report=xml tests/kodit
39
38
 
40
- - name: Pytest coverage comment
41
- if: github.event_name == 'pull_request'
42
- uses: MishaKav/pytest-coverage-comment@v1.1.54
43
- with:
44
- pytest-xml-coverage-path: ./coverage.xml
45
-
46
39
  build-package:
47
40
  runs-on: ubuntu-latest
48
41
  timeout-minutes: 10
@@ -69,12 +62,18 @@ jobs:
69
62
  test-package:
70
63
  needs: build-package
71
64
  runs-on: ubuntu-latest
65
+ strategy:
66
+ matrix:
67
+ python-version:
68
+ - 3.12
69
+ - 3.13
72
70
  timeout-minutes: 10
73
71
  steps:
74
72
  - uses: actions/checkout@v4
75
73
  with:
76
74
  sparse-checkout: |
77
75
  tests/smoke.sh
76
+ uv.lock
78
77
  sparse-checkout-cone-mode: false
79
78
 
80
79
  - name: Download built package
@@ -86,7 +85,7 @@ jobs:
86
85
  - name: "Set up Python"
87
86
  uses: actions/setup-python@v5
88
87
  with:
89
- python-version: 3.12
88
+ python-version: ${{ matrix.python-version }}
90
89
 
91
90
  - name: Install uv
92
91
  uses: astral-sh/setup-uv@v5
@@ -0,0 +1 @@
1
+ 3.13
@@ -1,5 +1,5 @@
1
1
  # syntax=docker/dockerfile:1.9
2
- FROM python:3.12.10-slim-bookworm AS build
2
+ FROM python:3.13.3-slim-bookworm AS build
3
3
 
4
4
  # The following does not work in Podman unless you build in Docker
5
5
  # compatibility mode: <https://github.com/containers/podman/issues/8477>
@@ -23,12 +23,12 @@ COPY --from=ghcr.io/astral-sh/uv:0.7.2 /uv /usr/local/bin/uv
23
23
  # - Silence uv complaining about not being able to use hard links,
24
24
  # - tell uv to byte-compile packages for faster application startups,
25
25
  # - prevent uv from accidentally downloading isolated Python builds,
26
- # - pick a Python (use `/usr/bin/python3.12` on uv 0.5.0 and later),
26
+ # - pick a Python (use `/usr/bin/python3.13` on uv 0.5.0 and later),
27
27
  # - and finally declare `/app` as the target for `uv sync`.
28
28
  ENV UV_LINK_MODE=copy \
29
29
  UV_COMPILE_BYTECODE=1 \
30
30
  UV_PYTHON_DOWNLOADS=never \
31
- UV_PYTHON=python3.12 \
31
+ UV_PYTHON=python3.13 \
32
32
  UV_PROJECT_ENVIRONMENT=/app
33
33
 
34
34
  # Synchronize DEPENDENCIES without the application itself.
@@ -60,7 +60,7 @@ RUN --mount=type=cache,target=/root/.cache \
60
60
 
61
61
  ##########################################################################
62
62
 
63
- FROM python:3.12.10-slim-bookworm
63
+ FROM python:3.13.3-slim-bookworm
64
64
  SHELL ["sh", "-exc"]
65
65
 
66
66
  ENV PATH=/app/bin:$PATH
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.1.15
3
+ Version: 0.1.16
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -15,6 +15,7 @@ Keywords: ai,indexing,mcp,rag
15
15
  Classifier: Development Status :: 2 - Pre-Alpha
16
16
  Classifier: Intended Audience :: Developers
17
17
  Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
18
19
  Classifier: Topic :: Software Development :: Code Generators
19
20
  Requires-Python: >=3.12
20
21
  Requires-Dist: aiofiles>=24.1.0
@@ -42,6 +43,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.40
42
43
  Requires-Dist: structlog>=25.3.0
43
44
  Requires-Dist: tdqm>=0.0.1
44
45
  Requires-Dist: tiktoken>=0.9.0
46
+ Requires-Dist: transformers>=4.51.3
45
47
  Requires-Dist: tree-sitter-language-pack>=0.7.3
46
48
  Requires-Dist: tree-sitter>=0.24.0
47
49
  Requires-Dist: uritools>=5.0.0
@@ -169,7 +169,7 @@ recreate all indexes.
169
169
 
170
170
  ### Indexing
171
171
 
172
- #### Default Provider
172
+ #### Default Indexing Provider
173
173
 
174
174
  By default, Kodit will use small local models for semantic search and enrichment. If you
175
175
  are using Kodit in a professional capacity, it is likely that the local model latency is
@@ -247,6 +247,12 @@ DB_URL=postgresql+asyncpg://postgres:mysecretpassword@localhost:5432/kodit
247
247
  DEFAULT_SEARCH_PROVIDER=vectorchord
248
248
  ```
249
249
 
250
+ ### Enrichment
251
+
252
+ #### Default Enrichment Provider
253
+
254
+ The default enrichment provider is the same as [the default indexing provider](#default-indexing-provider).
255
+
250
256
  ## Managing Kodit
251
257
 
252
258
  There is limited management functionality at this time. To delete indexes you must
@@ -18,6 +18,7 @@ classifiers = [
18
18
 
19
19
  # Specify the Python versions you support here.
20
20
  "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
21
22
  ]
22
23
  requires-python = ">=3.12"
23
24
  dependencies = [
@@ -49,6 +50,7 @@ dependencies = [
49
50
  "openai>=1.82.0",
50
51
  "tiktoken>=0.9.0",
51
52
  "asyncpg>=0.30.0",
53
+ "transformers>=4.51.3",
52
54
  ]
53
55
 
54
56
  [dependency-groups]
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.15'
21
- __version_tuple__ = version_tuple = (0, 1, 15)
20
+ __version__ = version = '0.1.16'
21
+ __version_tuple__ = version_tuple = (0, 1, 16)
@@ -17,11 +17,10 @@ from kodit.config import (
17
17
  with_session,
18
18
  )
19
19
  from kodit.embedding.embedding_factory import embedding_factory
20
+ from kodit.enrichment.enrichment_factory import enrichment_factory
20
21
  from kodit.indexing.indexing_repository import IndexRepository
21
- from kodit.indexing.indexing_service import IndexService
22
+ from kodit.indexing.indexing_service import IndexService, SearchRequest
22
23
  from kodit.log import configure_logging, configure_telemetry, log_event
23
- from kodit.search.search_repository import SearchRepository
24
- from kodit.search.search_service import SearchRequest, SearchService
25
24
  from kodit.source.source_repository import SourceRepository
26
25
  from kodit.source.source_service import SourceService
27
26
 
@@ -72,9 +71,13 @@ async def index(
72
71
  repository=repository,
73
72
  source_service=source_service,
74
73
  keyword_search_provider=keyword_search_factory(app_context, session),
75
- vector_search_service=embedding_factory(
76
- app_context=app_context, session=session
74
+ code_search_service=embedding_factory(
75
+ task_name="code", app_context=app_context, session=session
77
76
  ),
77
+ text_search_service=embedding_factory(
78
+ task_name="text", app_context=app_context, session=session
79
+ ),
80
+ enrichment_service=enrichment_factory(app_context),
78
81
  )
79
82
 
80
83
  if not sources:
@@ -131,11 +134,20 @@ async def code(
131
134
 
132
135
  This works best if your query is code.
133
136
  """
134
- repository = SearchRepository(session)
135
- service = SearchService(
136
- repository,
137
+ source_repository = SourceRepository(session)
138
+ source_service = SourceService(app_context.get_clone_dir(), source_repository)
139
+ repository = IndexRepository(session)
140
+ service = IndexService(
141
+ repository=repository,
142
+ source_service=source_service,
137
143
  keyword_search_provider=keyword_search_factory(app_context, session),
138
- embedding_service=embedding_factory(app_context=app_context, session=session),
144
+ code_search_service=embedding_factory(
145
+ task_name="code", app_context=app_context, session=session
146
+ ),
147
+ text_search_service=embedding_factory(
148
+ task_name="text", app_context=app_context, session=session
149
+ ),
150
+ enrichment_service=enrichment_factory(app_context),
139
151
  )
140
152
 
141
153
  snippets = await service.search(SearchRequest(code_query=query, top_k=top_k))
@@ -147,6 +159,7 @@ async def code(
147
159
  for snippet in snippets:
148
160
  click.echo("-" * 80)
149
161
  click.echo(f"{snippet.uri}")
162
+ click.echo(f"Original scores: {snippet.original_scores}")
150
163
  click.echo(snippet.content)
151
164
  click.echo("-" * 80)
152
165
  click.echo()
@@ -164,11 +177,20 @@ async def keyword(
164
177
  top_k: int,
165
178
  ) -> None:
166
179
  """Search for snippets using keyword search."""
167
- repository = SearchRepository(session)
168
- service = SearchService(
169
- repository,
180
+ source_repository = SourceRepository(session)
181
+ source_service = SourceService(app_context.get_clone_dir(), source_repository)
182
+ repository = IndexRepository(session)
183
+ service = IndexService(
184
+ repository=repository,
185
+ source_service=source_service,
170
186
  keyword_search_provider=keyword_search_factory(app_context, session),
171
- embedding_service=embedding_factory(app_context=app_context, session=session),
187
+ code_search_service=embedding_factory(
188
+ task_name="code", app_context=app_context, session=session
189
+ ),
190
+ text_search_service=embedding_factory(
191
+ task_name="text", app_context=app_context, session=session
192
+ ),
193
+ enrichment_service=enrichment_factory(app_context),
172
194
  )
173
195
 
174
196
  snippets = await service.search(SearchRequest(keywords=keywords, top_k=top_k))
@@ -180,6 +202,53 @@ async def keyword(
180
202
  for snippet in snippets:
181
203
  click.echo("-" * 80)
182
204
  click.echo(f"{snippet.uri}")
205
+ click.echo(f"Original scores: {snippet.original_scores}")
206
+ click.echo(snippet.content)
207
+ click.echo("-" * 80)
208
+ click.echo()
209
+
210
+
211
+ @search.command()
212
+ @click.argument("query")
213
+ @click.option("--top-k", default=10, help="Number of snippets to retrieve")
214
+ @with_app_context
215
+ @with_session
216
+ async def text(
217
+ session: AsyncSession,
218
+ app_context: AppContext,
219
+ query: str,
220
+ top_k: int,
221
+ ) -> None:
222
+ """Search for snippets using semantic text search.
223
+
224
+ This works best if your query is text.
225
+ """
226
+ source_repository = SourceRepository(session)
227
+ source_service = SourceService(app_context.get_clone_dir(), source_repository)
228
+ repository = IndexRepository(session)
229
+ service = IndexService(
230
+ repository=repository,
231
+ source_service=source_service,
232
+ keyword_search_provider=keyword_search_factory(app_context, session),
233
+ code_search_service=embedding_factory(
234
+ task_name="code", app_context=app_context, session=session
235
+ ),
236
+ text_search_service=embedding_factory(
237
+ task_name="text", app_context=app_context, session=session
238
+ ),
239
+ enrichment_service=enrichment_factory(app_context),
240
+ )
241
+
242
+ snippets = await service.search(SearchRequest(text_query=query, top_k=top_k))
243
+
244
+ if len(snippets) == 0:
245
+ click.echo("No snippets found")
246
+ return
247
+
248
+ for snippet in snippets:
249
+ click.echo("-" * 80)
250
+ click.echo(f"{snippet.uri}")
251
+ click.echo(f"Original scores: {snippet.original_scores}")
183
252
  click.echo(snippet.content)
184
253
  click.echo("-" * 80)
185
254
  click.echo()
@@ -189,28 +258,44 @@ async def keyword(
189
258
  @click.option("--top-k", default=10, help="Number of snippets to retrieve")
190
259
  @click.option("--keywords", required=True, help="Comma separated list of keywords")
191
260
  @click.option("--code", required=True, help="Semantic code search query")
261
+ @click.option("--text", required=True, help="Semantic text search query")
192
262
  @with_app_context
193
263
  @with_session
194
- async def hybrid(
264
+ async def hybrid( # noqa: PLR0913
195
265
  session: AsyncSession,
196
266
  app_context: AppContext,
197
267
  top_k: int,
198
268
  keywords: str,
199
269
  code: str,
270
+ text: str,
200
271
  ) -> None:
201
272
  """Search for snippets using hybrid search."""
202
- repository = SearchRepository(session)
203
- service = SearchService(
204
- repository,
273
+ source_repository = SourceRepository(session)
274
+ source_service = SourceService(app_context.get_clone_dir(), source_repository)
275
+ repository = IndexRepository(session)
276
+ service = IndexService(
277
+ repository=repository,
278
+ source_service=source_service,
205
279
  keyword_search_provider=keyword_search_factory(app_context, session),
206
- embedding_service=embedding_factory(app_context=app_context, session=session),
280
+ code_search_service=embedding_factory(
281
+ task_name="code", app_context=app_context, session=session
282
+ ),
283
+ text_search_service=embedding_factory(
284
+ task_name="text", app_context=app_context, session=session
285
+ ),
286
+ enrichment_service=enrichment_factory(app_context),
207
287
  )
208
288
 
209
289
  # Parse keywords into a list of strings
210
290
  keywords_list = [k.strip().lower() for k in keywords.split(",")]
211
291
 
212
292
  snippets = await service.search(
213
- SearchRequest(keywords=keywords_list, code_query=code, top_k=top_k)
293
+ SearchRequest(
294
+ text_query=text,
295
+ keywords=keywords_list,
296
+ code_query=code,
297
+ top_k=top_k,
298
+ )
214
299
  )
215
300
 
216
301
  if len(snippets) == 0:
@@ -220,6 +305,7 @@ async def hybrid(
220
305
  for snippet in snippets:
221
306
  click.echo("-" * 80)
222
307
  click.echo(f"{snippet.uri}")
308
+ click.echo(f"Original scores: {snippet.original_scores}")
223
309
  click.echo(snippet.content)
224
310
  click.echo("-" * 80)
225
311
  click.echo()
@@ -21,7 +21,7 @@ from kodit.embedding.vectorchord_vector_search_service import (
21
21
 
22
22
 
23
23
  def embedding_factory(
24
- app_context: AppContext, session: AsyncSession
24
+ task_name: str, app_context: AppContext, session: AsyncSession
25
25
  ) -> VectorSearchService:
26
26
  """Create an embedding service."""
27
27
  embedding_repository = EmbeddingRepository(session=session)
@@ -33,7 +33,7 @@ def embedding_factory(
33
33
  embedding_provider = LocalEmbeddingProvider(CODE)
34
34
 
35
35
  if app_context.default_search.provider == "vectorchord":
36
- return VectorChordVectorSearchService(session, embedding_provider)
36
+ return VectorChordVectorSearchService(task_name, session, embedding_provider)
37
37
  if app_context.default_search.provider == "sqlite":
38
38
  return LocalVectorSearchService(
39
39
  embedding_repository=embedding_repository,
@@ -38,8 +38,15 @@ def split_sub_batches(encoding: tiktoken.Encoding, data: list[str]) -> list[list
38
38
  item_tokens = len(encoding.encode(next_item))
39
39
 
40
40
  if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
41
- log.warning("Skipping too long snippet", snippet=data_to_process.pop(0))
42
- continue
41
+ # Loop around trying to truncate the snippet until it fits in the max
42
+ # embedding size
43
+ while item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
44
+ next_item = next_item[:-1]
45
+ item_tokens = len(encoding.encode(next_item))
46
+
47
+ data_to_process[0] = next_item
48
+
49
+ log.warning("Truncated snippet", snippet=next_item)
43
50
 
44
51
  if current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
45
52
  break
@@ -38,26 +38,38 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
38
38
  # Process batches in parallel with a semaphore to limit concurrent requests
39
39
  sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
40
40
 
41
- async def process_batch(batch: list[str]) -> list[Vector]:
41
+ # Create a list of tuples with a temporary id for each batch
42
+ # We need to do this so that we can return the results in the same order as the
43
+ # input data
44
+ input_data = [(i, batch) for i, batch in enumerate(batched_data)]
45
+
46
+ async def process_batch(
47
+ data: tuple[int, list[str]],
48
+ ) -> tuple[int, list[Vector]]:
49
+ batch_id, batch = data
42
50
  async with sem:
43
51
  try:
44
52
  response = await self.openai_client.embeddings.create(
45
53
  model=self.model_name,
46
54
  input=batch,
47
55
  )
48
- return [
56
+ return batch_id, [
49
57
  [float(x) for x in embedding.embedding]
50
58
  for embedding in response.data
51
59
  ]
52
60
  except Exception as e:
53
61
  self.log.exception("Error embedding batch", error=str(e))
54
- return []
62
+ return batch_id, []
55
63
 
56
64
  # Create tasks for all batches
57
- tasks = [process_batch(batch) for batch in batched_data]
65
+ tasks = [process_batch(batch) for batch in input_data]
58
66
 
59
67
  # Process all batches and yield results as they complete
60
- results: list[Vector] = []
68
+ results: list[tuple[int, list[Vector]]] = []
61
69
  for task in asyncio.as_completed(tasks):
62
- results.extend(await task)
63
- return results
70
+ result = await task
71
+ results.append(result)
72
+
73
+ # Output in the same order as the input data
74
+ ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
75
+ return [item for sublist in ordered_results for item in sublist]
@@ -12,23 +12,20 @@ from kodit.embedding.vector_search_service import (
12
12
  VectorSearchService,
13
13
  )
14
14
 
15
- TABLE_NAME = "vectorchord_embeddings"
16
- INDEX_NAME = f"{TABLE_NAME}_idx"
17
-
18
15
  # SQL Queries
19
16
  CREATE_VCHORD_EXTENSION = """
20
17
  CREATE EXTENSION IF NOT EXISTS vchord CASCADE;
21
18
  """
22
19
 
23
- CHECK_VCHORD_EMBEDDING_DIMENSION = f"""
20
+ CHECK_VCHORD_EMBEDDING_DIMENSION = """
24
21
  SELECT a.atttypmod as dimension
25
22
  FROM pg_attribute a
26
23
  JOIN pg_class c ON a.attrelid = c.oid
27
24
  WHERE c.relname = '{TABLE_NAME}'
28
25
  AND a.attname = 'embedding';
29
- """ # noqa: S608
26
+ """
30
27
 
31
- CREATE_VCHORD_INDEX = f"""
28
+ CREATE_VCHORD_INDEX = """
32
29
  CREATE INDEX IF NOT EXISTS {INDEX_NAME}
33
30
  ON {TABLE_NAME}
34
31
  USING vchordrq (embedding vector_l2_ops) WITH (options = $$
@@ -38,21 +35,21 @@ lists = []
38
35
  $$);
39
36
  """
40
37
 
41
- INSERT_QUERY = f"""
38
+ INSERT_QUERY = """
42
39
  INSERT INTO {TABLE_NAME} (snippet_id, embedding)
43
40
  VALUES (:snippet_id, :embedding)
44
41
  ON CONFLICT (snippet_id) DO UPDATE
45
42
  SET embedding = EXCLUDED.embedding
46
- """ # noqa: S608
43
+ """
47
44
 
48
45
  # Note that <=> in vectorchord is cosine distance
49
46
  # So scores go from 0 (similar) to 2 (opposite)
50
- SEARCH_QUERY = f"""
47
+ SEARCH_QUERY = """
51
48
  SELECT snippet_id, embedding <=> :query as score
52
49
  FROM {TABLE_NAME}
53
50
  ORDER BY score ASC
54
51
  LIMIT :top_k;
55
- """ # noqa: S608
52
+ """
56
53
 
57
54
 
58
55
  class VectorChordVectorSearchService(VectorSearchService):
@@ -60,6 +57,7 @@ class VectorChordVectorSearchService(VectorSearchService):
60
57
 
61
58
  def __init__(
62
59
  self,
60
+ task_name: str,
63
61
  session: AsyncSession,
64
62
  embedding_provider: EmbeddingProvider,
65
63
  ) -> None:
@@ -67,6 +65,8 @@ class VectorChordVectorSearchService(VectorSearchService):
67
65
  self.embedding_provider = embedding_provider
68
66
  self._session = session
69
67
  self._initialized = False
68
+ self.table_name = f"vectorchord_{task_name}_embeddings"
69
+ self.index_name = f"{self.table_name}_idx"
70
70
 
71
71
  async def _initialize(self) -> None:
72
72
  """Initialize the VectorChord environment."""
@@ -88,15 +88,23 @@ class VectorChordVectorSearchService(VectorSearchService):
88
88
  vector_dim = (await self.embedding_provider.embed(["dimension"]))[0]
89
89
  await self._session.execute(
90
90
  text(
91
- f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
91
+ f"""CREATE TABLE IF NOT EXISTS {self.table_name} (
92
92
  id SERIAL PRIMARY KEY,
93
93
  snippet_id INT NOT NULL UNIQUE,
94
94
  embedding VECTOR({len(vector_dim)}) NOT NULL
95
95
  );"""
96
96
  )
97
97
  )
98
- await self._session.execute(text(CREATE_VCHORD_INDEX))
99
- result = await self._session.execute(text(CHECK_VCHORD_EMBEDDING_DIMENSION))
98
+ await self._session.execute(
99
+ text(
100
+ CREATE_VCHORD_INDEX.format(
101
+ TABLE_NAME=self.table_name, INDEX_NAME=self.index_name
102
+ )
103
+ )
104
+ )
105
+ result = await self._session.execute(
106
+ text(CHECK_VCHORD_EMBEDDING_DIMENSION.format(TABLE_NAME=self.table_name))
107
+ )
100
108
  vector_dim_from_db = result.scalar_one()
101
109
  if vector_dim_from_db != len(vector_dim):
102
110
  msg = (
@@ -123,7 +131,7 @@ class VectorChordVectorSearchService(VectorSearchService):
123
131
  embeddings = await self.embedding_provider.embed([doc.text for doc in data])
124
132
  # Execute inserts
125
133
  await self._execute(
126
- text(INSERT_QUERY),
134
+ text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
127
135
  [
128
136
  {"snippet_id": doc.snippet_id, "embedding": str(embedding)}
129
137
  for doc, embedding in zip(data, embeddings, strict=True)
@@ -135,7 +143,8 @@ class VectorChordVectorSearchService(VectorSearchService):
135
143
  """Query the embedding model."""
136
144
  embedding = await self.embedding_provider.embed([query])
137
145
  result = await self._execute(
138
- text(SEARCH_QUERY), {"query": str(embedding[0]), "top_k": top_k}
146
+ text(SEARCH_QUERY.format(TABLE_NAME=self.table_name)),
147
+ {"query": str(embedding[0]), "top_k": top_k},
139
148
  )
140
149
  rows = result.mappings().all()
141
150
 
@@ -0,0 +1 @@
1
+ """Enrichment."""
@@ -0,0 +1,23 @@
1
+ """Embedding service."""
2
+
3
+ from kodit.config import AppContext
4
+ from kodit.enrichment.enrichment_provider.local_enrichment_provider import (
5
+ LocalEnrichmentProvider,
6
+ )
7
+ from kodit.enrichment.enrichment_provider.openai_enrichment_provider import (
8
+ OpenAIEnrichmentProvider,
9
+ )
10
+ from kodit.enrichment.enrichment_service import (
11
+ EnrichmentService,
12
+ LLMEnrichmentService,
13
+ )
14
+
15
+
16
+ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
17
+ """Create an embedding service."""
18
+ openai_client = app_context.get_default_openai_client()
19
+ if openai_client is not None:
20
+ enrichment_provider = OpenAIEnrichmentProvider(openai_client=openai_client)
21
+ return LLMEnrichmentService(enrichment_provider)
22
+
23
+ return LLMEnrichmentService(LocalEnrichmentProvider())
@@ -0,0 +1 @@
1
+ """Enrichment provider."""
@@ -0,0 +1,16 @@
1
+ """Enrichment provider."""
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ ENRICHMENT_SYSTEM_PROMPT = """
6
+ You are a professional software developer. You will be given a snippet of code.
7
+ Please provide a concise explanation of the code.
8
+ """
9
+
10
+
11
+ class EnrichmentProvider(ABC):
12
+ """Enrichment provider."""
13
+
14
+ @abstractmethod
15
+ async def enrich(self, data: list[str]) -> list[str]:
16
+ """Enrich a list of strings."""