kodit 0.1.9__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- {kodit-0.1.9 → kodit-0.1.10}/.gitignore +1 -0
- {kodit-0.1.9 → kodit-0.1.10}/PKG-INFO +3 -1
- {kodit-0.1.9 → kodit-0.1.10}/alembic.ini +2 -2
- {kodit-0.1.9 → kodit-0.1.10}/docs/_index.md +10 -7
- {kodit-0.1.9 → kodit-0.1.10}/docs/developer/index.md +3 -2
- {kodit-0.1.9 → kodit-0.1.10}/pyproject.toml +2 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/_version.py +2 -2
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/bm25/bm25.py +1 -1
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/cli.py +101 -9
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/config.py +2 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/database.py +2 -2
- kodit-0.1.10/src/kodit/embedding/__init__.py +1 -0
- kodit-0.1.10/src/kodit/embedding/embedding.py +52 -0
- kodit-0.1.10/src/kodit/embedding/models.py +28 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/indexing/repository.py +11 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/indexing/service.py +24 -3
- kodit-0.1.9/src/kodit/logging.py → kodit-0.1.10/src/kodit/log.py +7 -1
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/mcp.py +3 -9
- {kodit-0.1.9/src/kodit/alembic → kodit-0.1.10/src/kodit/migrations}/env.py +1 -0
- kodit-0.1.10/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +47 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/retreival/repository.py +81 -6
- kodit-0.1.10/src/kodit/retreival/service.py +124 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/sources/service.py +2 -2
- kodit-0.1.10/tests/experiments/embedding.py +89 -0
- kodit-0.1.10/tests/kodit/embedding/embedding_test.py +9 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/indexing/test_service.py +7 -1
- kodit-0.1.10/tests/kodit/retreival/repository_test.py +57 -0
- kodit-0.1.10/tests/kodit/retreival/test_service.py +271 -0
- kodit-0.1.10/tests/kodit/snippets/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/smoke.sh +4 -2
- {kodit-0.1.9 → kodit-0.1.10}/uv.lock +499 -0
- kodit-0.1.9/src/kodit/retreival/service.py +0 -69
- kodit-0.1.9/tests/kodit/retreival/test_service.py +0 -107
- {kodit-0.1.9 → kodit-0.1.10}/.cursor/rules/kodit.mdc +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/CODE_OF_CONDUCT.md +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/CONTRIBUTING.md +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/workflows/docker.yaml +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/workflows/docs.yaml +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/workflows/pypi-test.yaml +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/workflows/pypi.yaml +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.github/workflows/test.yaml +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.python-version +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.vscode/launch.json +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/.vscode/settings.json +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/Dockerfile +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/LICENSE +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/README.md +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/.gitignore +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/app.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/bm25/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/indexing/models.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/middleware.py +0 -0
- {kodit-0.1.9/src/kodit/alembic → kodit-0.1.10/src/kodit/migrations}/README +0 -0
- {kodit-0.1.9/src/kodit/alembic → kodit-0.1.10/src/kodit/migrations}/__init__.py +0 -0
- {kodit-0.1.9/src/kodit/alembic → kodit-0.1.10/src/kodit/migrations}/script.py.mako +0 -0
- {kodit-0.1.9/src/kodit/alembic → kodit-0.1.10/src/kodit/migrations}/versions/85155663351e_initial.py +0 -0
- {kodit-0.1.9/src/kodit/alembic → kodit-0.1.10/src/kodit/migrations}/versions/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/retreival/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/snippets/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/snippets/languages/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/snippets/languages/csharp.scm +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/snippets/languages/python.scm +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/snippets/method_snippets.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/snippets/snippets.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/sources/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/sources/models.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/src/kodit/sources/repository.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/conftest.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/cli_test.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/e2e.py +0 -0
- {kodit-0.1.9/tests/kodit/snippets → kodit-0.1.10/tests/kodit/embedding}/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/indexing/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/mcp_test.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/retreival/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/snippets/csharp.cs +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/snippets/detect_language_test.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/snippets/method_extraction_test.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/snippets/python.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/sources/__init__.py +0 -0
- {kodit-0.1.9 → kodit-0.1.10}/tests/kodit/sources/test_service.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kodit
|
|
3
|
-
Version: 0.1.9
|
|
3
|
+
Version: 0.1.10
|
|
4
4
|
Summary: Code indexing for better AI code generation
|
|
5
5
|
Project-URL: Homepage, https://docs.helixml.tech/kodit/
|
|
6
6
|
Project-URL: Documentation, https://docs.helixml.tech/kodit/
|
|
@@ -29,11 +29,13 @@ Requires-Dist: dotenv>=0.9.9
|
|
|
29
29
|
Requires-Dist: fastapi[standard]>=0.115.12
|
|
30
30
|
Requires-Dist: fastmcp>=2.3.3
|
|
31
31
|
Requires-Dist: gitpython>=3.1.44
|
|
32
|
+
Requires-Dist: hf-xet>=1.1.2
|
|
32
33
|
Requires-Dist: httpx-retries>=0.3.2
|
|
33
34
|
Requires-Dist: httpx>=0.28.1
|
|
34
35
|
Requires-Dist: posthog>=4.0.1
|
|
35
36
|
Requires-Dist: pydantic-settings>=2.9.1
|
|
36
37
|
Requires-Dist: pytable-formatter>=0.1.1
|
|
38
|
+
Requires-Dist: sentence-transformers>=4.1.0
|
|
37
39
|
Requires-Dist: sqlalchemy[asyncio]>=2.0.40
|
|
38
40
|
Requires-Dist: structlog>=25.3.0
|
|
39
41
|
Requires-Dist: tdqm>=0.0.1
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
[alembic]
|
|
4
4
|
# path to migration scripts
|
|
5
5
|
# Use forward slashes (/) also on windows to provide an os agnostic path
|
|
6
|
-
script_location = src/kodit/alembic
|
|
6
|
+
script_location = src/kodit/migrations
|
|
7
7
|
|
|
8
8
|
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
|
|
9
9
|
# Uncomment the line below if you want the files to be prepended with date and time
|
|
@@ -63,7 +63,7 @@ version_path_separator = os
|
|
|
63
63
|
# are written from script.py.mako
|
|
64
64
|
# output_encoding = utf-8
|
|
65
65
|
|
|
66
|
-
sqlalchemy.url = sqlite+aiosqlite:///%(here)s/.kodit
|
|
66
|
+
sqlalchemy.url = sqlite+aiosqlite:///%(here)s/.kodit.db
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
[post_write_hooks]
|
|
@@ -57,13 +57,16 @@ pip install kodit
|
|
|
57
57
|
Kodit has two key parts. A configuration CLI to manage what gets indexed and an MCP
|
|
58
58
|
server to expose your code to an AI coding assistant.
|
|
59
59
|
|
|
60
|
-
1. Index a
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
60
|
+
1. Index a source:
|
|
61
|
+
1. a local path: `kodit index /path/to/your/code`
|
|
62
|
+
2. or index a public git repository: `kodit index https://github.com/pydantic/pydantic-ai`
|
|
63
|
+
2. Manually search your index:
|
|
64
|
+
1. with a keyword: `kodit search keyword "test"`
|
|
65
|
+
2. or with code: `kodit search code "def main()"`
|
|
66
|
+
3. or via hybrid search: `kodit search hybrid --keywords "main" --code "def main()"`
|
|
67
|
+
3. Start an MCP server: `kodit serve`
|
|
68
|
+
|
|
69
|
+
Now add the Kodit MCP server to your AI coding assistant.
|
|
67
70
|
|
|
68
71
|
### Integration with Cursor
|
|
69
72
|
|
|
@@ -13,8 +13,9 @@ All database operations are handled by SQLAlchemy and Alembic.
|
|
|
13
13
|
|
|
14
14
|
1. Make changes to your models
|
|
15
15
|
2. Ensure the model is referenced in [alembic's env.py](src/kodit/migrations/env.py)
|
|
16
|
-
3. Run `alembic
|
|
17
|
-
4.
|
|
16
|
+
3. Run `alembic upgrade head` to create a temporary DB to compute the upgrade
|
|
17
|
+
4. Run `alembic revision --autogenerate -m "your message"`
|
|
18
|
+
5. The new migration will be applied when you next run a kodit command
|
|
18
19
|
|
|
19
20
|
## Releasing
|
|
20
21
|
|
|
@@ -38,7 +38,7 @@ class BM25Service:
|
|
|
38
38
|
self.log.debug("Indexing corpus")
|
|
39
39
|
vocab = self._tokenize(corpus)
|
|
40
40
|
self.retriever = bm25s.BM25()
|
|
41
|
-
self.retriever.index(vocab)
|
|
41
|
+
self.retriever.index(vocab, show_progress=False)
|
|
42
42
|
self.retriever.save(self.index_path)
|
|
43
43
|
|
|
44
44
|
def retrieve(
|
|
@@ -15,6 +15,7 @@ from kodit.config import (
|
|
|
15
15
|
DEFAULT_BASE_DIR,
|
|
16
16
|
DEFAULT_DB_URL,
|
|
17
17
|
DEFAULT_DISABLE_TELEMETRY,
|
|
18
|
+
DEFAULT_EMBEDDING_MODEL_NAME,
|
|
18
19
|
DEFAULT_LOG_FORMAT,
|
|
19
20
|
DEFAULT_LOG_LEVEL,
|
|
20
21
|
AppContext,
|
|
@@ -23,7 +24,7 @@ from kodit.config import (
|
|
|
23
24
|
)
|
|
24
25
|
from kodit.indexing.repository import IndexRepository
|
|
25
26
|
from kodit.indexing.service import IndexService
|
|
26
|
-
from kodit.
|
|
27
|
+
from kodit.log import configure_logging, configure_telemetry, log_event
|
|
27
28
|
from kodit.retreival.repository import RetrievalRepository
|
|
28
29
|
from kodit.retreival.service import RetrievalRequest, RetrievalService
|
|
29
30
|
from kodit.sources.repository import SourceRepository
|
|
@@ -97,7 +98,12 @@ async def index(
|
|
|
97
98
|
source_repository = SourceRepository(session)
|
|
98
99
|
source_service = SourceService(app_context.get_clone_dir(), source_repository)
|
|
99
100
|
repository = IndexRepository(session)
|
|
100
|
-
service = IndexService(
|
|
101
|
+
service = IndexService(
|
|
102
|
+
repository,
|
|
103
|
+
source_service,
|
|
104
|
+
app_context.get_data_dir(),
|
|
105
|
+
embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
|
|
106
|
+
)
|
|
101
107
|
|
|
102
108
|
if not sources:
|
|
103
109
|
# No source specified, list all indexes
|
|
@@ -133,20 +139,106 @@ async def index(
|
|
|
133
139
|
await service.run(index.id)
|
|
134
140
|
|
|
135
141
|
|
|
136
|
-
@cli.
|
|
142
|
+
@cli.group()
|
|
143
|
+
def search() -> None:
|
|
144
|
+
"""Search for snippets in the database."""
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@search.command()
|
|
137
148
|
@click.argument("query")
|
|
138
149
|
@click.option("--top-k", default=10, help="Number of snippets to retrieve")
|
|
139
150
|
@with_app_context
|
|
140
151
|
@with_session
|
|
141
|
-
async def
|
|
142
|
-
session: AsyncSession,
|
|
152
|
+
async def code(
|
|
153
|
+
session: AsyncSession,
|
|
154
|
+
app_context: AppContext,
|
|
155
|
+
query: str,
|
|
156
|
+
top_k: int,
|
|
157
|
+
) -> None:
|
|
158
|
+
"""Search for snippets using semantic code search.
|
|
159
|
+
|
|
160
|
+
This works best if your query is code.
|
|
161
|
+
"""
|
|
162
|
+
repository = RetrievalRepository(session)
|
|
163
|
+
service = RetrievalService(
|
|
164
|
+
repository,
|
|
165
|
+
app_context.get_data_dir(),
|
|
166
|
+
embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
snippets = await service.retrieve(RetrievalRequest(code_query=query, top_k=top_k))
|
|
170
|
+
|
|
171
|
+
if len(snippets) == 0:
|
|
172
|
+
click.echo("No snippets found")
|
|
173
|
+
return
|
|
174
|
+
|
|
175
|
+
for snippet in snippets:
|
|
176
|
+
click.echo("-" * 80)
|
|
177
|
+
click.echo(f"{snippet.uri}")
|
|
178
|
+
click.echo(snippet.content)
|
|
179
|
+
click.echo("-" * 80)
|
|
180
|
+
click.echo()
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@search.command()
|
|
184
|
+
@click.argument("keywords", nargs=-1)
|
|
185
|
+
@click.option("--top-k", default=10, help="Number of snippets to retrieve")
|
|
186
|
+
@with_app_context
|
|
187
|
+
@with_session
|
|
188
|
+
async def keyword(
|
|
189
|
+
session: AsyncSession,
|
|
190
|
+
app_context: AppContext,
|
|
191
|
+
keywords: list[str],
|
|
192
|
+
top_k: int,
|
|
143
193
|
) -> None:
|
|
144
|
-
"""
|
|
194
|
+
"""Search for snippets using keyword search."""
|
|
145
195
|
repository = RetrievalRepository(session)
|
|
146
|
-
service = RetrievalService(
|
|
147
|
-
|
|
196
|
+
service = RetrievalService(
|
|
197
|
+
repository,
|
|
198
|
+
app_context.get_data_dir(),
|
|
199
|
+
embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
snippets = await service.retrieve(RetrievalRequest(keywords=keywords, top_k=top_k))
|
|
203
|
+
|
|
204
|
+
if len(snippets) == 0:
|
|
205
|
+
click.echo("No snippets found")
|
|
206
|
+
return
|
|
207
|
+
|
|
208
|
+
for snippet in snippets:
|
|
209
|
+
click.echo("-" * 80)
|
|
210
|
+
click.echo(f"{snippet.uri}")
|
|
211
|
+
click.echo(snippet.content)
|
|
212
|
+
click.echo("-" * 80)
|
|
213
|
+
click.echo()
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@search.command()
|
|
217
|
+
@click.option("--top-k", default=10, help="Number of snippets to retrieve")
|
|
218
|
+
@click.option("--keywords", required=True, help="Comma separated list of keywords")
|
|
219
|
+
@click.option("--code", required=True, help="Semantic code search query")
|
|
220
|
+
@with_app_context
|
|
221
|
+
@with_session
|
|
222
|
+
async def hybrid(
|
|
223
|
+
session: AsyncSession,
|
|
224
|
+
app_context: AppContext,
|
|
225
|
+
top_k: int,
|
|
226
|
+
keywords: str,
|
|
227
|
+
code: str,
|
|
228
|
+
) -> None:
|
|
229
|
+
"""Search for snippets using hybrid search."""
|
|
230
|
+
repository = RetrievalRepository(session)
|
|
231
|
+
service = RetrievalService(
|
|
232
|
+
repository,
|
|
233
|
+
app_context.get_data_dir(),
|
|
234
|
+
embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
# Parse keywords into a list of strings
|
|
238
|
+
keywords_list = [k.strip().lower() for k in keywords.split(",")]
|
|
239
|
+
|
|
148
240
|
snippets = await service.retrieve(
|
|
149
|
-
RetrievalRequest(keywords=
|
|
241
|
+
RetrievalRequest(keywords=keywords_list, code_query=code, top_k=top_k)
|
|
150
242
|
)
|
|
151
243
|
|
|
152
244
|
if len(snippets) == 0:
|
|
@@ -11,12 +11,14 @@ from pydantic import Field
|
|
|
11
11
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
12
12
|
|
|
13
13
|
from kodit.database import Database
|
|
14
|
+
from kodit.embedding.embedding import TINY
|
|
14
15
|
|
|
15
16
|
DEFAULT_BASE_DIR = Path.home() / ".kodit"
|
|
16
17
|
DEFAULT_DB_URL = f"sqlite+aiosqlite:///{DEFAULT_BASE_DIR}/kodit.db"
|
|
17
18
|
DEFAULT_LOG_LEVEL = "INFO"
|
|
18
19
|
DEFAULT_LOG_FORMAT = "pretty"
|
|
19
20
|
DEFAULT_DISABLE_TELEMETRY = False
|
|
21
|
+
DEFAULT_EMBEDDING_MODEL_NAME = TINY
|
|
20
22
|
T = TypeVar("T")
|
|
21
23
|
|
|
22
24
|
|
|
@@ -15,7 +15,7 @@ from sqlalchemy.ext.asyncio import (
|
|
|
15
15
|
)
|
|
16
16
|
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
|
|
17
17
|
|
|
18
|
-
from kodit import
|
|
18
|
+
from kodit import migrations
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class Base(AsyncAttrs, DeclarativeBase):
|
|
@@ -57,7 +57,7 @@ class Database:
|
|
|
57
57
|
# Create Alembic configuration and run migrations
|
|
58
58
|
alembic_cfg = AlembicConfig()
|
|
59
59
|
alembic_cfg.set_main_option(
|
|
60
|
-
"script_location", str(Path(
|
|
60
|
+
"script_location", str(Path(migrations.__file__).parent)
|
|
61
61
|
)
|
|
62
62
|
alembic_cfg.set_main_option("sqlalchemy.url", db_url)
|
|
63
63
|
self.log.debug("Running migrations", db_url=db_url)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Embedding module."""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Embedding service."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Generator
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
from sentence_transformers import SentenceTransformer
|
|
8
|
+
|
|
9
|
+
TINY = "tiny"
|
|
10
|
+
CODE = "code"
|
|
11
|
+
TEST = "test"
|
|
12
|
+
|
|
13
|
+
COMMON_EMBEDDING_MODELS = {
|
|
14
|
+
TINY: "ibm-granite/granite-embedding-30m-english",
|
|
15
|
+
CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
|
|
16
|
+
TEST: "minishlab/potion-base-4M",
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EmbeddingService:
|
|
21
|
+
"""Service for embeddings."""
|
|
22
|
+
|
|
23
|
+
def __init__(self, model_name: str) -> None:
|
|
24
|
+
"""Initialize the embedding service."""
|
|
25
|
+
self.log = structlog.get_logger(__name__)
|
|
26
|
+
self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
|
|
27
|
+
self.embedding_model = None
|
|
28
|
+
|
|
29
|
+
def _model(self) -> SentenceTransformer:
|
|
30
|
+
"""Get the embedding model."""
|
|
31
|
+
if self.embedding_model is None:
|
|
32
|
+
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
|
|
33
|
+
self.embedding_model = SentenceTransformer(
|
|
34
|
+
self.model_name,
|
|
35
|
+
trust_remote_code=True,
|
|
36
|
+
device="cpu", # Force CPU so we don't have to install accelerate, etc.
|
|
37
|
+
)
|
|
38
|
+
return self.embedding_model
|
|
39
|
+
|
|
40
|
+
def embed(self, snippets: list[str]) -> Generator[list[float], None, None]:
|
|
41
|
+
"""Embed a list of documents."""
|
|
42
|
+
model = self._model()
|
|
43
|
+
embeddings = model.encode(snippets, show_progress_bar=False, batch_size=4)
|
|
44
|
+
for embedding in embeddings:
|
|
45
|
+
yield [float(x) for x in embedding]
|
|
46
|
+
|
|
47
|
+
def query(self, query: list[str]) -> Generator[list[float], None, None]:
|
|
48
|
+
"""Query the embedding model."""
|
|
49
|
+
model = self._model()
|
|
50
|
+
embeddings = model.encode(query, show_progress_bar=False, batch_size=4)
|
|
51
|
+
for embedding in embeddings:
|
|
52
|
+
yield [float(x) for x in embedding]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Embedding models."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import JSON, ForeignKey
|
|
6
|
+
from sqlalchemy import Enum as SQLAlchemyEnum
|
|
7
|
+
from sqlalchemy.orm import Mapped, mapped_column
|
|
8
|
+
|
|
9
|
+
from kodit.database import Base, CommonMixin
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EmbeddingType(Enum):
|
|
13
|
+
"""Embedding type."""
|
|
14
|
+
|
|
15
|
+
CODE = 1
|
|
16
|
+
TEXT = 2
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Embedding(Base, CommonMixin):
|
|
20
|
+
"""Embedding model."""
|
|
21
|
+
|
|
22
|
+
__tablename__ = "embeddings"
|
|
23
|
+
|
|
24
|
+
snippet_id: Mapped[int] = mapped_column(ForeignKey("snippets.id"), index=True)
|
|
25
|
+
type: Mapped[EmbeddingType] = mapped_column(
|
|
26
|
+
SQLAlchemyEnum(EmbeddingType), index=True
|
|
27
|
+
)
|
|
28
|
+
embedding: Mapped[list[float]] = mapped_column(JSON)
|
|
@@ -11,6 +11,7 @@ from typing import TypeVar
|
|
|
11
11
|
from sqlalchemy import delete, func, select
|
|
12
12
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
13
13
|
|
|
14
|
+
from kodit.embedding.models import Embedding
|
|
14
15
|
from kodit.indexing.models import Index, Snippet
|
|
15
16
|
from kodit.sources.models import File, Source
|
|
16
17
|
|
|
@@ -165,3 +166,13 @@ class IndexRepository:
|
|
|
165
166
|
query = select(Snippet).order_by(Snippet.id)
|
|
166
167
|
result = await self.session.execute(query)
|
|
167
168
|
return list(result.scalars())
|
|
169
|
+
|
|
170
|
+
async def add_embedding(self, embedding: Embedding) -> None:
|
|
171
|
+
"""Add a new embedding to the database.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
embedding: The Embedding instance to add.
|
|
175
|
+
|
|
176
|
+
"""
|
|
177
|
+
self.session.add(embedding)
|
|
178
|
+
await self.session.commit()
|
|
@@ -14,6 +14,8 @@ import structlog
|
|
|
14
14
|
from tqdm.asyncio import tqdm
|
|
15
15
|
|
|
16
16
|
from kodit.bm25.bm25 import BM25Service
|
|
17
|
+
from kodit.embedding.embedding import EmbeddingService
|
|
18
|
+
from kodit.embedding.models import Embedding, EmbeddingType
|
|
17
19
|
from kodit.indexing.models import Snippet
|
|
18
20
|
from kodit.indexing.repository import IndexRepository
|
|
19
21
|
from kodit.snippets.snippets import SnippetService
|
|
@@ -50,6 +52,7 @@ class IndexService:
|
|
|
50
52
|
repository: IndexRepository,
|
|
51
53
|
source_service: SourceService,
|
|
52
54
|
data_dir: Path,
|
|
55
|
+
embedding_model_name: str,
|
|
53
56
|
) -> None:
|
|
54
57
|
"""Initialize the index service.
|
|
55
58
|
|
|
@@ -63,6 +66,7 @@ class IndexService:
|
|
|
63
66
|
self.snippet_service = SnippetService()
|
|
64
67
|
self.log = structlog.get_logger(__name__)
|
|
65
68
|
self.bm25 = BM25Service(data_dir)
|
|
69
|
+
self.code_embedding_service = EmbeddingService(model_name=embedding_model_name)
|
|
66
70
|
|
|
67
71
|
async def create(self, source_id: int) -> IndexView:
|
|
68
72
|
"""Create a new index for a source.
|
|
@@ -128,9 +132,26 @@ class IndexService:
|
|
|
128
132
|
# Create snippets for supported file types
|
|
129
133
|
await self._create_snippets(index_id)
|
|
130
134
|
|
|
131
|
-
# Update BM25 index
|
|
132
135
|
snippets = await self.repository.get_all_snippets()
|
|
133
|
-
|
|
136
|
+
|
|
137
|
+
self.log.info("Creating keyword index")
|
|
138
|
+
self.bm25.index(
|
|
139
|
+
[
|
|
140
|
+
snippet.content
|
|
141
|
+
for snippet in tqdm(snippets, total=len(snippets), leave=False)
|
|
142
|
+
]
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
self.log.info("Creating semantic code index")
|
|
146
|
+
for snippet in tqdm(snippets, total=len(snippets), leave=False):
|
|
147
|
+
embedding = next(self.code_embedding_service.embed([snippet.content]))
|
|
148
|
+
await self.repository.add_embedding(
|
|
149
|
+
Embedding(
|
|
150
|
+
snippet_id=snippet.id,
|
|
151
|
+
embedding=embedding,
|
|
152
|
+
type=EmbeddingType.CODE,
|
|
153
|
+
)
|
|
154
|
+
)
|
|
134
155
|
|
|
135
156
|
# Update index timestamp
|
|
136
157
|
await self.repository.update_index_timestamp(index)
|
|
@@ -148,7 +169,7 @@ class IndexService:
|
|
|
148
169
|
|
|
149
170
|
"""
|
|
150
171
|
files = await self.repository.files_for_index(index_id)
|
|
151
|
-
for file in tqdm(files, total=len(files)):
|
|
172
|
+
for file in tqdm(files, total=len(files), leave=False):
|
|
152
173
|
# Skip unsupported file types
|
|
153
174
|
if file.mime_type in MIME_BLACKLIST:
|
|
154
175
|
self.log.debug("Skipping mime type", mime_type=file.mime_type)
|
|
@@ -87,7 +87,13 @@ def configure_logging(app_context: AppContext) -> None:
|
|
|
87
87
|
# Configure uvicorn loggers to use our structlog setup
|
|
88
88
|
# Uvicorn spits out loads of exception logs when sse server doesn't shut down
|
|
89
89
|
# gracefully, so we hide them unless in DEBUG mode
|
|
90
|
-
for _log in [
|
|
90
|
+
for _log in [
|
|
91
|
+
"uvicorn",
|
|
92
|
+
"uvicorn.error",
|
|
93
|
+
"uvicorn.access",
|
|
94
|
+
"bm25s",
|
|
95
|
+
"sentence_transformers.SentenceTransformer",
|
|
96
|
+
]:
|
|
91
97
|
if root_logger.getEffectiveLevel() == logging.DEBUG:
|
|
92
98
|
logging.getLogger(_log).handlers.clear()
|
|
93
99
|
logging.getLogger(_log).propagate = True
|
|
@@ -12,7 +12,7 @@ from pydantic import Field
|
|
|
12
12
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
13
13
|
|
|
14
14
|
from kodit._version import version
|
|
15
|
-
from kodit.config import AppContext
|
|
15
|
+
from kodit.config import DEFAULT_EMBEDDING_MODEL_NAME, AppContext
|
|
16
16
|
from kodit.database import Database
|
|
17
17
|
from kodit.retreival.repository import RetrievalRepository, RetrievalResult
|
|
18
18
|
from kodit.retreival.service import RetrievalRequest, RetrievalService
|
|
@@ -115,18 +115,12 @@ async def retrieve_relevant_snippets(
|
|
|
115
115
|
retrieval_service = RetrievalService(
|
|
116
116
|
repository=retrieval_repository,
|
|
117
117
|
data_dir=mcp_context.data_dir,
|
|
118
|
+
embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
|
|
118
119
|
)
|
|
119
120
|
|
|
120
|
-
log.debug("Fusing input")
|
|
121
|
-
input_query = input_fusion(
|
|
122
|
-
user_intent=user_intent,
|
|
123
|
-
related_file_paths=related_file_paths,
|
|
124
|
-
related_file_contents=related_file_contents,
|
|
125
|
-
keywords=keywords,
|
|
126
|
-
)
|
|
127
|
-
log.debug("Input", input_query=input_query)
|
|
128
121
|
retrieval_request = RetrievalRequest(
|
|
129
122
|
keywords=keywords,
|
|
123
|
+
code_query="\n".join(related_file_contents),
|
|
130
124
|
)
|
|
131
125
|
log.debug("Retrieving snippets")
|
|
132
126
|
snippets = await retrieval_service.retrieve(request=retrieval_request)
|
|
@@ -8,6 +8,7 @@ from sqlalchemy import pool
|
|
|
8
8
|
from sqlalchemy.engine import Connection
|
|
9
9
|
from sqlalchemy.ext.asyncio import async_engine_from_config
|
|
10
10
|
|
|
11
|
+
import kodit.embedding.models
|
|
11
12
|
import kodit.indexing.models
|
|
12
13
|
import kodit.sources.models
|
|
13
14
|
from kodit.database import Base
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# ruff: noqa
|
|
2
|
+
"""add embeddings table
|
|
3
|
+
|
|
4
|
+
Revision ID: 7c3bbc2ab32b
|
|
5
|
+
Revises: 85155663351e
|
|
6
|
+
Create Date: 2025-05-23 17:23:09.924980
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Sequence, Union
|
|
11
|
+
|
|
12
|
+
from alembic import op
|
|
13
|
+
import sqlalchemy as sa
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = '7c3bbc2ab32b'
|
|
18
|
+
down_revision: Union[str, None] = '85155663351e'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade() -> None:
|
|
24
|
+
"""Upgrade schema."""
|
|
25
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
26
|
+
op.create_table('embeddings',
|
|
27
|
+
sa.Column('snippet_id', sa.Integer(), nullable=False),
|
|
28
|
+
sa.Column('type', sa.Enum('CODE', 'TEXT', name='embeddingtype'), nullable=False),
|
|
29
|
+
sa.Column('embedding', sa.JSON(), nullable=False),
|
|
30
|
+
sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
|
|
31
|
+
sa.Column('created_at', sa.DateTime(), nullable=False),
|
|
32
|
+
sa.Column('updated_at', sa.DateTime(), nullable=False),
|
|
33
|
+
sa.ForeignKeyConstraint(['snippet_id'], ['snippets.id'], ),
|
|
34
|
+
sa.PrimaryKeyConstraint('id')
|
|
35
|
+
)
|
|
36
|
+
op.create_index(op.f('ix_embeddings_snippet_id'), 'embeddings', ['snippet_id'], unique=False)
|
|
37
|
+
op.create_index(op.f('ix_embeddings_type'), 'embeddings', ['type'], unique=False)
|
|
38
|
+
# ### end Alembic commands ###
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def downgrade() -> None:
|
|
42
|
+
"""Downgrade schema."""
|
|
43
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
44
|
+
op.drop_index(op.f('ix_embeddings_type'), table_name='embeddings')
|
|
45
|
+
op.drop_index(op.f('ix_embeddings_snippet_id'), table_name='embeddings')
|
|
46
|
+
op.drop_table('embeddings')
|
|
47
|
+
# ### end Alembic commands ###
|