kodit 0.1.9__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (89) hide show
  1. {kodit-0.1.9 → kodit-0.1.11}/.gitignore +3 -0
  2. {kodit-0.1.9 → kodit-0.1.11}/PKG-INFO +3 -1
  3. {kodit-0.1.9 → kodit-0.1.11}/alembic.ini +2 -2
  4. {kodit-0.1.9 → kodit-0.1.11}/docs/_index.md +10 -7
  5. {kodit-0.1.9 → kodit-0.1.11}/docs/developer/index.md +3 -2
  6. {kodit-0.1.9 → kodit-0.1.11}/pyproject.toml +3 -0
  7. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/_version.py +2 -2
  8. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/bm25/bm25.py +1 -1
  9. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/cli.py +101 -9
  10. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/config.py +2 -0
  11. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/database.py +2 -2
  12. kodit-0.1.11/src/kodit/embedding/__init__.py +1 -0
  13. kodit-0.1.11/src/kodit/embedding/embedding.py +52 -0
  14. kodit-0.1.11/src/kodit/embedding/models.py +28 -0
  15. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/indexing/repository.py +11 -0
  16. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/indexing/service.py +24 -3
  17. kodit-0.1.9/src/kodit/logging.py → kodit-0.1.11/src/kodit/log.py +7 -1
  18. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/mcp.py +3 -9
  19. {kodit-0.1.9/src/kodit/alembic → kodit-0.1.11/src/kodit/migrations}/env.py +1 -0
  20. kodit-0.1.11/src/kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +47 -0
  21. kodit-0.1.11/src/kodit/retreival/repository.py +231 -0
  22. kodit-0.1.11/src/kodit/retreival/service.py +124 -0
  23. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/sources/service.py +2 -2
  24. kodit-0.1.11/tests/experiments/embedding.py +89 -0
  25. kodit-0.1.11/tests/kodit/embedding/embedding_test.py +9 -0
  26. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/indexing/test_service.py +7 -1
  27. kodit-0.1.11/tests/kodit/retreival/repository_test.py +57 -0
  28. kodit-0.1.11/tests/kodit/retreival/test_service.py +271 -0
  29. kodit-0.1.11/tests/kodit/snippets/__init__.py +0 -0
  30. kodit-0.1.11/tests/performance/similarity.py +139 -0
  31. {kodit-0.1.9 → kodit-0.1.11}/tests/smoke.sh +4 -2
  32. {kodit-0.1.9 → kodit-0.1.11}/uv.lock +513 -0
  33. kodit-0.1.9/src/kodit/retreival/repository.py +0 -108
  34. kodit-0.1.9/src/kodit/retreival/service.py +0 -69
  35. kodit-0.1.9/tests/kodit/retreival/test_service.py +0 -107
  36. {kodit-0.1.9 → kodit-0.1.11}/.cursor/rules/kodit.mdc +0 -0
  37. {kodit-0.1.9 → kodit-0.1.11}/.github/CODE_OF_CONDUCT.md +0 -0
  38. {kodit-0.1.9 → kodit-0.1.11}/.github/CONTRIBUTING.md +0 -0
  39. {kodit-0.1.9 → kodit-0.1.11}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  40. {kodit-0.1.9 → kodit-0.1.11}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  41. {kodit-0.1.9 → kodit-0.1.11}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  42. {kodit-0.1.9 → kodit-0.1.11}/.github/workflows/docker.yaml +0 -0
  43. {kodit-0.1.9 → kodit-0.1.11}/.github/workflows/docs.yaml +0 -0
  44. {kodit-0.1.9 → kodit-0.1.11}/.github/workflows/pypi-test.yaml +0 -0
  45. {kodit-0.1.9 → kodit-0.1.11}/.github/workflows/pypi.yaml +0 -0
  46. {kodit-0.1.9 → kodit-0.1.11}/.github/workflows/test.yaml +0 -0
  47. {kodit-0.1.9 → kodit-0.1.11}/.python-version +0 -0
  48. {kodit-0.1.9 → kodit-0.1.11}/.vscode/launch.json +0 -0
  49. {kodit-0.1.9 → kodit-0.1.11}/.vscode/settings.json +0 -0
  50. {kodit-0.1.9 → kodit-0.1.11}/Dockerfile +0 -0
  51. {kodit-0.1.9 → kodit-0.1.11}/LICENSE +0 -0
  52. {kodit-0.1.9 → kodit-0.1.11}/README.md +0 -0
  53. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/.gitignore +0 -0
  54. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/__init__.py +0 -0
  55. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/app.py +0 -0
  56. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/bm25/__init__.py +0 -0
  57. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/indexing/__init__.py +0 -0
  58. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/indexing/models.py +0 -0
  59. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/middleware.py +0 -0
  60. {kodit-0.1.9/src/kodit/alembic → kodit-0.1.11/src/kodit/migrations}/README +0 -0
  61. {kodit-0.1.9/src/kodit/alembic → kodit-0.1.11/src/kodit/migrations}/__init__.py +0 -0
  62. {kodit-0.1.9/src/kodit/alembic → kodit-0.1.11/src/kodit/migrations}/script.py.mako +0 -0
  63. {kodit-0.1.9/src/kodit/alembic → kodit-0.1.11/src/kodit/migrations}/versions/85155663351e_initial.py +0 -0
  64. {kodit-0.1.9/src/kodit/alembic → kodit-0.1.11/src/kodit/migrations}/versions/__init__.py +0 -0
  65. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/retreival/__init__.py +0 -0
  66. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/snippets/__init__.py +0 -0
  67. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/snippets/languages/__init__.py +0 -0
  68. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/snippets/languages/csharp.scm +0 -0
  69. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/snippets/languages/python.scm +0 -0
  70. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/snippets/method_snippets.py +0 -0
  71. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/snippets/snippets.py +0 -0
  72. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/sources/__init__.py +0 -0
  73. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/sources/models.py +0 -0
  74. {kodit-0.1.9 → kodit-0.1.11}/src/kodit/sources/repository.py +0 -0
  75. {kodit-0.1.9 → kodit-0.1.11}/tests/__init__.py +0 -0
  76. {kodit-0.1.9 → kodit-0.1.11}/tests/conftest.py +0 -0
  77. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/__init__.py +0 -0
  78. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/cli_test.py +0 -0
  79. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/e2e.py +0 -0
  80. {kodit-0.1.9/tests/kodit/snippets → kodit-0.1.11/tests/kodit/embedding}/__init__.py +0 -0
  81. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/indexing/__init__.py +0 -0
  82. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/mcp_test.py +0 -0
  83. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/retreival/__init__.py +0 -0
  84. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/snippets/csharp.cs +0 -0
  85. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/snippets/detect_language_test.py +0 -0
  86. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/snippets/method_extraction_test.py +0 -0
  87. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/snippets/python.py +0 -0
  88. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/sources/__init__.py +0 -0
  89. {kodit-0.1.9 → kodit-0.1.11}/tests/kodit/sources/test_service.py +0 -0
@@ -174,3 +174,6 @@ cython_debug/
174
174
  .pypirc
175
175
  .kodit/
176
176
  .DS_Store
177
+ .kodit.db
178
+ benchmark.db
179
+ profile.prof
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.1.9
3
+ Version: 0.1.11
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -29,11 +29,13 @@ Requires-Dist: dotenv>=0.9.9
29
29
  Requires-Dist: fastapi[standard]>=0.115.12
30
30
  Requires-Dist: fastmcp>=2.3.3
31
31
  Requires-Dist: gitpython>=3.1.44
32
+ Requires-Dist: hf-xet>=1.1.2
32
33
  Requires-Dist: httpx-retries>=0.3.2
33
34
  Requires-Dist: httpx>=0.28.1
34
35
  Requires-Dist: posthog>=4.0.1
35
36
  Requires-Dist: pydantic-settings>=2.9.1
36
37
  Requires-Dist: pytable-formatter>=0.1.1
38
+ Requires-Dist: sentence-transformers>=4.1.0
37
39
  Requires-Dist: sqlalchemy[asyncio]>=2.0.40
38
40
  Requires-Dist: structlog>=25.3.0
39
41
  Requires-Dist: tdqm>=0.0.1
@@ -3,7 +3,7 @@
3
3
  [alembic]
4
4
  # path to migration scripts
5
5
  # Use forward slashes (/) also on windows to provide an os agnostic path
6
- script_location = src/kodit/alembic
6
+ script_location = src/kodit/migrations
7
7
 
8
8
  # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
9
9
  # Uncomment the line below if you want the files to be prepended with date and time
@@ -63,7 +63,7 @@ version_path_separator = os
63
63
  # are written from script.py.mako
64
64
  # output_encoding = utf-8
65
65
 
66
- sqlalchemy.url = sqlite+aiosqlite:///%(here)s/.kodit/kodit.db
66
+ sqlalchemy.url = sqlite+aiosqlite:///%(here)s/.kodit.db
67
67
 
68
68
 
69
69
  [post_write_hooks]
@@ -57,13 +57,16 @@ pip install kodit
57
57
  Kodit has two key parts. A configuration CLI to manage what gets indexed and an MCP
58
58
  server to expose your code to an AI coding assistant.
59
59
 
60
- 1. Index a local path: `kodit index /path/to/your/code`
61
- 2. Or index a public git repository: `kodit index https://github.com/pydantic/pydantic-ai`
62
- 3. Test retrieval on your index: `kodit retrieve "test"`
63
- 4. Start an MCP server: `kodit serve`
64
-
65
- Now browse to your AI coding assistant and add the MCP server. You will also need to
66
- tell your assistant to use this server in coding tasks, otherwise it won't get called!
60
+ 1. Index a source:
61
+ 1. a local path: `kodit index /path/to/your/code`
62
+ 2. or a public git repository: `kodit index https://github.com/pydantic/pydantic-ai`
63
+ 2. Manually search your index:
64
+ 1. with a keyword: `kodit search keyword "test"`
65
+ 2. or with code: `kodit search code "def main()"`
66
+ 3. or via hybrid search: `kodit search hybrid --keywords "main" --code "def main()"`
67
+ 3. Start an MCP server: `kodit serve`
68
+
69
+ Now add the Kodit MCP server to your AI coding assistant.
67
70
 
68
71
  ### Integration with Cursor
69
72
 
@@ -13,8 +13,9 @@ All database operations are handled by SQLAlchemy and Alembic.
13
13
 
14
14
  1. Make changes to your models
15
15
  2. Ensure the model is referenced in [alembic's env.py](src/kodit/migrations/env.py)
16
- 3. Run `alembic revision --autogenerate -m "your message"`
17
- 4. The new migration will be applied when you next run a kodit command
16
+ 3. Run `alembic upgrade head` to create a temporary DB to compute the upgrade
17
+ 4. Run `alembic revision --autogenerate -m "your message"`
18
+ 5. The new migration will be applied when you next run a kodit command
18
19
 
19
20
  ## Releasing
20
21
 
@@ -44,6 +44,8 @@ dependencies = [
44
44
  "pydantic-settings>=2.9.1",
45
45
  "bm25s[core]>=0.2.12",
46
46
  "gitpython>=3.1.44",
47
+ "sentence-transformers>=4.1.0",
48
+ "hf-xet>=1.1.2",
47
49
  ]
48
50
 
49
51
  [dependency-groups]
@@ -54,6 +56,7 @@ dev = [
54
56
  "pytest>=8.3.5",
55
57
  "pytest-cov>=6.1.1",
56
58
  "ruff>=0.11.8",
59
+ "snakeviz>=2.2.2",
57
60
  ]
58
61
 
59
62
  [project.urls]
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.9'
21
- __version_tuple__ = version_tuple = (0, 1, 9)
20
+ __version__ = version = '0.1.11'
21
+ __version_tuple__ = version_tuple = (0, 1, 11)
@@ -38,7 +38,7 @@ class BM25Service:
38
38
  self.log.debug("Indexing corpus")
39
39
  vocab = self._tokenize(corpus)
40
40
  self.retriever = bm25s.BM25()
41
- self.retriever.index(vocab)
41
+ self.retriever.index(vocab, show_progress=False)
42
42
  self.retriever.save(self.index_path)
43
43
 
44
44
  def retrieve(
@@ -15,6 +15,7 @@ from kodit.config import (
15
15
  DEFAULT_BASE_DIR,
16
16
  DEFAULT_DB_URL,
17
17
  DEFAULT_DISABLE_TELEMETRY,
18
+ DEFAULT_EMBEDDING_MODEL_NAME,
18
19
  DEFAULT_LOG_FORMAT,
19
20
  DEFAULT_LOG_LEVEL,
20
21
  AppContext,
@@ -23,7 +24,7 @@ from kodit.config import (
23
24
  )
24
25
  from kodit.indexing.repository import IndexRepository
25
26
  from kodit.indexing.service import IndexService
26
- from kodit.logging import configure_logging, configure_telemetry, log_event
27
+ from kodit.log import configure_logging, configure_telemetry, log_event
27
28
  from kodit.retreival.repository import RetrievalRepository
28
29
  from kodit.retreival.service import RetrievalRequest, RetrievalService
29
30
  from kodit.sources.repository import SourceRepository
@@ -97,7 +98,12 @@ async def index(
97
98
  source_repository = SourceRepository(session)
98
99
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
99
100
  repository = IndexRepository(session)
100
- service = IndexService(repository, source_service, app_context.get_data_dir())
101
+ service = IndexService(
102
+ repository,
103
+ source_service,
104
+ app_context.get_data_dir(),
105
+ embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
106
+ )
101
107
 
102
108
  if not sources:
103
109
  # No source specified, list all indexes
@@ -133,20 +139,106 @@ async def index(
133
139
  await service.run(index.id)
134
140
 
135
141
 
136
- @cli.command()
142
+ @cli.group()
143
+ def search() -> None:
144
+ """Search for snippets in the database."""
145
+
146
+
147
+ @search.command()
137
148
  @click.argument("query")
138
149
  @click.option("--top-k", default=10, help="Number of snippets to retrieve")
139
150
  @with_app_context
140
151
  @with_session
141
- async def retrieve(
142
- session: AsyncSession, app_context: AppContext, query: str, top_k: int
152
+ async def code(
153
+ session: AsyncSession,
154
+ app_context: AppContext,
155
+ query: str,
156
+ top_k: int,
157
+ ) -> None:
158
+ """Search for snippets using semantic code search.
159
+
160
+ This works best if your query is code.
161
+ """
162
+ repository = RetrievalRepository(session)
163
+ service = RetrievalService(
164
+ repository,
165
+ app_context.get_data_dir(),
166
+ embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
167
+ )
168
+
169
+ snippets = await service.retrieve(RetrievalRequest(code_query=query, top_k=top_k))
170
+
171
+ if len(snippets) == 0:
172
+ click.echo("No snippets found")
173
+ return
174
+
175
+ for snippet in snippets:
176
+ click.echo("-" * 80)
177
+ click.echo(f"{snippet.uri}")
178
+ click.echo(snippet.content)
179
+ click.echo("-" * 80)
180
+ click.echo()
181
+
182
+
183
+ @search.command()
184
+ @click.argument("keywords", nargs=-1)
185
+ @click.option("--top-k", default=10, help="Number of snippets to retrieve")
186
+ @with_app_context
187
+ @with_session
188
+ async def keyword(
189
+ session: AsyncSession,
190
+ app_context: AppContext,
191
+ keywords: list[str],
192
+ top_k: int,
143
193
  ) -> None:
144
- """Retrieve snippets from the database."""
194
+ """Search for snippets using keyword search."""
145
195
  repository = RetrievalRepository(session)
146
- service = RetrievalService(repository, app_context.get_data_dir())
147
- # Temporary request while we don't have all search capabilities
196
+ service = RetrievalService(
197
+ repository,
198
+ app_context.get_data_dir(),
199
+ embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
200
+ )
201
+
202
+ snippets = await service.retrieve(RetrievalRequest(keywords=keywords, top_k=top_k))
203
+
204
+ if len(snippets) == 0:
205
+ click.echo("No snippets found")
206
+ return
207
+
208
+ for snippet in snippets:
209
+ click.echo("-" * 80)
210
+ click.echo(f"{snippet.uri}")
211
+ click.echo(snippet.content)
212
+ click.echo("-" * 80)
213
+ click.echo()
214
+
215
+
216
+ @search.command()
217
+ @click.option("--top-k", default=10, help="Number of snippets to retrieve")
218
+ @click.option("--keywords", required=True, help="Comma separated list of keywords")
219
+ @click.option("--code", required=True, help="Semantic code search query")
220
+ @with_app_context
221
+ @with_session
222
+ async def hybrid(
223
+ session: AsyncSession,
224
+ app_context: AppContext,
225
+ top_k: int,
226
+ keywords: str,
227
+ code: str,
228
+ ) -> None:
229
+ """Search for snippets using hybrid search."""
230
+ repository = RetrievalRepository(session)
231
+ service = RetrievalService(
232
+ repository,
233
+ app_context.get_data_dir(),
234
+ embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
235
+ )
236
+
237
+ # Parse keywords into a list of strings
238
+ keywords_list = [k.strip().lower() for k in keywords.split(",")]
239
+
148
240
  snippets = await service.retrieve(
149
- RetrievalRequest(keywords=query.split(","), top_k=top_k)
241
+ RetrievalRequest(keywords=keywords_list, code_query=code, top_k=top_k)
150
242
  )
151
243
 
152
244
  if len(snippets) == 0:
@@ -11,12 +11,14 @@ from pydantic import Field
11
11
  from pydantic_settings import BaseSettings, SettingsConfigDict
12
12
 
13
13
  from kodit.database import Database
14
+ from kodit.embedding.embedding import TINY
14
15
 
15
16
  DEFAULT_BASE_DIR = Path.home() / ".kodit"
16
17
  DEFAULT_DB_URL = f"sqlite+aiosqlite:///{DEFAULT_BASE_DIR}/kodit.db"
17
18
  DEFAULT_LOG_LEVEL = "INFO"
18
19
  DEFAULT_LOG_FORMAT = "pretty"
19
20
  DEFAULT_DISABLE_TELEMETRY = False
21
+ DEFAULT_EMBEDDING_MODEL_NAME = TINY
20
22
  T = TypeVar("T")
21
23
 
22
24
 
@@ -15,7 +15,7 @@ from sqlalchemy.ext.asyncio import (
15
15
  )
16
16
  from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
17
17
 
18
- from kodit import alembic
18
+ from kodit import migrations
19
19
 
20
20
 
21
21
  class Base(AsyncAttrs, DeclarativeBase):
@@ -57,7 +57,7 @@ class Database:
57
57
  # Create Alembic configuration and run migrations
58
58
  alembic_cfg = AlembicConfig()
59
59
  alembic_cfg.set_main_option(
60
- "script_location", str(Path(alembic.__file__).parent)
60
+ "script_location", str(Path(migrations.__file__).parent)
61
61
  )
62
62
  alembic_cfg.set_main_option("sqlalchemy.url", db_url)
63
63
  self.log.debug("Running migrations", db_url=db_url)
@@ -0,0 +1 @@
1
+ """Embedding module."""
@@ -0,0 +1,52 @@
1
+ """Embedding service."""
2
+
3
+ import os
4
+ from collections.abc import Generator
5
+
6
+ import structlog
7
+ from sentence_transformers import SentenceTransformer
8
+
9
+ TINY = "tiny"
10
+ CODE = "code"
11
+ TEST = "test"
12
+
13
+ COMMON_EMBEDDING_MODELS = {
14
+ TINY: "ibm-granite/granite-embedding-30m-english",
15
+ CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
16
+ TEST: "minishlab/potion-base-4M",
17
+ }
18
+
19
+
20
+ class EmbeddingService:
21
+ """Service for embeddings."""
22
+
23
+ def __init__(self, model_name: str) -> None:
24
+ """Initialize the embedding service."""
25
+ self.log = structlog.get_logger(__name__)
26
+ self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
27
+ self.embedding_model = None
28
+
29
+ def _model(self) -> SentenceTransformer:
30
+ """Get the embedding model."""
31
+ if self.embedding_model is None:
32
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings
33
+ self.embedding_model = SentenceTransformer(
34
+ self.model_name,
35
+ trust_remote_code=True,
36
+ device="cpu", # Force CPU so we don't have to install accelerate, etc.
37
+ )
38
+ return self.embedding_model
39
+
40
+ def embed(self, snippets: list[str]) -> Generator[list[float], None, None]:
41
+ """Embed a list of documents."""
42
+ model = self._model()
43
+ embeddings = model.encode(snippets, show_progress_bar=False, batch_size=4)
44
+ for embedding in embeddings:
45
+ yield [float(x) for x in embedding]
46
+
47
+ def query(self, query: list[str]) -> Generator[list[float], None, None]:
48
+ """Query the embedding model."""
49
+ model = self._model()
50
+ embeddings = model.encode(query, show_progress_bar=False, batch_size=4)
51
+ for embedding in embeddings:
52
+ yield [float(x) for x in embedding]
@@ -0,0 +1,28 @@
1
+ """Embedding models."""
2
+
3
+ from enum import Enum
4
+
5
+ from sqlalchemy import JSON, ForeignKey
6
+ from sqlalchemy import Enum as SQLAlchemyEnum
7
+ from sqlalchemy.orm import Mapped, mapped_column
8
+
9
+ from kodit.database import Base, CommonMixin
10
+
11
+
12
+ class EmbeddingType(Enum):
13
+ """Embedding type."""
14
+
15
+ CODE = 1
16
+ TEXT = 2
17
+
18
+
19
+ class Embedding(Base, CommonMixin):
20
+ """Embedding model."""
21
+
22
+ __tablename__ = "embeddings"
23
+
24
+ snippet_id: Mapped[int] = mapped_column(ForeignKey("snippets.id"), index=True)
25
+ type: Mapped[EmbeddingType] = mapped_column(
26
+ SQLAlchemyEnum(EmbeddingType), index=True
27
+ )
28
+ embedding: Mapped[list[float]] = mapped_column(JSON)
@@ -11,6 +11,7 @@ from typing import TypeVar
11
11
  from sqlalchemy import delete, func, select
12
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
13
 
14
+ from kodit.embedding.models import Embedding
14
15
  from kodit.indexing.models import Index, Snippet
15
16
  from kodit.sources.models import File, Source
16
17
 
@@ -165,3 +166,13 @@ class IndexRepository:
165
166
  query = select(Snippet).order_by(Snippet.id)
166
167
  result = await self.session.execute(query)
167
168
  return list(result.scalars())
169
+
170
+ async def add_embedding(self, embedding: Embedding) -> None:
171
+ """Add a new embedding to the database.
172
+
173
+ Args:
174
+ embedding: The Embedding instance to add.
175
+
176
+ """
177
+ self.session.add(embedding)
178
+ await self.session.commit()
@@ -14,6 +14,8 @@ import structlog
14
14
  from tqdm.asyncio import tqdm
15
15
 
16
16
  from kodit.bm25.bm25 import BM25Service
17
+ from kodit.embedding.embedding import EmbeddingService
18
+ from kodit.embedding.models import Embedding, EmbeddingType
17
19
  from kodit.indexing.models import Snippet
18
20
  from kodit.indexing.repository import IndexRepository
19
21
  from kodit.snippets.snippets import SnippetService
@@ -50,6 +52,7 @@ class IndexService:
50
52
  repository: IndexRepository,
51
53
  source_service: SourceService,
52
54
  data_dir: Path,
55
+ embedding_model_name: str,
53
56
  ) -> None:
54
57
  """Initialize the index service.
55
58
 
@@ -63,6 +66,7 @@ class IndexService:
63
66
  self.snippet_service = SnippetService()
64
67
  self.log = structlog.get_logger(__name__)
65
68
  self.bm25 = BM25Service(data_dir)
69
+ self.code_embedding_service = EmbeddingService(model_name=embedding_model_name)
66
70
 
67
71
  async def create(self, source_id: int) -> IndexView:
68
72
  """Create a new index for a source.
@@ -128,9 +132,26 @@ class IndexService:
128
132
  # Create snippets for supported file types
129
133
  await self._create_snippets(index_id)
130
134
 
131
- # Update BM25 index
132
135
  snippets = await self.repository.get_all_snippets()
133
- self.bm25.index([snippet.content for snippet in snippets])
136
+
137
+ self.log.info("Creating keyword index")
138
+ self.bm25.index(
139
+ [
140
+ snippet.content
141
+ for snippet in tqdm(snippets, total=len(snippets), leave=False)
142
+ ]
143
+ )
144
+
145
+ self.log.info("Creating semantic code index")
146
+ for snippet in tqdm(snippets, total=len(snippets), leave=False):
147
+ embedding = next(self.code_embedding_service.embed([snippet.content]))
148
+ await self.repository.add_embedding(
149
+ Embedding(
150
+ snippet_id=snippet.id,
151
+ embedding=embedding,
152
+ type=EmbeddingType.CODE,
153
+ )
154
+ )
134
155
 
135
156
  # Update index timestamp
136
157
  await self.repository.update_index_timestamp(index)
@@ -148,7 +169,7 @@ class IndexService:
148
169
 
149
170
  """
150
171
  files = await self.repository.files_for_index(index_id)
151
- for file in tqdm(files, total=len(files)):
172
+ for file in tqdm(files, total=len(files), leave=False):
152
173
  # Skip unsupported file types
153
174
  if file.mime_type in MIME_BLACKLIST:
154
175
  self.log.debug("Skipping mime type", mime_type=file.mime_type)
@@ -87,7 +87,13 @@ def configure_logging(app_context: AppContext) -> None:
87
87
  # Configure uvicorn loggers to use our structlog setup
88
88
  # Uvicorn spits out loads of exception logs when sse server doesn't shut down
89
89
  # gracefully, so we hide them unless in DEBUG mode
90
- for _log in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
90
+ for _log in [
91
+ "uvicorn",
92
+ "uvicorn.error",
93
+ "uvicorn.access",
94
+ "bm25s",
95
+ "sentence_transformers.SentenceTransformer",
96
+ ]:
91
97
  if root_logger.getEffectiveLevel() == logging.DEBUG:
92
98
  logging.getLogger(_log).handlers.clear()
93
99
  logging.getLogger(_log).propagate = True
@@ -12,7 +12,7 @@ from pydantic import Field
12
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
13
 
14
14
  from kodit._version import version
15
- from kodit.config import AppContext
15
+ from kodit.config import DEFAULT_EMBEDDING_MODEL_NAME, AppContext
16
16
  from kodit.database import Database
17
17
  from kodit.retreival.repository import RetrievalRepository, RetrievalResult
18
18
  from kodit.retreival.service import RetrievalRequest, RetrievalService
@@ -115,18 +115,12 @@ async def retrieve_relevant_snippets(
115
115
  retrieval_service = RetrievalService(
116
116
  repository=retrieval_repository,
117
117
  data_dir=mcp_context.data_dir,
118
+ embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
118
119
  )
119
120
 
120
- log.debug("Fusing input")
121
- input_query = input_fusion(
122
- user_intent=user_intent,
123
- related_file_paths=related_file_paths,
124
- related_file_contents=related_file_contents,
125
- keywords=keywords,
126
- )
127
- log.debug("Input", input_query=input_query)
128
121
  retrieval_request = RetrievalRequest(
129
122
  keywords=keywords,
123
+ code_query="\n".join(related_file_contents),
130
124
  )
131
125
  log.debug("Retrieving snippets")
132
126
  snippets = await retrieval_service.retrieve(request=retrieval_request)
@@ -8,6 +8,7 @@ from sqlalchemy import pool
8
8
  from sqlalchemy.engine import Connection
9
9
  from sqlalchemy.ext.asyncio import async_engine_from_config
10
10
 
11
+ import kodit.embedding.models
11
12
  import kodit.indexing.models
12
13
  import kodit.sources.models
13
14
  from kodit.database import Base
@@ -0,0 +1,47 @@
1
+ # ruff: noqa
2
+ """add embeddings table
3
+
4
+ Revision ID: 7c3bbc2ab32b
5
+ Revises: 85155663351e
6
+ Create Date: 2025-05-23 17:23:09.924980
7
+
8
+ """
9
+
10
+ from typing import Sequence, Union
11
+
12
+ from alembic import op
13
+ import sqlalchemy as sa
14
+
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '7c3bbc2ab32b'
18
+ down_revision: Union[str, None] = '85155663351e'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade() -> None:
24
+ """Upgrade schema."""
25
+ # ### commands auto generated by Alembic - please adjust! ###
26
+ op.create_table('embeddings',
27
+ sa.Column('snippet_id', sa.Integer(), nullable=False),
28
+ sa.Column('type', sa.Enum('CODE', 'TEXT', name='embeddingtype'), nullable=False),
29
+ sa.Column('embedding', sa.JSON(), nullable=False),
30
+ sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
31
+ sa.Column('created_at', sa.DateTime(), nullable=False),
32
+ sa.Column('updated_at', sa.DateTime(), nullable=False),
33
+ sa.ForeignKeyConstraint(['snippet_id'], ['snippets.id'], ),
34
+ sa.PrimaryKeyConstraint('id')
35
+ )
36
+ op.create_index(op.f('ix_embeddings_snippet_id'), 'embeddings', ['snippet_id'], unique=False)
37
+ op.create_index(op.f('ix_embeddings_type'), 'embeddings', ['type'], unique=False)
38
+ # ### end Alembic commands ###
39
+
40
+
41
+ def downgrade() -> None:
42
+ """Downgrade schema."""
43
+ # ### commands auto generated by Alembic - please adjust! ###
44
+ op.drop_index(op.f('ix_embeddings_type'), table_name='embeddings')
45
+ op.drop_index(op.f('ix_embeddings_snippet_id'), table_name='embeddings')
46
+ op.drop_table('embeddings')
47
+ # ### end Alembic commands ###