kodit 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/cli.py +105 -19
- kodit/embedding/embedding_factory.py +2 -2
- kodit/embedding/embedding_provider/embedding_provider.py +9 -2
- kodit/embedding/embedding_provider/openai_embedding_provider.py +19 -7
- kodit/embedding/vectorchord_vector_search_service.py +26 -15
- kodit/enrichment/__init__.py +1 -0
- kodit/enrichment/enrichment_factory.py +23 -0
- kodit/enrichment/enrichment_provider/__init__.py +1 -0
- kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
- kodit/enrichment/enrichment_service.py +33 -0
- kodit/indexing/fusion.py +67 -0
- kodit/indexing/indexing_repository.py +41 -23
- kodit/indexing/indexing_service.py +128 -8
- kodit/mcp.py +25 -16
- kodit/migrations/versions/c3f5137d30f5_index_all_the_things.py +44 -0
- kodit/snippets/languages/go.scm +26 -0
- kodit/source/source_models.py +4 -4
- kodit-0.1.17.dist-info/METADATA +152 -0
- {kodit-0.1.15.dist-info → kodit-0.1.17.dist-info}/RECORD +25 -18
- kodit/search/__init__.py +0 -1
- kodit/search/search_repository.py +0 -57
- kodit/search/search_service.py +0 -135
- kodit-0.1.15.dist-info/METADATA +0 -89
- {kodit-0.1.15.dist-info → kodit-0.1.17.dist-info}/WHEEL +0 -0
- {kodit-0.1.15.dist-info → kodit-0.1.17.dist-info}/entry_points.txt +0 -0
- {kodit-0.1.15.dist-info → kodit-0.1.17.dist-info}/licenses/LICENSE +0 -0
kodit/indexing/fusion.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Fusion functions for combining search results."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
|
|
8
|
+
class FusionResult:
|
|
9
|
+
"""Result of a fusion operation."""
|
|
10
|
+
|
|
11
|
+
id: int
|
|
12
|
+
score: float
|
|
13
|
+
original_scores: list[float]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
|
|
17
|
+
class FusionRequest:
|
|
18
|
+
"""A single ranked item (id and score) submitted to a fusion operation."""
|
|
19
|
+
|
|
20
|
+
id: int
|
|
21
|
+
score: float
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def reciprocal_rank_fusion(
|
|
25
|
+
rankings: list[list[FusionRequest]], k: float = 60
|
|
26
|
+
) -> list[FusionResult]:
|
|
27
|
+
"""RRF prioritises results that are present in all results.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
rankings: List of rankers, each containing a list of document ids. Top of the
|
|
31
|
+
list is considered to be the best result.
|
|
32
|
+
k: Parameter for RRF.
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
List of fusion results sorted by fused score, highest first.
|
|
36
|
+
|
|
37
|
+
"""
|
|
38
|
+
scores = {}
|
|
39
|
+
for ranker in rankings:
|
|
40
|
+
for rank in ranker:
|
|
41
|
+
scores[rank.id] = float(0)
|
|
42
|
+
|
|
43
|
+
for ranker in rankings:
|
|
44
|
+
for i, rank in enumerate(ranker):
|
|
45
|
+
scores[rank.id] += 1.0 / (k + i)
|
|
46
|
+
|
|
47
|
+
# Create a list of tuples of ids and their scores
|
|
48
|
+
results = [(rank, scores[rank]) for rank in scores]
|
|
49
|
+
|
|
50
|
+
# Sort results by score
|
|
51
|
+
results.sort(key=lambda x: x[1], reverse=True)
|
|
52
|
+
|
|
53
|
+
# Create a map of ids to their original scores
|
|
54
|
+
original_scores_to_ids = defaultdict(list)
|
|
55
|
+
for ranker in rankings:
|
|
56
|
+
for rank in ranker:
|
|
57
|
+
original_scores_to_ids[rank.id].append(rank.score)
|
|
58
|
+
|
|
59
|
+
# Rebuild a list of final results with their original scores
|
|
60
|
+
return [
|
|
61
|
+
FusionResult(
|
|
62
|
+
id=result[0],
|
|
63
|
+
score=result[1],
|
|
64
|
+
original_scores=original_scores_to_ids[result[0]],
|
|
65
|
+
)
|
|
66
|
+
for result in results
|
|
67
|
+
]
|
|
@@ -10,7 +10,6 @@ from typing import TypeVar
|
|
|
10
10
|
|
|
11
11
|
from sqlalchemy import delete, func, select
|
|
12
12
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
13
|
-
from sqlalchemy.orm.exc import MultipleResultsFound
|
|
14
13
|
|
|
15
14
|
from kodit.embedding.embedding_models import Embedding
|
|
16
15
|
from kodit.indexing.indexing_models import Index, Snippet
|
|
@@ -125,34 +124,15 @@ class IndexRepository:
|
|
|
125
124
|
index.updated_at = datetime.now(UTC)
|
|
126
125
|
await self.session.commit()
|
|
127
126
|
|
|
128
|
-
async def
|
|
127
|
+
async def add_snippet(self, snippet: Snippet) -> None:
|
|
129
128
|
"""Add a new snippet to the database if it doesn't exist, otherwise update it.
|
|
130
129
|
|
|
131
130
|
Args:
|
|
132
131
|
snippet: The Snippet instance to add.
|
|
133
132
|
|
|
134
133
|
"""
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
Snippet.index_id == snippet.index_id,
|
|
138
|
-
)
|
|
139
|
-
result = await self.session.execute(query)
|
|
140
|
-
try:
|
|
141
|
-
existing_snippet = result.scalar_one_or_none()
|
|
142
|
-
|
|
143
|
-
if existing_snippet:
|
|
144
|
-
existing_snippet.content = snippet.content
|
|
145
|
-
else:
|
|
146
|
-
self.session.add(snippet)
|
|
147
|
-
|
|
148
|
-
await self.session.commit()
|
|
149
|
-
except MultipleResultsFound as e:
|
|
150
|
-
msg = (
|
|
151
|
-
f"Multiple snippets found for file_id {snippet.file_id}, this "
|
|
152
|
-
"shouldn't happen. "
|
|
153
|
-
"Please report this as a bug then delete your index and start again."
|
|
154
|
-
)
|
|
155
|
-
raise ValueError(msg) from e
|
|
134
|
+
self.session.add(snippet)
|
|
135
|
+
await self.session.commit()
|
|
156
136
|
|
|
157
137
|
async def delete_all_snippets(self, index_id: int) -> None:
|
|
158
138
|
"""Delete all snippets for an index.
|
|
@@ -161,6 +141,15 @@ class IndexRepository:
|
|
|
161
141
|
index_id: The ID of the index to delete snippets for.
|
|
162
142
|
|
|
163
143
|
"""
|
|
144
|
+
# First get all snippets for this index
|
|
145
|
+
snippets = await self.get_snippets_for_index(index_id)
|
|
146
|
+
|
|
147
|
+
# Delete all embeddings for these snippets, if there are any
|
|
148
|
+
for snippet in snippets:
|
|
149
|
+
query = delete(Embedding).where(Embedding.snippet_id == snippet.id)
|
|
150
|
+
await self.session.execute(query)
|
|
151
|
+
|
|
152
|
+
# Now delete the snippets
|
|
164
153
|
query = delete(Snippet).where(Snippet.index_id == index_id)
|
|
165
154
|
await self.session.execute(query)
|
|
166
155
|
await self.session.commit()
|
|
@@ -196,3 +185,32 @@ class IndexRepository:
|
|
|
196
185
|
"""
|
|
197
186
|
self.session.add(embedding)
|
|
198
187
|
await self.session.commit()
|
|
188
|
+
|
|
189
|
+
async def list_snippets_by_ids(self, ids: list[int]) -> list[tuple[File, Snippet]]:
|
|
190
|
+
"""List snippets by IDs.
|
|
191
|
+
|
|
192
|
+
Returns:
|
|
193
|
+
A list of (file, snippet) tuples in the same order as the input IDs.
|
|
194
|
+
|
|
195
|
+
"""
|
|
196
|
+
query = (
|
|
197
|
+
select(Snippet, File)
|
|
198
|
+
.where(Snippet.id.in_(ids))
|
|
199
|
+
.join(File, Snippet.file_id == File.id)
|
|
200
|
+
)
|
|
201
|
+
rows = await self.session.execute(query)
|
|
202
|
+
|
|
203
|
+
# Create a dictionary for O(1) lookup of results by ID
|
|
204
|
+
id_to_result = {snippet.id: (file, snippet) for snippet, file in rows.all()}
|
|
205
|
+
|
|
206
|
+
# Check that all IDs are present
|
|
207
|
+
if len(id_to_result) != len(ids):
|
|
208
|
+
# Create a list of missing IDs
|
|
209
|
+
missing_ids = [
|
|
210
|
+
snippet_id for snippet_id in ids if snippet_id not in id_to_result
|
|
211
|
+
]
|
|
212
|
+
msg = f"Some IDs are not present: {missing_ids}"
|
|
213
|
+
raise ValueError(msg)
|
|
214
|
+
|
|
215
|
+
# Rebuild the list in the same order that it was passed in
|
|
216
|
+
return [id_to_result[i] for i in ids]
|
|
@@ -13,11 +13,17 @@ import pydantic
|
|
|
13
13
|
import structlog
|
|
14
14
|
from tqdm.asyncio import tqdm
|
|
15
15
|
|
|
16
|
-
from kodit.bm25.keyword_search_service import
|
|
16
|
+
from kodit.bm25.keyword_search_service import (
|
|
17
|
+
BM25Document,
|
|
18
|
+
BM25Result,
|
|
19
|
+
KeywordSearchProvider,
|
|
20
|
+
)
|
|
17
21
|
from kodit.embedding.vector_search_service import (
|
|
18
22
|
VectorSearchRequest,
|
|
19
23
|
VectorSearchService,
|
|
20
24
|
)
|
|
25
|
+
from kodit.enrichment.enrichment_service import EnrichmentService
|
|
26
|
+
from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
|
|
21
27
|
from kodit.indexing.indexing_models import Snippet
|
|
22
28
|
from kodit.indexing.indexing_repository import IndexRepository
|
|
23
29
|
from kodit.snippets.snippets import SnippetService
|
|
@@ -42,6 +48,28 @@ class IndexView(pydantic.BaseModel):
|
|
|
42
48
|
num_snippets: int | None = None
|
|
43
49
|
|
|
44
50
|
|
|
51
|
+
class SearchRequest(pydantic.BaseModel):
|
|
52
|
+
"""Request for a search."""
|
|
53
|
+
|
|
54
|
+
text_query: str | None = None
|
|
55
|
+
code_query: str | None = None
|
|
56
|
+
keywords: list[str] | None = None
|
|
57
|
+
top_k: int = 10
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class SearchResult(pydantic.BaseModel):
|
|
61
|
+
"""Data transfer object for search results.
|
|
62
|
+
|
|
63
|
+
This model represents a single search result, containing both the file path
|
|
64
|
+
and the matching snippet content.
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
id: int
|
|
68
|
+
uri: str
|
|
69
|
+
content: str
|
|
70
|
+
original_scores: list[float]
|
|
71
|
+
|
|
72
|
+
|
|
45
73
|
class IndexService:
|
|
46
74
|
"""Service for managing code indexes.
|
|
47
75
|
|
|
@@ -50,12 +78,14 @@ class IndexService:
|
|
|
50
78
|
IndexRepository), and provides a clean API for index management.
|
|
51
79
|
"""
|
|
52
80
|
|
|
53
|
-
def __init__(
|
|
81
|
+
def __init__( # noqa: PLR0913
|
|
54
82
|
self,
|
|
55
83
|
repository: IndexRepository,
|
|
56
84
|
source_service: SourceService,
|
|
57
85
|
keyword_search_provider: KeywordSearchProvider,
|
|
58
|
-
|
|
86
|
+
code_search_service: VectorSearchService,
|
|
87
|
+
text_search_service: VectorSearchService,
|
|
88
|
+
enrichment_service: EnrichmentService,
|
|
59
89
|
) -> None:
|
|
60
90
|
"""Initialize the index service.
|
|
61
91
|
|
|
@@ -69,7 +99,9 @@ class IndexService:
|
|
|
69
99
|
self.snippet_service = SnippetService()
|
|
70
100
|
self.log = structlog.get_logger(__name__)
|
|
71
101
|
self.keyword_search_provider = keyword_search_provider
|
|
72
|
-
self.code_search_service =
|
|
102
|
+
self.code_search_service = code_search_service
|
|
103
|
+
self.text_search_service = text_search_service
|
|
104
|
+
self.enrichment_service = enrichment_service
|
|
73
105
|
|
|
74
106
|
async def create(self, source_id: int) -> IndexView:
|
|
75
107
|
"""Create a new index for a source.
|
|
@@ -129,10 +161,15 @@ class IndexService:
|
|
|
129
161
|
msg = f"Index not found: {index_id}"
|
|
130
162
|
raise ValueError(msg)
|
|
131
163
|
|
|
164
|
+
# Delete old snippets so we don't duplicate. In the future should probably check
|
|
165
|
+
# which files have changed and only change those.
|
|
166
|
+
await self.repository.delete_all_snippets(index.id)
|
|
167
|
+
|
|
132
168
|
# Create snippets for supported file types
|
|
133
|
-
|
|
169
|
+
self.log.info("Creating snippets for files", index_id=index.id)
|
|
170
|
+
await self._create_snippets(index.id)
|
|
134
171
|
|
|
135
|
-
snippets = await self.repository.get_all_snippets(
|
|
172
|
+
snippets = await self.repository.get_all_snippets(index.id)
|
|
136
173
|
|
|
137
174
|
self.log.info("Creating keyword index")
|
|
138
175
|
with Spinner():
|
|
@@ -152,9 +189,93 @@ class IndexService:
|
|
|
152
189
|
]
|
|
153
190
|
)
|
|
154
191
|
|
|
192
|
+
self.log.info("Enriching snippets", num_snippets=len(snippets))
|
|
193
|
+
enriched_contents = await self.enrichment_service.enrich(
|
|
194
|
+
[snippet.content for snippet in snippets]
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
self.log.info("Creating semantic text index")
|
|
198
|
+
with Spinner():
|
|
199
|
+
await self.text_search_service.index(
|
|
200
|
+
[
|
|
201
|
+
VectorSearchRequest(snippet.id, enriched_content)
|
|
202
|
+
for snippet, enriched_content in zip(
|
|
203
|
+
snippets, enriched_contents, strict=True
|
|
204
|
+
)
|
|
205
|
+
]
|
|
206
|
+
)
|
|
207
|
+
# Add the enriched text back to the snippets and write to the database
|
|
208
|
+
for snippet, enriched_content in zip(
|
|
209
|
+
snippets, enriched_contents, strict=True
|
|
210
|
+
):
|
|
211
|
+
snippet.content = (
|
|
212
|
+
enriched_content + "\n\n```\n" + snippet.content + "\n```"
|
|
213
|
+
)
|
|
214
|
+
await self.repository.add_snippet(snippet)
|
|
215
|
+
|
|
155
216
|
# Update index timestamp
|
|
156
217
|
await self.repository.update_index_timestamp(index)
|
|
157
218
|
|
|
219
|
+
async def search(self, request: SearchRequest) -> list[SearchResult]:
|
|
220
|
+
"""Search for relevant data."""
|
|
221
|
+
fusion_list: list[list[FusionRequest]] = []
|
|
222
|
+
if request.keywords:
|
|
223
|
+
# Gather results for each keyword
|
|
224
|
+
result_ids: list[BM25Result] = []
|
|
225
|
+
for keyword in request.keywords:
|
|
226
|
+
results = await self.keyword_search_provider.retrieve(
|
|
227
|
+
keyword, request.top_k
|
|
228
|
+
)
|
|
229
|
+
result_ids.extend(results)
|
|
230
|
+
|
|
231
|
+
fusion_list.append(
|
|
232
|
+
[FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
# Compute embedding for semantic query
|
|
236
|
+
if request.code_query:
|
|
237
|
+
query_embedding = await self.code_search_service.retrieve(
|
|
238
|
+
request.code_query, top_k=request.top_k
|
|
239
|
+
)
|
|
240
|
+
fusion_list.append(
|
|
241
|
+
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
if request.text_query:
|
|
245
|
+
query_embedding = await self.text_search_service.retrieve(
|
|
246
|
+
request.text_query, top_k=request.top_k
|
|
247
|
+
)
|
|
248
|
+
fusion_list.append(
|
|
249
|
+
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
if len(fusion_list) == 0:
|
|
253
|
+
return []
|
|
254
|
+
|
|
255
|
+
# Combine all results together with RRF if required
|
|
256
|
+
final_results = reciprocal_rank_fusion(
|
|
257
|
+
rankings=fusion_list,
|
|
258
|
+
k=60,
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
# Only keep top_k results
|
|
262
|
+
final_results = final_results[: request.top_k]
|
|
263
|
+
|
|
264
|
+
# Get snippets from database (up to top_k)
|
|
265
|
+
search_results = await self.repository.list_snippets_by_ids(
|
|
266
|
+
[x.id for x in final_results]
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
return [
|
|
270
|
+
SearchResult(
|
|
271
|
+
id=snippet.id,
|
|
272
|
+
uri=file.uri,
|
|
273
|
+
content=snippet.content,
|
|
274
|
+
original_scores=fr.original_scores,
|
|
275
|
+
)
|
|
276
|
+
for (file, snippet), fr in zip(search_results, final_results, strict=True)
|
|
277
|
+
]
|
|
278
|
+
|
|
158
279
|
async def _create_snippets(
|
|
159
280
|
self,
|
|
160
281
|
index_id: int,
|
|
@@ -168,7 +289,6 @@ class IndexService:
|
|
|
168
289
|
|
|
169
290
|
"""
|
|
170
291
|
files = await self.repository.files_for_index(index_id)
|
|
171
|
-
self.log.info("Creating snippets for files", index_id=index_id)
|
|
172
292
|
for file in tqdm(files, total=len(files), leave=False):
|
|
173
293
|
# Skip unsupported file types
|
|
174
294
|
if file.mime_type in MIME_BLACKLIST:
|
|
@@ -190,4 +310,4 @@ class IndexService:
|
|
|
190
310
|
file_id=file.id,
|
|
191
311
|
content=snippet.text,
|
|
192
312
|
)
|
|
193
|
-
await self.repository.
|
|
313
|
+
await self.repository.add_snippet(s)
|
kodit/mcp.py
CHANGED
|
@@ -16,8 +16,11 @@ from kodit.bm25.keyword_search_factory import keyword_search_factory
|
|
|
16
16
|
from kodit.config import AppContext
|
|
17
17
|
from kodit.database import Database
|
|
18
18
|
from kodit.embedding.embedding_factory import embedding_factory
|
|
19
|
-
from kodit.
|
|
20
|
-
from kodit.
|
|
19
|
+
from kodit.enrichment.enrichment_factory import enrichment_factory
|
|
20
|
+
from kodit.indexing.indexing_repository import IndexRepository
|
|
21
|
+
from kodit.indexing.indexing_service import IndexService, SearchRequest, SearchResult
|
|
22
|
+
from kodit.source.source_repository import SourceRepository
|
|
23
|
+
from kodit.source.source_service import SourceService
|
|
21
24
|
|
|
22
25
|
|
|
23
26
|
@dataclass
|
|
@@ -123,32 +126,38 @@ async def search(
|
|
|
123
126
|
|
|
124
127
|
mcp_context: MCPContext = ctx.request_context.lifespan_context
|
|
125
128
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
+
source_repository = SourceRepository(mcp_context.session)
|
|
130
|
+
source_service = SourceService(
|
|
131
|
+
mcp_context.app_context.get_clone_dir(), source_repository
|
|
129
132
|
)
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
log.debug("Creating search service")
|
|
137
|
-
search_service = SearchService(
|
|
138
|
-
repository=search_repository,
|
|
133
|
+
repository = IndexRepository(mcp_context.session)
|
|
134
|
+
service = IndexService(
|
|
135
|
+
repository=repository,
|
|
136
|
+
source_service=source_service,
|
|
139
137
|
keyword_search_provider=keyword_search_factory(
|
|
138
|
+
mcp_context.app_context, mcp_context.session
|
|
139
|
+
),
|
|
140
|
+
code_search_service=embedding_factory(
|
|
141
|
+
task_name="code",
|
|
140
142
|
app_context=mcp_context.app_context,
|
|
141
143
|
session=mcp_context.session,
|
|
142
144
|
),
|
|
143
|
-
|
|
145
|
+
text_search_service=embedding_factory(
|
|
146
|
+
task_name="text",
|
|
147
|
+
app_context=mcp_context.app_context,
|
|
148
|
+
session=mcp_context.session,
|
|
149
|
+
),
|
|
150
|
+
enrichment_service=enrichment_factory(mcp_context.app_context),
|
|
144
151
|
)
|
|
145
152
|
|
|
146
153
|
search_request = SearchRequest(
|
|
147
154
|
keywords=keywords,
|
|
148
155
|
code_query="\n".join(related_file_contents),
|
|
156
|
+
text_query=user_intent,
|
|
149
157
|
)
|
|
158
|
+
|
|
150
159
|
log.debug("Searching for snippets")
|
|
151
|
-
snippets = await
|
|
160
|
+
snippets = await service.search(request=search_request)
|
|
152
161
|
|
|
153
162
|
log.debug("Fusing output")
|
|
154
163
|
output = output_fusion(snippets=snippets)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# ruff: noqa
|
|
2
|
+
"""index all the things
|
|
3
|
+
|
|
4
|
+
Revision ID: c3f5137d30f5
|
|
5
|
+
Revises: 7c3bbc2ab32b
|
|
6
|
+
Create Date: 2025-06-05 17:17:32.440740
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Sequence, Union
|
|
11
|
+
|
|
12
|
+
from alembic import op
|
|
13
|
+
import sqlalchemy as sa
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = 'c3f5137d30f5'
|
|
18
|
+
down_revision: Union[str, None] = '7c3bbc2ab32b'
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade() -> None:
|
|
24
|
+
"""Upgrade schema."""
|
|
25
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
26
|
+
op.create_index(op.f('ix_files_cloned_path'), 'files', ['cloned_path'], unique=False)
|
|
27
|
+
op.create_index(op.f('ix_files_mime_type'), 'files', ['mime_type'], unique=False)
|
|
28
|
+
op.create_index(op.f('ix_files_uri'), 'files', ['uri'], unique=False)
|
|
29
|
+
op.create_index(op.f('ix_snippets_file_id'), 'snippets', ['file_id'], unique=False)
|
|
30
|
+
op.create_index(op.f('ix_snippets_index_id'), 'snippets', ['index_id'], unique=False)
|
|
31
|
+
op.create_index(op.f('ix_sources_cloned_path'), 'sources', ['cloned_path'], unique=False)
|
|
32
|
+
# ### end Alembic commands ###
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def downgrade() -> None:
|
|
36
|
+
"""Downgrade schema."""
|
|
37
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
38
|
+
op.drop_index(op.f('ix_sources_cloned_path'), table_name='sources')
|
|
39
|
+
op.drop_index(op.f('ix_snippets_index_id'), table_name='snippets')
|
|
40
|
+
op.drop_index(op.f('ix_snippets_file_id'), table_name='snippets')
|
|
41
|
+
op.drop_index(op.f('ix_files_uri'), table_name='files')
|
|
42
|
+
op.drop_index(op.f('ix_files_mime_type'), table_name='files')
|
|
43
|
+
op.drop_index(op.f('ix_files_cloned_path'), table_name='files')
|
|
44
|
+
# ### end Alembic commands ###
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
(function_declaration
|
|
2
|
+
name: (identifier) @function.name
|
|
3
|
+
body: (block) @function.body
|
|
4
|
+
) @function.def
|
|
5
|
+
|
|
6
|
+
(method_declaration
|
|
7
|
+
name: (field_identifier) @method.name
|
|
8
|
+
body: (block) @method.body
|
|
9
|
+
) @method.def
|
|
10
|
+
|
|
11
|
+
(import_declaration
|
|
12
|
+
(import_spec
|
|
13
|
+
path: (interpreted_string_literal) @import.name
|
|
14
|
+
)
|
|
15
|
+
) @import.statement
|
|
16
|
+
|
|
17
|
+
(identifier) @ident
|
|
18
|
+
|
|
19
|
+
(parameter_declaration
|
|
20
|
+
name: (identifier) @param.name
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
(package_clause "package" (package_identifier) @name.definition.module)
|
|
24
|
+
|
|
25
|
+
;; Exclude comments from being captured
|
|
26
|
+
(comment) @comment
|
kodit/source/source_models.py
CHANGED
|
@@ -31,7 +31,7 @@ class Source(Base, CommonMixin):
|
|
|
31
31
|
|
|
32
32
|
__tablename__ = "sources"
|
|
33
33
|
uri: Mapped[str] = mapped_column(String(1024), index=True, unique=True)
|
|
34
|
-
cloned_path: Mapped[str] = mapped_column(String(1024))
|
|
34
|
+
cloned_path: Mapped[str] = mapped_column(String(1024), index=True)
|
|
35
35
|
|
|
36
36
|
def __init__(self, uri: str, cloned_path: str) -> None:
|
|
37
37
|
"""Initialize a new Source instance for typing purposes."""
|
|
@@ -46,9 +46,9 @@ class File(Base, CommonMixin):
|
|
|
46
46
|
__tablename__ = "files"
|
|
47
47
|
|
|
48
48
|
source_id: Mapped[int] = mapped_column(ForeignKey("sources.id"))
|
|
49
|
-
mime_type: Mapped[str] = mapped_column(String(255), default="")
|
|
50
|
-
uri: Mapped[str] = mapped_column(String(1024), default="")
|
|
51
|
-
cloned_path: Mapped[str] = mapped_column(String(1024))
|
|
49
|
+
mime_type: Mapped[str] = mapped_column(String(255), default="", index=True)
|
|
50
|
+
uri: Mapped[str] = mapped_column(String(1024), default="", index=True)
|
|
51
|
+
cloned_path: Mapped[str] = mapped_column(String(1024), index=True)
|
|
52
52
|
sha256: Mapped[str] = mapped_column(String(64), default="", index=True)
|
|
53
53
|
size_bytes: Mapped[int] = mapped_column(Integer, default=0)
|
|
54
54
|
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kodit
|
|
3
|
+
Version: 0.1.17
|
|
4
|
+
Summary: Code indexing for better AI code generation
|
|
5
|
+
Project-URL: Homepage, https://docs.helixml.tech/kodit/
|
|
6
|
+
Project-URL: Documentation, https://docs.helixml.tech/kodit/
|
|
7
|
+
Project-URL: Repository, https://github.com/helixml/kodit.git
|
|
8
|
+
Project-URL: Issues, https://github.com/helixml/kodit/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/helixml/kodit/releases
|
|
10
|
+
Author-email: "Helix.ML" <founders@helix.ml>
|
|
11
|
+
Maintainer-email: "Helix.ML" <founders@helix.ml>
|
|
12
|
+
License-Expression: Apache-2.0
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: ai,indexing,mcp,rag
|
|
15
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: aiofiles>=24.1.0
|
|
22
|
+
Requires-Dist: aiosqlite>=0.20.0
|
|
23
|
+
Requires-Dist: alembic>=1.15.2
|
|
24
|
+
Requires-Dist: asgi-correlation-id>=4.3.4
|
|
25
|
+
Requires-Dist: asyncpg>=0.30.0
|
|
26
|
+
Requires-Dist: better-exceptions>=0.3.3
|
|
27
|
+
Requires-Dist: bm25s[core]>=0.2.12
|
|
28
|
+
Requires-Dist: click>=8.1.8
|
|
29
|
+
Requires-Dist: colorama>=0.4.6
|
|
30
|
+
Requires-Dist: dotenv>=0.9.9
|
|
31
|
+
Requires-Dist: fastapi[standard]>=0.115.12
|
|
32
|
+
Requires-Dist: fastmcp>=2.3.3
|
|
33
|
+
Requires-Dist: gitpython>=3.1.44
|
|
34
|
+
Requires-Dist: hf-xet>=1.1.2
|
|
35
|
+
Requires-Dist: httpx-retries>=0.3.2
|
|
36
|
+
Requires-Dist: httpx>=0.28.1
|
|
37
|
+
Requires-Dist: openai>=1.82.0
|
|
38
|
+
Requires-Dist: posthog>=4.0.1
|
|
39
|
+
Requires-Dist: pydantic-settings>=2.9.1
|
|
40
|
+
Requires-Dist: pytable-formatter>=0.1.1
|
|
41
|
+
Requires-Dist: sentence-transformers>=4.1.0
|
|
42
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0.40
|
|
43
|
+
Requires-Dist: structlog>=25.3.0
|
|
44
|
+
Requires-Dist: tdqm>=0.0.1
|
|
45
|
+
Requires-Dist: tiktoken>=0.9.0
|
|
46
|
+
Requires-Dist: transformers>=4.51.3
|
|
47
|
+
Requires-Dist: tree-sitter-language-pack>=0.7.3
|
|
48
|
+
Requires-Dist: tree-sitter>=0.24.0
|
|
49
|
+
Requires-Dist: uritools>=5.0.0
|
|
50
|
+
Description-Content-Type: text/markdown
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<a href="https://docs.helix.ml/kodit/"><img src="https://docs.helix.ml/images/helix-kodit-logo.png" alt="Helix Kodit Logo" width="300"></a>
|
|
54
|
+
</p>
|
|
55
|
+
|
|
56
|
+
<h1 align="center">
|
|
57
|
+
Kodit: A Code Indexing MCP Server
|
|
58
|
+
</h1>
|
|
59
|
+
|
|
60
|
+
<p align="center">
|
|
61
|
+
Kodit connects your AI coding assistant to external codebases to provide accurate and up-to-date snippets of code.
|
|
62
|
+
</p>
|
|
63
|
+
|
|
64
|
+
<div align="center">
|
|
65
|
+
|
|
66
|
+
[Documentation](https://docs.helix.ml/kodit/)
|
|
67
|
+
[License](./LICENSE)
|
|
68
|
+
[Discussions](https://github.com/helixml/kodit/discussions)
|
|
69
|
+
|
|
70
|
+
</div>
|
|
71
|
+
|
|
72
|
+
**Helix Kodit** is an **MCP server** that connects your AI coding assistant to external codebases. It can:
|
|
73
|
+
|
|
74
|
+
- Improve your AI-assisted code by providing canonical examples direct from the source
|
|
75
|
+
- Index local and public codebases
|
|
76
|
+
- Integrate with any AI coding assistant via MCP
|
|
77
|
+
- Search using keyword and semantic search
|
|
78
|
+
- Integrate with any OpenAI-compatible or custom API/model
|
|
79
|
+
|
|
80
|
+
If you're an engineer working with AI-powered coding assistants, Kodit helps by
|
|
81
|
+
providing relevant and up-to-date examples of your task so that LLMs make fewer mistakes
|
|
82
|
+
and produce fewer hallucinations.
|
|
83
|
+
|
|
84
|
+
## ✨ Features
|
|
85
|
+
|
|
86
|
+
### Codebase Indexing
|
|
87
|
+
|
|
88
|
+
Kodit connects to a variety of local and remote codebases to build an index of your
|
|
89
|
+
code. This index is used to build a snippet library, ready for ingestion into an LLM.
|
|
90
|
+
|
|
91
|
+
- Index local directories and public Git repositories
|
|
92
|
+
- Build comprehensive snippet libraries for LLM ingestion
|
|
93
|
+
- Support for multiple codebase types and languages
|
|
94
|
+
- Efficient indexing and search capabilities
|
|
95
|
+
|
|
96
|
+
### MCP Server
|
|
97
|
+
|
|
98
|
+
Relevant snippets are exposed to an AI coding assistant via an MCP server. This allows
|
|
99
|
+
the assistant to request relevant snippets by providing keywords, code, and semantic
|
|
100
|
+
intent. Kodit has been tested to work well with:
|
|
101
|
+
|
|
102
|
+
- Seamless integration with popular AI coding assistants
|
|
103
|
+
- Tested and verified with:
|
|
104
|
+
- [Cursor](https://docs.helix.ml/kodit/#integration-with-cursor)
|
|
105
|
+
- [Cline](https://docs.helix.ml/kodit/#integration-with-cline)
|
|
106
|
+
- Please contribute more instructions! ... any other assistant is likely to work ...
|
|
107
|
+
|
|
108
|
+
### Enterprise Ready
|
|
109
|
+
|
|
110
|
+
Out of the box, Kodit works with a local SQLite database and very small, local models.
|
|
111
|
+
But enterprises can scale out with performant databases and dedicated models. Everything
|
|
112
|
+
can even run securely, privately, with on-premise LLM platforms like
|
|
113
|
+
[Helix](https://helix.ml).
|
|
114
|
+
|
|
115
|
+
Supported databases:
|
|
116
|
+
|
|
117
|
+
- SQLite
|
|
118
|
+
- [Vectorchord](https://github.com/tensorchord/VectorChord)
|
|
119
|
+
|
|
120
|
+
Supported providers:
|
|
121
|
+
|
|
122
|
+
- Local (which uses tiny CPU-only open-source models)
|
|
123
|
+
- OpenAI
|
|
124
|
+
- Secure, private LLM enclave with [Helix](https://helix.ml).
|
|
125
|
+
- Any other OpenAI compatible API
|
|
126
|
+
|
|
127
|
+
## 🚀 Quick Start
|
|
128
|
+
|
|
129
|
+
1. [Install Kodit](https://docs.helix.ml/kodit/#installation)
|
|
130
|
+
2. [Index codebases](https://docs.helix.ml/kodit/#quick-start)
|
|
131
|
+
3. [Integrate with your coding assistant](https://docs.helix.ml/kodit/#integrating-kodit-with-coding-assistants)
|
|
132
|
+
|
|
133
|
+
### Documentation
|
|
134
|
+
|
|
135
|
+
- [Installation Guide](https://docs.helix.ml/kodit/#installation)
|
|
136
|
+
- [Usage Guide](https://docs.helix.ml/kodit/#quick-start)
|
|
137
|
+
- [Connecting to Kodit](https://docs.helix.ml/kodit/#integrating-kodit-with-coding-assistants)
|
|
138
|
+
- [Configuration Options](https://docs.helix.ml/kodit/#configuring-kodit)
|
|
139
|
+
- [Contribution Guidelines](.github/CONTRIBUTING.md)
|
|
140
|
+
|
|
141
|
+
## Roadmap
|
|
142
|
+
|
|
143
|
+
The roadmap is currently maintained as a [GitHub Project](https://github.com/orgs/helixml/projects/4).
|
|
144
|
+
|
|
145
|
+
## 💬 Support
|
|
146
|
+
|
|
147
|
+
For commercial support, please contact [Helix.ML](mailto:founders@helix.ml). To ask a question,
|
|
148
|
+
please [open a discussion](https://github.com/helixml/kodit/discussions).
|
|
149
|
+
|
|
150
|
+
## License
|
|
151
|
+
|
|
152
|
+
[Apache 2.0 © 2025 HelixML, Inc.](./LICENSE)
|