kodit 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +363 -0
- kodit/application/services/snippet_application_service.py +143 -0
- kodit/cli.py +105 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +83 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +119 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +133 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +296 -0
- kodit/infrastructure/indexing/indexing_factory.py +111 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
- kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/repository.py +121 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +50 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
- kodit-0.2.5.dist-info/RECORD +99 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -63
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
- kodit/embedding/local_vector_search_service.py +0 -54
- kodit/embedding/vector_search_service.py +0 -38
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
- kodit/enrichment/enrichment_service.py +0 -33
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -338
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.3.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""Index models for managing code indexes.
|
|
2
|
-
|
|
3
|
-
This module defines the SQLAlchemy models used for storing and managing code indexes,
|
|
4
|
-
including files and snippets. It provides the data structures for tracking indexed
|
|
5
|
-
files and their content.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from sqlalchemy import ForeignKey, UnicodeText
|
|
9
|
-
from sqlalchemy.orm import Mapped, mapped_column
|
|
10
|
-
|
|
11
|
-
from kodit.database import Base, CommonMixin
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class Index(Base, CommonMixin):
    """ORM row describing a code index; each source may own at most one."""

    __tablename__ = "indexes"

    # ``unique=True`` is what limits a source to a single index row.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("sources.id"),
        unique=True,
        index=True,
    )

    def __init__(self, source_id: int) -> None:
        """Create an index bound to the source identified by ``source_id``."""
        super().__init__()
        self.source_id = source_id
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class Snippet(Base, CommonMixin):
    """ORM row for one extracted snippet, linked to its file and its index."""

    __tablename__ = "snippets"

    # Both foreign keys are indexed; content defaults to the empty string.
    file_id: Mapped[int] = mapped_column(ForeignKey("files.id"), index=True)
    index_id: Mapped[int] = mapped_column(ForeignKey("indexes.id"), index=True)
    content: Mapped[str] = mapped_column(UnicodeText, default="")

    def __init__(self, file_id: int, index_id: int, content: str) -> None:
        """Create a snippet.

        Args:
            file_id: ID of the file row the snippet belongs to.
            index_id: ID of the index row the snippet belongs to.
            content: Text stored for the snippet.

        """
        super().__init__()
        self.file_id, self.index_id, self.content = file_id, index_id, content
|
|
@@ -1,216 +0,0 @@
|
|
|
1
|
-
"""Repository for managing code indexes and their associated files and snippets.
|
|
2
|
-
|
|
3
|
-
This module provides the IndexRepository class which handles all database operations
|
|
4
|
-
related to code indexes, including creating indexes, managing files and snippets,
|
|
5
|
-
and retrieving index information with their associated metadata.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from datetime import UTC, datetime
|
|
9
|
-
from typing import TypeVar
|
|
10
|
-
|
|
11
|
-
from sqlalchemy import delete, func, select
|
|
12
|
-
from sqlalchemy.ext.asyncio import AsyncSession
|
|
13
|
-
|
|
14
|
-
from kodit.embedding.embedding_models import Embedding
|
|
15
|
-
from kodit.indexing.indexing_models import Index, Snippet
|
|
16
|
-
from kodit.source.source_models import File, Source
|
|
17
|
-
|
|
18
|
-
# Generic type variable; no code visible in this module references it.
T = TypeVar("T")
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class IndexRepository:
    """Data-access layer for indexes and their files, snippets and embeddings.

    Every method runs its statements on the ``AsyncSession`` supplied at
    construction time. The mutating methods (``create``,
    ``update_index_timestamp``, ``add_snippet``, ``delete_all_snippets`` and
    ``add_embedding``) commit that session before returning.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize the index repository.

        Args:
            session: The SQLAlchemy async session to use for database operations.

        """
        self.session = session

    async def create(self, source_id: int) -> Index:
        """Create and commit a new index for a source.

        Args:
            source_id: The ID of the source to create an index for.

        Returns:
            The newly created Index instance.

        """
        index = Index(source_id=source_id)
        self.session.add(index)
        await self.session.commit()
        return index

    async def get_by_id(self, index_id: int) -> Index | None:
        """Get an index by its ID.

        Args:
            index_id: The ID of the index to retrieve.

        Returns:
            The Index instance if found, None otherwise.

        """
        query = select(Index).where(Index.id == index_id)
        result = await self.session.execute(query)
        return result.scalar_one_or_none()

    async def get_by_source_id(self, source_id: int) -> Index | None:
        """Get an index by its source ID.

        Args:
            source_id: The ID of the source to retrieve an index for.

        Returns:
            The Index instance if found, None otherwise.

        """
        query = select(Index).where(Index.source_id == source_id)
        result = await self.session.execute(query)
        return result.scalar_one_or_none()

    async def files_for_index(self, index_id: int) -> list[File]:
        """Get all files for an index.

        Files are reached through the index's source
        (File -> Source -> Index).

        Args:
            index_id: The ID of the index to get files for.

        Returns:
            A list of File instances.

        """
        query = (
            select(File)
            .join(Source, File.source_id == Source.id)
            .join(Index, Index.source_id == Source.id)
            .where(Index.id == index_id)
        )
        result = await self.session.execute(query)
        return list(result.scalars())

    async def list_indexes(self) -> list[tuple[Index, Source]]:
        """List all indexes together with the source each one belongs to.

        Returns:
            One ``(Index, Source)`` pair per index.

        """
        # Inner join on purpose: a FULL OUTER JOIN would also emit rows such as
        # (None, Source) for sources that have no index yet, which contradicts
        # the declared return type and breaks callers that read ``index.id``.
        query = select(Index, Source).join(Source, Index.source_id == Source.id)
        result = await self.session.execute(query)
        return list(result.tuples())

    async def num_snippets_for_index(self, index_id: int) -> int:
        """Get the number of snippets for an index.

        Args:
            index_id: The ID of the index whose snippets are counted.

        Returns:
            The number of snippet rows that reference the index.

        """
        query = select(func.count()).where(Snippet.index_id == index_id)
        result = await self.session.execute(query)
        return result.scalar_one()

    async def update_index_timestamp(self, index: Index) -> None:
        """Set ``updated_at`` of an index to the current UTC time and commit.

        Args:
            index: The Index instance to update.

        """
        index.updated_at = datetime.now(UTC)
        await self.session.commit()

    async def add_snippet(self, snippet: Snippet) -> None:
        """Add a snippet to the session and commit.

        Args:
            snippet: The Snippet instance to add.

        """
        self.session.add(snippet)
        await self.session.commit()

    async def delete_all_snippets(self, index_id: int) -> None:
        """Delete all snippets for an index, together with their embeddings.

        Args:
            index_id: The ID of the index to delete snippets for.

        """
        # Embeddings go first, selected through a subquery, so that neither the
        # snippet rows have to be loaded nor one DELETE issued per snippet.
        snippet_ids = select(Snippet.id).where(Snippet.index_id == index_id)
        await self.session.execute(
            delete(Embedding).where(Embedding.snippet_id.in_(snippet_ids))
        )

        # Now delete the snippets
        await self.session.execute(delete(Snippet).where(Snippet.index_id == index_id))
        await self.session.commit()

    async def get_snippets_for_index(self, index_id: int) -> list[Snippet]:
        """Get all snippets for an index, in no particular order.

        Args:
            index_id: The ID of the index to get snippets for.

        Returns:
            A list of Snippet instances.

        """
        query = select(Snippet).where(Snippet.index_id == index_id)
        result = await self.session.execute(query)
        return list(result.scalars())

    async def get_all_snippets(self, index_id: int) -> list[Snippet]:
        """Get all snippets for an index, ordered by snippet ID.

        Args:
            index_id: The ID of the index to get snippets for.

        Returns:
            A list of Snippet instances sorted by ascending ID.

        """
        query = select(Snippet).where(Snippet.index_id == index_id).order_by(Snippet.id)
        result = await self.session.execute(query)
        return list(result.scalars())

    async def add_embedding(self, embedding: Embedding) -> None:
        """Add a new embedding to the database and commit.

        Args:
            embedding: The Embedding instance to add.

        """
        self.session.add(embedding)
        await self.session.commit()

    async def list_snippets_by_ids(self, ids: list[int]) -> list[tuple[File, Snippet]]:
        """List snippets, each paired with its file, by snippet ID.

        Args:
            ids: Snippet IDs to look up. An ID may appear more than once.

        Returns:
            ``(File, Snippet)`` pairs in the same order as the input IDs.

        Raises:
            ValueError: If at least one of the requested IDs does not exist.

        """
        query = (
            select(Snippet, File)
            .where(Snippet.id.in_(ids))
            .join(File, Snippet.file_id == File.id)
        )
        rows = await self.session.execute(query)

        # Create a dictionary for O(1) lookup of results by ID
        id_to_result = {snippet.id: (file, snippet) for snippet, file in rows.all()}

        # Look for missing IDs directly instead of comparing lengths: a length
        # comparison reports a bogus failure when ``ids`` contains duplicates.
        missing_ids = [
            snippet_id for snippet_id in ids if snippet_id not in id_to_result
        ]
        if missing_ids:
            msg = f"Some IDs are not present: {missing_ids}"
            raise ValueError(msg)

        # Rebuild the list in the same order that it was passed in
        return [id_to_result[i] for i in ids]
|
|
@@ -1,338 +0,0 @@
|
|
|
1
|
-
"""Index service for managing code indexes.
|
|
2
|
-
|
|
3
|
-
This module provides the IndexService class which handles the business logic for
|
|
4
|
-
creating, listing, and running code indexes. It orchestrates the interaction between the
|
|
5
|
-
file system, database operations (via IndexRepository), and provides a clean API for
|
|
6
|
-
index management.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from datetime import datetime
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
|
|
12
|
-
import pydantic
|
|
13
|
-
import structlog
|
|
14
|
-
from tqdm.asyncio import tqdm
|
|
15
|
-
|
|
16
|
-
from kodit.bm25.keyword_search_service import (
|
|
17
|
-
BM25Document,
|
|
18
|
-
BM25Result,
|
|
19
|
-
KeywordSearchProvider,
|
|
20
|
-
)
|
|
21
|
-
from kodit.embedding.vector_search_service import (
|
|
22
|
-
VectorSearchRequest,
|
|
23
|
-
VectorSearchService,
|
|
24
|
-
)
|
|
25
|
-
from kodit.enrichment.enrichment_service import EnrichmentService
|
|
26
|
-
from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
|
|
27
|
-
from kodit.indexing.indexing_models import Snippet
|
|
28
|
-
from kodit.indexing.indexing_repository import IndexRepository
|
|
29
|
-
from kodit.log import log_event
|
|
30
|
-
from kodit.snippets.snippets import SnippetService
|
|
31
|
-
from kodit.source.source_service import SourceService
|
|
32
|
-
from kodit.util.spinner import Spinner
|
|
33
|
-
|
|
34
|
-
# List of MIME types that are blacklisted from being indexed
|
|
35
|
-
# IndexService._create_snippets skips any file whose mime_type is in this list.
MIME_BLACKLIST = ["unknown/unknown"]
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class IndexView(pydantic.BaseModel):
    """Read-only summary of an index as returned by the index service.

    Callers receive this DTO instead of the underlying database row.
    """

    # Primary key and timestamps copied from the index row.
    id: int
    created_at: datetime
    updated_at: datetime | None = None
    # URI of the source the index was built from.
    source: str | None = None
    # Number of snippets currently stored for the index.
    num_snippets: int
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class SearchRequest(pydantic.BaseModel):
    """Parameters of a search; every query channel is optional."""

    # Query for the semantic text index.
    text_query: str | None = None
    # Query for the semantic code index.
    code_query: str | None = None
    # Keywords, each looked up separately in the keyword index.
    keywords: list[str] | None = None
    # Maximum number of results to return.
    top_k: int = 10
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class SearchResult(pydantic.BaseModel):
    """One search hit: the matching snippet plus the URI of its file."""

    # ID of the matching snippet.
    id: int
    # URI of the file the snippet belongs to.
    uri: str
    # Stored snippet content.
    content: str
    # Scores carried over from the fused ranking entry for this snippet.
    original_scores: list[float]
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
class IndexService:
    """Service for managing code indexes.

    This service handles the business logic for creating, listing, running and
    searching code indexes. It coordinates the repository, the snippet
    extractor, the keyword and vector search back ends and the enrichment
    service.
    """

    def __init__(  # noqa: PLR0913
        self,
        repository: IndexRepository,
        source_service: SourceService,
        keyword_search_provider: KeywordSearchProvider,
        code_search_service: VectorSearchService,
        text_search_service: VectorSearchService,
        enrichment_service: EnrichmentService,
    ) -> None:
        """Initialize the index service.

        Args:
            repository: The repository instance to use for database operations.
            source_service: The source service used to look up sources.
            keyword_search_provider: Back end for the keyword index.
            code_search_service: Vector search over raw snippet content.
            text_search_service: Vector search over enriched snippet text.
            enrichment_service: Produces the enriched text for each snippet.

        """
        self.repository = repository
        self.source_service = source_service
        self.snippet_service = SnippetService()
        self.log = structlog.get_logger(__name__)
        self.keyword_search_provider = keyword_search_provider
        self.code_search_service = code_search_service
        self.text_search_service = text_search_service
        self.enrichment_service = enrichment_service

    async def create(self, source_id: int) -> IndexView:
        """Return the index for a source, creating it if none exists yet.

        Args:
            source_id: The ID of the source to create an index for.

        Returns:
            An IndexView describing the existing or newly created index.

        """
        log_event("kodit.index.create")

        # NOTE(review): behaviour for an unknown source depends on
        # SourceService.get, which is not visible here.
        source = await self.source_service.get(source_id)

        # Reuse the index if the source already has one
        index = await self.repository.get_by_source_id(source.id)
        if not index:
            index = await self.repository.create(source.id)
        return IndexView(
            id=index.id,
            created_at=index.created_at,
            num_snippets=await self.repository.num_snippets_for_index(index.id),
            source=source.uri,
        )

    async def list_indexes(self) -> list[IndexView]:
        """List all available indexes with their details.

        Returns:
            One IndexView per index, including its snippet count.

        """
        rows = await self.repository.list_indexes()

        # Transform database results into DTOs
        views = [
            IndexView(
                id=index.id,
                created_at=index.created_at,
                updated_at=index.updated_at,
                num_snippets=await self.repository.num_snippets_for_index(index.id)
                or 0,
                source=source.uri,
            )
            for index, source in rows
        ]

        # Help Kodit by measuring how much people are using indexes
        log_event(
            "kodit.index.list",
            {
                "num_indexes": len(views),
                "num_snippets": sum(view.num_snippets for view in views),
            },
        )

        return views

    async def run(self, index_id: int) -> None:
        """Run the indexing process for a specific index.

        Existing snippets are deleted and re-extracted, then the keyword index,
        the semantic code index and the semantic text index are rebuilt.
        Finally each snippet's stored content is replaced by its enriched text
        followed by the original content in a fenced block.

        Args:
            index_id: The ID of the index to run.

        Raises:
            ValueError: If no index with ``index_id`` exists.

        """
        log_event("kodit.index.run")

        # Get and validate index
        index = await self.repository.get_by_id(index_id)
        if not index:
            msg = f"Index not found: {index_id}"
            raise ValueError(msg)

        # Delete old snippets so we don't duplicate. In the future should probably check
        # which files have changed and only change those.
        await self.repository.delete_all_snippets(index.id)

        # Create snippets for supported file types
        self.log.info("Creating snippets for files", index_id=index.id)
        await self._create_snippets(index.id)

        snippets = await self.repository.get_all_snippets(index.id)

        self.log.info("Creating keyword index")
        with Spinner():
            await self.keyword_search_provider.index(
                [
                    BM25Document(snippet_id=snippet.id, text=snippet.content)
                    for snippet in snippets
                ]
            )

        self.log.info("Creating semantic code index")
        with Spinner():
            await self.code_search_service.index(
                [
                    VectorSearchRequest(snippet.id, snippet.content)
                    for snippet in snippets
                ]
            )

        self.log.info("Enriching snippets", num_snippets=len(snippets))
        enriched_contents = await self.enrichment_service.enrich(
            [snippet.content for snippet in snippets]
        )

        self.log.info("Creating semantic text index")
        with Spinner():
            await self.text_search_service.index(
                [
                    VectorSearchRequest(snippet.id, enriched_content)
                    for snippet, enriched_content in zip(
                        snippets, enriched_contents, strict=True
                    )
                ]
            )
            # Add the enriched text back to the snippets and write to the database
            for snippet, enriched_content in zip(
                snippets, enriched_contents, strict=True
            ):
                snippet.content = (
                    enriched_content + "\n\n```\n" + snippet.content + "\n```"
                )
                await self.repository.add_snippet(snippet)

        # Update index timestamp
        await self.repository.update_index_timestamp(index)

    async def search(self, request: SearchRequest) -> list[SearchResult]:
        """Search the keyword and semantic indexes and fuse the rankings.

        Args:
            request: Keywords, code query and/or text query, plus ``top_k``.

        Returns:
            Up to ``request.top_k`` results in fused-rank order, or an empty
            list when the request contains no query at all.

        """
        log_event("kodit.index.search")

        fusion_list: list[list[FusionRequest]] = []
        if request.keywords:
            # Gather results for each keyword into a single ranking
            result_ids: list[BM25Result] = []
            for keyword in request.keywords:
                results = await self.keyword_search_provider.retrieve(
                    keyword, request.top_k
                )
                result_ids.extend(results)

            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
            )

        # Semantic search over raw snippet content
        if request.code_query:
            code_hits = await self.code_search_service.retrieve(
                request.code_query, top_k=request.top_k
            )
            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in code_hits]
            )

        # Semantic search over enriched snippet text
        if request.text_query:
            text_hits = await self.text_search_service.retrieve(
                request.text_query, top_k=request.top_k
            )
            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in text_hits]
            )

        if not fusion_list:
            return []

        # Combine all results together with RRF (reciprocal rank fusion)
        final_results = reciprocal_rank_fusion(
            rankings=fusion_list,
            k=60,
        )

        # Only keep top_k results
        final_results = final_results[: request.top_k]

        # Get snippets from database (up to top_k)
        search_results = await self.repository.list_snippets_by_ids(
            [x.id for x in final_results]
        )

        return [
            SearchResult(
                id=snippet.id,
                uri=file.uri,
                content=snippet.content,
                original_scores=fr.original_scores,
            )
            for (file, snippet), fr in zip(search_results, final_results, strict=True)
        ]

    async def _create_snippets(
        self,
        index_id: int,
    ) -> None:
        """Extract and store snippets for every supported file of an index.

        Files whose MIME type is in ``MIME_BLACKLIST`` and files for which the
        snippet service raises ``ValueError`` are skipped.

        Args:
            index_id: The ID of the index to create snippets for.

        """
        files = await self.repository.files_for_index(index_id)
        if not files:
            self.log.warning("No files to create snippets for")
            return

        for file in tqdm(files, total=len(files), leave=False):
            # Skip unsupported file types
            if file.mime_type in MIME_BLACKLIST:
                self.log.debug("Skipping mime type", mime_type=file.mime_type)
                continue

            # Create snippet from file content
            try:
                snippets = self.snippet_service.snippets_for_file(
                    Path(file.cloned_path)
                )
            except ValueError as e:
                self.log.debug("Skipping file", file=file.cloned_path, error=e)
                continue

            for snippet in snippets:
                s = Snippet(
                    index_id=index_id,
                    file_id=file.id,
                    content=snippet.text,
                )
                await self.repository.add_snippet(s)
|
kodit/snippets/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Extract method snippets from source code."""
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
"""Detect the language of a file."""
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import cast
|
|
5
|
-
|
|
6
|
-
from tree_sitter_language_pack import SupportedLanguage
|
|
7
|
-
|
|
8
|
-
# Mapping of file extensions to programming languages
LANGUAGE_MAP: dict[str, str] = {
    # JavaScript/TypeScript
    "js": "javascript",
    "jsx": "javascript",
    "ts": "typescript",
    "tsx": "typescript",
    # Python
    "py": "python",
    # Rust
    "rs": "rust",
    # Go
    "go": "go",
    # C/C++
    "cpp": "cpp",
    "hpp": "cpp",
    "c": "c",
    "h": "c",
    # C#
    "cs": "csharp",
    # Ruby
    "rb": "ruby",
    # Java
    "java": "java",
    # PHP
    "php": "php",
    # Swift
    "swift": "swift",
    # Kotlin
    "kt": "kotlin",
}


def detect_language(file_path: Path) -> SupportedLanguage:
    """Detect the language of a file from its extension.

    Only the final suffix is considered and the comparison is
    case-insensitive.

    Args:
        file_path: Path whose suffix is looked up in ``LANGUAGE_MAP``.

    Returns:
        The language name mapped to the file's suffix.

    Raises:
        ValueError: If the suffix is missing or not in ``LANGUAGE_MAP``.

    """
    suffix = file_path.suffix.removeprefix(".").lower()
    lang = LANGUAGE_MAP.get(suffix)
    if lang is None:
        msg = f"Unsupported language for file suffix: {suffix}"
        raise ValueError(msg)

    # typing.cast is a runtime no-op that only informs the type checker; it
    # never raises, so no exception handling is needed around it.
    return cast("SupportedLanguage", lang)
|
kodit/snippets/snippets.py
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
"""Generate snippets from a file."""
|
|
2
|
-
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from kodit.snippets.languages import detect_language
|
|
7
|
-
from kodit.snippets.method_snippets import MethodSnippets
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
class Snippet:
    """Plain value object wrapping the text of one extracted code snippet."""

    # Source text of the snippet.
    text: str
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class SnippetService:
    """Factory for generating snippets from a file.

    This is required because there's going to be multiple ways to generate snippets.
    """

    def __init__(self) -> None:
        """Initialize the snippet factory.

        Query files are looked up in the ``languages`` directory next to this
        module.
        """
        self.language_dir = Path(__file__).parent / "languages"

    def snippets_for_file(self, file_path: Path) -> list[Snippet]:
        """Generate snippets from a file.

        Args:
            file_path: Path of the source file to extract snippets from.

        Returns:
            One Snippet per extracted method; snippets that are empty or
            whitespace-only are dropped.

        Raises:
            ValueError: If the language cannot be detected, no query file
                exists for the language, or the source file cannot be read.

        """
        language = detect_language(file_path)

        # Read the query as UTF-8 explicitly rather than with the
        # platform-default encoding, so behaviour is the same on every host.
        query_path = self.language_dir / f"{language}.scm"
        try:
            query = query_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            msg = f"Unsupported language: {file_path}"
            raise ValueError(msg) from e

        method_analyser = MethodSnippets(language, query)

        try:
            file_bytes = file_path.read_bytes()
        except OSError as e:
            msg = f"Failed to read file: {file_path}"
            raise ValueError(msg) from e

        # Remove any snippets that are empty
        return [
            Snippet(text=text)
            for text in method_analyser.extract(file_bytes)
            if text.strip()
        ]
|