kodit 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +363 -0
- kodit/application/services/snippet_application_service.py +143 -0
- kodit/cli.py +105 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +83 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +119 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +133 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +296 -0
- kodit/infrastructure/indexing/indexing_factory.py +111 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
- kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/repository.py +121 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +50 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
- kodit-0.2.5.dist-info/RECORD +99 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -69
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -92
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
- kodit/embedding/local_vector_search_service.py +0 -87
- kodit/embedding/vector_search_service.py +0 -55
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
- kodit/enrichment/enrichment_service.py +0 -45
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -344
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.4.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
kodit/indexing/fusion.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
"""Fusion functions for combining search results."""
|
|
2
|
-
|
|
3
|
-
from collections import defaultdict
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@dataclass
class FusionResult:
    """One fused search hit produced by combining several rankings."""

    # Identifier of the ranked item.
    id: int
    # Combined score assigned to the item by the fusion step.
    score: float
    # Scores the item carried in the individual input rankings.
    original_scores: list[float]
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@dataclass
class FusionRequest:
    """One entry of an input ranking that is handed to a fusion function."""

    # Identifier of the ranked item.
    id: int
    # Score the originating ranker gave to the item.
    score: float
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def reciprocal_rank_fusion(
    rankings: list[list[FusionRequest]], k: float = 60
) -> list[FusionResult]:
    """Fuse several rankings into one using reciprocal rank fusion (RRF).

    Each appearance of an item at zero-based position ``i`` of a ranking adds
    ``1 / (k + i)`` to the item's fused score, so items that sit near the top
    of many rankings end up first.

    Args:
        rankings: One list per ranker, each ordered best result first.
        k: Constant added to the position before taking the reciprocal.
            NOTE(review): positions start at 0, so ``k=0`` combined with a
            non-empty ranking divides by zero.

    Returns:
        One ``FusionResult`` per distinct id, ordered by fused score from
        highest to lowest (ties keep first-seen order). ``original_scores``
        holds the input scores of that id in the order they were encountered.

    """
    fused_scores: defaultdict[int, float] = defaultdict(float)
    input_scores: defaultdict[int, list[float]] = defaultdict(list)

    # A single pass collects both the fused score and the per-ranker scores.
    for ranking in rankings:
        for position, entry in enumerate(ranking):
            fused_scores[entry.id] += 1.0 / (k + position)
            input_scores[entry.id].append(entry.score)

    # sorted() is stable, so equal scores stay in first-seen order.
    ordered = sorted(fused_scores.items(), key=lambda pair: pair[1], reverse=True)

    return [
        FusionResult(
            id=item_id,
            score=fused_score,
            original_scores=input_scores[item_id],
        )
        for item_id, fused_score in ordered
    ]
|
|
kodit/indexing/indexing_models.py
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""Index models for managing code indexes.
|
|
2
|
-
|
|
3
|
-
This module defines the SQLAlchemy models used for storing and managing code indexes,
|
|
4
|
-
including files and snippets. It provides the data structures for tracking indexed
|
|
5
|
-
files and their content.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from sqlalchemy import ForeignKey, UnicodeText
|
|
9
|
-
from sqlalchemy.orm import Mapped, mapped_column
|
|
10
|
-
|
|
11
|
-
from kodit.database import Base, CommonMixin
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class Index(Base, CommonMixin):
    """ORM model for a code index; a source can have at most one index."""

    __tablename__ = "indexes"

    # Unique, indexed foreign key to ``sources.id``.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("sources.id"),
        unique=True,
        index=True,
    )

    def __init__(self, source_id: int) -> None:
        """Create an index row bound to the given source.

        Args:
            source_id: Primary key of the source this index belongs to.

        """
        super().__init__()
        self.source_id = source_id
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class Snippet(Base, CommonMixin):
    """ORM model for a piece of text extracted from an indexed file."""

    __tablename__ = "snippets"

    # Indexed foreign key to ``files.id``.
    file_id: Mapped[int] = mapped_column(
        ForeignKey("files.id"),
        index=True,
    )
    # Indexed foreign key to ``indexes.id``.
    index_id: Mapped[int] = mapped_column(
        ForeignKey("indexes.id"),
        index=True,
    )
    # Snippet text; the column default is the empty string.
    content: Mapped[str] = mapped_column(UnicodeText, default="")

    def __init__(self, file_id: int, index_id: int, content: str) -> None:
        """Create a snippet row.

        Args:
            file_id: Primary key of the file the snippet was taken from.
            index_id: Primary key of the index the snippet belongs to.
            content: The snippet text.

        """
        super().__init__()
        self.file_id = file_id
        self.index_id = index_id
        self.content = content
|
|
kodit/indexing/indexing_repository.py
DELETED
|
@@ -1,216 +0,0 @@
|
|
|
1
|
-
"""Repository for managing code indexes and their associated files and snippets.
|
|
2
|
-
|
|
3
|
-
This module provides the IndexRepository class which handles all database operations
|
|
4
|
-
related to code indexes, including creating indexes, managing files and snippets,
|
|
5
|
-
and retrieving index information with their associated metadata.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from datetime import UTC, datetime
|
|
9
|
-
from typing import TypeVar
|
|
10
|
-
|
|
11
|
-
from sqlalchemy import delete, func, select
|
|
12
|
-
from sqlalchemy.ext.asyncio import AsyncSession
|
|
13
|
-
|
|
14
|
-
from kodit.embedding.embedding_models import Embedding
|
|
15
|
-
from kodit.indexing.indexing_models import Index, Snippet
|
|
16
|
-
from kodit.source.source_models import File, Source
|
|
17
|
-
|
|
18
|
-
T = TypeVar("T")
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class IndexRepository:
    """Database access for indexes and their files, snippets and embeddings.

    All queries run on the async session supplied at construction time, and
    every mutating method commits that session before returning.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Store the session used for all database operations.

        Args:
            session: The SQLAlchemy async session to run queries on.

        """
        self.session = session

    async def create(self, source_id: int) -> Index:
        """Create and commit a new index for a source.

        Args:
            source_id: The ID of the source to create an index for.

        Returns:
            The newly created Index instance.

        """
        index = Index(source_id=source_id)
        self.session.add(index)
        await self.session.commit()
        return index

    async def get_by_id(self, index_id: int) -> Index | None:
        """Get an index by its ID.

        Args:
            index_id: The ID of the index to retrieve.

        Returns:
            The Index instance if found, None otherwise.

        """
        query = select(Index).where(Index.id == index_id)
        result = await self.session.execute(query)
        return result.scalar_one_or_none()

    async def get_by_source_id(self, source_id: int) -> Index | None:
        """Get the index that belongs to a source.

        Args:
            source_id: The ID of the source to retrieve an index for.

        Returns:
            The Index instance if found, None otherwise.

        """
        query = select(Index).where(Index.source_id == source_id)
        result = await self.session.execute(query)
        return result.scalar_one_or_none()

    async def files_for_index(self, index_id: int) -> list[File]:
        """Get all files of the source that an index belongs to.

        Args:
            index_id: The ID of the index to get files for.

        Returns:
            A list of File instances.

        """
        query = (
            select(File)
            .join(Source, File.source_id == Source.id)
            .join(Index, Index.source_id == Source.id)
            .where(Index.id == index_id)
        )
        result = await self.session.execute(query)
        return list(result.scalars())

    async def list_indexes(self) -> list[tuple[Index, Source]]:
        """List every index together with the source it belongs to.

        Returns:
            A list of ``(Index, Source)`` pairs.

        """
        # Inner join on purpose: a FULL OUTER JOIN also yields sources without
        # an index as ``(None, Source)`` rows, which contradicts the declared
        # return type, and it is rejected outright by SQLite older than 3.39.
        query = select(Index, Source).join(Source, Index.source_id == Source.id)
        result = await self.session.execute(query)
        return list(result.tuples())

    async def num_snippets_for_index(self, index_id: int) -> int:
        """Get the number of snippets for an index.

        Args:
            index_id: The ID of the index to count snippets for.

        Returns:
            The number of snippet rows that reference the index.

        """
        query = (
            select(func.count())
            .select_from(Snippet)
            .where(Snippet.index_id == index_id)
        )
        result = await self.session.execute(query)
        return result.scalar_one()

    async def update_index_timestamp(self, index: Index) -> None:
        """Set ``updated_at`` of an index to the current UTC time and commit.

        Args:
            index: The Index instance to update.

        """
        index.updated_at = datetime.now(UTC)
        await self.session.commit()

    async def add_snippet(self, snippet: Snippet) -> None:
        """Add a snippet to the session and commit.

        Passing a snippet that is already persistent writes its pending
        changes, so this doubles as "update".

        Args:
            snippet: The Snippet instance to add or update.

        """
        self.session.add(snippet)
        await self.session.commit()

    async def delete_all_snippets(self, index_id: int) -> None:
        """Delete all snippets of an index, and their embeddings, then commit.

        Args:
            index_id: The ID of the index to delete snippets for.

        """
        # Embeddings reference snippets, so they go first. A subquery keeps
        # this to one statement instead of loading every snippet and issuing
        # one DELETE per snippet.
        snippet_ids = select(Snippet.id).where(Snippet.index_id == index_id)
        await self.session.execute(
            delete(Embedding).where(Embedding.snippet_id.in_(snippet_ids))
        )

        await self.session.execute(
            delete(Snippet).where(Snippet.index_id == index_id)
        )
        await self.session.commit()

    async def get_snippets_for_index(self, index_id: int) -> list[Snippet]:
        """Get all snippets for an index, in no particular order.

        Args:
            index_id: The ID of the index to get snippets for.

        Returns:
            A list of Snippet instances.

        """
        query = select(Snippet).where(Snippet.index_id == index_id)
        result = await self.session.execute(query)
        return list(result.scalars())

    async def get_all_snippets(self, index_id: int) -> list[Snippet]:
        """Get all snippets for an index, ordered by snippet ID.

        Args:
            index_id: The ID of the index to get snippets for.

        Returns:
            A list of Snippet instances sorted by ascending ID.

        """
        query = (
            select(Snippet)
            .where(Snippet.index_id == index_id)
            .order_by(Snippet.id)
        )
        result = await self.session.execute(query)
        return list(result.scalars())

    async def add_embedding(self, embedding: Embedding) -> None:
        """Add a new embedding to the database and commit.

        Args:
            embedding: The Embedding instance to add.

        """
        self.session.add(embedding)
        await self.session.commit()

    async def list_snippets_by_ids(self, ids: list[int]) -> list[tuple[File, Snippet]]:
        """List snippets, with their files, by snippet ID.

        Args:
            ids: Snippet IDs to look up. Duplicates are allowed and produce
                duplicate entries in the result.

        Returns:
            ``(File, Snippet)`` pairs in the same order as the input IDs.

        Raises:
            ValueError: If any requested ID does not exist.

        """
        if not ids:
            return []

        query = (
            select(Snippet, File)
            .where(Snippet.id.in_(ids))
            .join(File, Snippet.file_id == File.id)
        )
        rows = await self.session.execute(query)

        # Dictionary for O(1) lookup of results by snippet ID.
        found = {snippet.id: (file, snippet) for snippet, file in rows.all()}

        # Check membership rather than comparing lengths: a length comparison
        # wrongly reports an error when the same ID is requested twice.
        missing_ids = [snippet_id for snippet_id in ids if snippet_id not in found]
        if missing_ids:
            msg = f"Some IDs are not present: {missing_ids}"
            raise ValueError(msg)

        # Rebuild the list in the same order that it was passed in.
        return [found[snippet_id] for snippet_id in ids]
|
|
kodit/indexing/indexing_service.py
DELETED
|
@@ -1,344 +0,0 @@
|
|
|
1
|
-
"""Index service for managing code indexes.
|
|
2
|
-
|
|
3
|
-
This module provides the IndexService class which handles the business logic for
|
|
4
|
-
creating, listing, and running code indexes. It orchestrates the interaction between the
|
|
5
|
-
file system, database operations (via IndexRepository), and provides a clean API for
|
|
6
|
-
index management.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from datetime import datetime
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
|
|
12
|
-
import pydantic
|
|
13
|
-
import structlog
|
|
14
|
-
from tqdm.asyncio import tqdm
|
|
15
|
-
|
|
16
|
-
from kodit.bm25.keyword_search_service import (
|
|
17
|
-
BM25Document,
|
|
18
|
-
BM25Result,
|
|
19
|
-
KeywordSearchProvider,
|
|
20
|
-
)
|
|
21
|
-
from kodit.embedding.vector_search_service import (
|
|
22
|
-
VectorSearchRequest,
|
|
23
|
-
VectorSearchService,
|
|
24
|
-
)
|
|
25
|
-
from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentRequest
|
|
26
|
-
from kodit.enrichment.enrichment_service import EnrichmentService
|
|
27
|
-
from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
|
|
28
|
-
from kodit.indexing.indexing_models import Snippet
|
|
29
|
-
from kodit.indexing.indexing_repository import IndexRepository
|
|
30
|
-
from kodit.log import log_event
|
|
31
|
-
from kodit.snippets.snippets import SnippetService
|
|
32
|
-
from kodit.source.source_service import SourceService
|
|
33
|
-
from kodit.util.spinner import Spinner
|
|
34
|
-
|
|
35
|
-
# List of MIME types that are blacklisted from being indexed
|
|
36
|
-
MIME_BLACKLIST = ["unknown/unknown"]
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class IndexView(pydantic.BaseModel):
    """Summary of an index as returned by the index service.

    Callers receive this view instead of the underlying database model.
    """

    # Primary key of the index.
    id: int
    # Creation timestamp of the index.
    created_at: datetime
    # Last-update timestamp; None when not supplied.
    updated_at: datetime | None = None
    # URI of the indexed source; None when not supplied.
    source: str | None = None
    # Number of snippets stored for the index.
    num_snippets: int
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
class SearchRequest(pydantic.BaseModel):
    """Parameters of a search; every query field is optional."""

    # Query sent to the text vector search service.
    text_query: str | None = None
    # Query sent to the code vector search service.
    code_query: str | None = None
    # Keywords sent, one at a time, to the keyword search provider.
    keywords: list[str] | None = None
    # Maximum number of results to request and to return.
    top_k: int = 10
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
class SearchResult(pydantic.BaseModel):
    """A single search hit: the matching snippet plus where it came from."""

    # ID of the matching snippet.
    id: int
    # URI of the file that contains the snippet.
    uri: str
    # Content of the snippet.
    content: str
    # Scores the snippet received from the individual search backends.
    original_scores: list[float]
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
class IndexService:
    """Service for creating, listing, building and searching code indexes.

    It coordinates the index repository, the source service, snippet
    extraction, the keyword and vector search backends and the enrichment
    service.
    """

    def __init__(  # noqa: PLR0913
        self,
        repository: IndexRepository,
        source_service: SourceService,
        keyword_search_provider: KeywordSearchProvider,
        code_search_service: VectorSearchService,
        text_search_service: VectorSearchService,
        enrichment_service: EnrichmentService,
    ) -> None:
        """Initialize the index service.

        Args:
            repository: The repository instance to use for database operations.
            source_service: The source service used to look up sources.
            keyword_search_provider: Backend for keyword indexing and retrieval.
            code_search_service: Vector search over snippet content as
                extracted from the files.
            text_search_service: Vector search over the enriched snippet
                content.
            enrichment_service: Service producing the text that is prepended
                to each snippet during a run.

        """
        self.repository = repository
        self.source_service = source_service
        self.snippet_service = SnippetService()
        self.log = structlog.get_logger(__name__)
        self.keyword_search_provider = keyword_search_provider
        self.code_search_service = code_search_service
        self.text_search_service = text_search_service
        self.enrichment_service = enrichment_service

    async def create(self, source_id: int) -> IndexView:
        """Return the index for a source, creating it if it does not exist yet.

        Args:
            source_id: The ID of the source to create an index for.

        Returns:
            A view of the existing or newly created index.

        Raises:
            Whatever ``source_service.get`` raises for an unknown source
            (presumably ``ValueError`` - confirm against the source service).

        """
        log_event("kodit.index.create")

        # Looking the source up first also validates that it exists.
        source = await self.source_service.get(source_id)

        # Reuse the index if this source already has one.
        index = await self.repository.get_by_source_id(source.id)
        if not index:
            index = await self.repository.create(source.id)

        return IndexView(
            id=index.id,
            created_at=index.created_at,
            num_snippets=await self.repository.num_snippets_for_index(index.id),
            source=source.uri,
        )

    async def list_indexes(self) -> list[IndexView]:
        """List all available indexes with their details.

        Returns:
            One view per index, including its snippet count.

        """
        rows = await self.repository.list_indexes()

        # Transform database results into DTOs.
        views: list[IndexView] = []
        for index, source in rows:
            if index is None:
                # The repository joins indexes and sources; a source without
                # an index has nothing to report here.
                continue
            num_snippets = await self.repository.num_snippets_for_index(index.id)
            views.append(
                IndexView(
                    id=index.id,
                    created_at=index.created_at,
                    updated_at=index.updated_at,
                    num_snippets=num_snippets or 0,
                    source=source.uri if source is not None else None,
                )
            )

        # Help Kodit by measuring how much people are using indexes
        log_event(
            "kodit.index.list",
            {
                "num_indexes": len(views),
                "num_snippets": sum(view.num_snippets for view in views),
            },
        )

        return views

    async def run(self, index_id: int) -> None:
        """Run the indexing process for a specific index.

        Existing snippets are deleted and recreated from the index's files,
        then the keyword index, the code vector index, the enrichment step and
        the text vector index run in that order. Finally the index timestamp
        is updated.

        Args:
            index_id: The ID of the index to (re)build.

        Raises:
            ValueError: If no index with the given ID exists.

        """
        log_event("kodit.index.run")

        # Get and validate index
        index = await self.repository.get_by_id(index_id)
        if not index:
            msg = f"Index not found: {index_id}"
            raise ValueError(msg)

        # Delete old snippets so we don't duplicate. In the future should probably
        # check which files have changed and only change those.
        await self.repository.delete_all_snippets(index.id)

        # Create snippets for supported file types
        self.log.info("Creating snippets for files", index_id=index.id)
        await self._create_snippets(index.id)

        snippets = await self.repository.get_all_snippets(index.id)

        self.log.info("Creating keyword index")
        with Spinner():
            await self.keyword_search_provider.index(
                [
                    BM25Document(snippet_id=snippet.id, text=snippet.content)
                    for snippet in snippets
                ]
            )

        # The code index is built before enrichment rewrites snippet.content,
        # so it sees the content exactly as extracted.
        self.log.info("Creating semantic code index")
        await self._index_vectors(self.code_search_service, snippets)

        self.log.info("Enriching snippets", num_snippets=len(snippets))
        # Build the lookup once instead of scanning the list for every
        # enrichment result.
        snippets_by_id = {snippet.id: snippet for snippet in snippets}
        with tqdm(total=len(snippets), leave=False) as pbar:
            async for result in self.enrichment_service.enrich(
                [
                    EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
                    for snippet in snippets
                ]
            ):
                # Results that reference an unknown snippet are skipped.
                snippet = snippets_by_id.get(result.snippet_id)
                if snippet is not None:
                    snippet.content = (
                        result.text + "\n\n```\n" + snippet.content + "\n```"
                    )
                    await self.repository.add_snippet(snippet)
                pbar.update(1)

        # The text index is built afterwards, so it sees the enriched content.
        self.log.info("Creating semantic text index")
        await self._index_vectors(self.text_search_service, snippets)

        # Update index timestamp
        await self.repository.update_index_timestamp(index)

    async def search(self, request: SearchRequest) -> list[SearchResult]:
        """Search for relevant snippets.

        Each query kind present in the request (keywords, code query, text
        query) contributes one ranking. The rankings are fused with reciprocal
        rank fusion and the best ``top_k`` snippets are returned.

        Args:
            request: The search parameters.

        Returns:
            Up to ``request.top_k`` results, best first. The list is empty if
            the request contains no query at all.

        """
        log_event("kodit.index.search")

        fusion_list: list[list[FusionRequest]] = []

        if request.keywords:
            # Gather results for each keyword.
            # NOTE(review): all keywords share one ranking, so hits for later
            # keywords are ranked below every hit for earlier ones - confirm
            # that this is intended.
            keyword_hits: list[BM25Result] = []
            for keyword in request.keywords:
                keyword_hits.extend(
                    await self.keyword_search_provider.retrieve(
                        keyword, request.top_k
                    )
                )
            fusion_list.append(
                [FusionRequest(id=hit.snippet_id, score=hit.score) for hit in keyword_hits]
            )

        if request.code_query:
            code_hits = await self.code_search_service.retrieve(
                request.code_query, top_k=request.top_k
            )
            fusion_list.append(
                [FusionRequest(id=hit.snippet_id, score=hit.score) for hit in code_hits]
            )

        if request.text_query:
            text_hits = await self.text_search_service.retrieve(
                request.text_query, top_k=request.top_k
            )
            fusion_list.append(
                [FusionRequest(id=hit.snippet_id, score=hit.score) for hit in text_hits]
            )

        if not fusion_list:
            return []

        # Combine all results together with RRF
        final_results = reciprocal_rank_fusion(
            rankings=fusion_list,
            k=60,
        )

        # Only keep top_k results
        final_results = final_results[: request.top_k]

        # Get snippets from database (up to top_k)
        search_results = await self.repository.list_snippets_by_ids(
            [fused.id for fused in final_results]
        )

        return [
            SearchResult(
                id=snippet.id,
                uri=file.uri,
                content=snippet.content,
                original_scores=fused.original_scores,
            )
            for (file, snippet), fused in zip(
                search_results, final_results, strict=True
            )
        ]

    async def _index_vectors(
        self, search_service: VectorSearchService, snippets: list[Snippet]
    ) -> None:
        """Feed the snippets to a vector search service with a progress bar."""
        requests = [
            VectorSearchRequest(snippet.id, snippet.content) for snippet in snippets
        ]
        with tqdm(total=len(snippets), leave=False) as pbar:
            async for batch in search_service.index(requests):
                pbar.update(len(batch))

    async def _create_snippets(
        self,
        index_id: int,
    ) -> None:
        """Create snippets for all supported files of an index.

        Files whose MIME type is blacklisted, or for which snippet extraction
        raises ``ValueError``, are skipped.

        Args:
            index_id: The ID of the index to create snippets for.

        """
        files = await self.repository.files_for_index(index_id)
        if not files:
            self.log.warning("No files to create snippets for")
            return

        for file in tqdm(files, total=len(files), leave=False):
            # Skip unsupported file types
            if file.mime_type in MIME_BLACKLIST:
                self.log.debug("Skipping mime type", mime_type=file.mime_type)
                continue

            # Create snippets from file content
            try:
                extracted = self.snippet_service.snippets_for_file(
                    Path(file.cloned_path)
                )
            except ValueError as e:
                self.log.debug("Skipping file", file=file.cloned_path, error=e)
                continue

            for item in extracted:
                await self.repository.add_snippet(
                    Snippet(
                        index_id=index_id,
                        file_id=file.id,
                        content=item.text,
                    )
                )
|
kodit/snippets/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Extract method snippets from source code."""
|