kodit 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/retreival/repository.py +99 -51
- {kodit-0.1.10.dist-info → kodit-0.1.11.dist-info}/METADATA +1 -1
- {kodit-0.1.10.dist-info → kodit-0.1.11.dist-info}/RECORD +7 -7
- {kodit-0.1.10.dist-info → kodit-0.1.11.dist-info}/WHEEL +0 -0
- {kodit-0.1.10.dist-info → kodit-0.1.11.dist-info}/entry_points.txt +0 -0
- {kodit-0.1.10.dist-info → kodit-0.1.11.dist-info}/licenses/LICENSE +0 -0
kodit/_version.py
CHANGED
kodit/retreival/repository.py
CHANGED
|
@@ -5,21 +5,14 @@ related to searching and retrieving code snippets, including string-based search
|
|
|
5
5
|
and their associated file information.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
import
|
|
9
|
-
from typing import Any, TypeVar
|
|
8
|
+
from typing import TypeVar
|
|
10
9
|
|
|
10
|
+
import numpy as np
|
|
11
11
|
import pydantic
|
|
12
12
|
from sqlalchemy import (
|
|
13
|
-
ColumnElement,
|
|
14
|
-
Float,
|
|
15
|
-
cast,
|
|
16
|
-
desc,
|
|
17
|
-
func,
|
|
18
|
-
literal,
|
|
19
13
|
select,
|
|
20
14
|
)
|
|
21
15
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
22
|
-
from sqlalchemy.orm import Mapped
|
|
23
16
|
|
|
24
17
|
from kodit.embedding.models import Embedding, EmbeddingType
|
|
25
18
|
from kodit.indexing.models import Snippet
|
|
@@ -129,55 +122,110 @@ class RetrievalRepository:
|
|
|
129
122
|
# Return results in the same order as input IDs
|
|
130
123
|
return [id_to_result[i] for i in ids]
|
|
131
124
|
|
|
132
|
-
async def
|
|
133
|
-
self, embedding_type: EmbeddingType
|
|
134
|
-
) -> list[tuple[int, float]]:
|
|
135
|
-
"""
|
|
136
|
-
cosine_similarity = cosine_similarity_json(Embedding.embedding, embedding)
|
|
125
|
+
async def fetch_embeddings(
|
|
126
|
+
self, embedding_type: EmbeddingType
|
|
127
|
+
) -> list[tuple[int, list[float]]]:
|
|
128
|
+
"""Fetch all embeddings of a given type from the database.
|
|
137
129
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
130
|
+
Args:
|
|
131
|
+
embedding_type: The type of embeddings to fetch
|
|
132
|
+
|
|
133
|
+
Returns:
|
|
134
|
+
List of (snippet_id, embedding) tuples
|
|
135
|
+
|
|
136
|
+
"""
|
|
137
|
+
# Only select the fields we need and use a more efficient query
|
|
138
|
+
query = select(Embedding.snippet_id, Embedding.embedding).where(
|
|
139
|
+
Embedding.type == embedding_type
|
|
143
140
|
)
|
|
144
141
|
rows = await self.session.execute(query)
|
|
145
|
-
return [(
|
|
142
|
+
return [tuple(row) for row in rows.all()] # Convert Row objects to tuples
|
|
143
|
+
|
|
144
|
+
def prepare_vectors(
|
|
145
|
+
self, embeddings: list[tuple[int, list[float]]], query_embedding: list[float]
|
|
146
|
+
) -> tuple[np.ndarray, np.ndarray]:
|
|
147
|
+
"""Convert embeddings to numpy arrays.
|
|
146
148
|
|
|
149
|
+
Args:
|
|
150
|
+
embeddings: List of (snippet_id, embedding) tuples
|
|
151
|
+
query_embedding: Query embedding vector
|
|
147
152
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
) -> ColumnElement[Any]:
|
|
151
|
-
"""Calculate the cosine similarity using pure sqlalchemy.
|
|
153
|
+
Returns:
|
|
154
|
+
Tuple of (stored_vectors, query_vector) as numpy arrays
|
|
152
155
|
|
|
153
|
-
|
|
154
|
-
|
|
156
|
+
"""
|
|
157
|
+
stored_vecs = np.array(
|
|
158
|
+
[emb[1] for emb in embeddings]
|
|
159
|
+
) # Use index 1 to get embedding
|
|
160
|
+
query_vec = np.array(query_embedding)
|
|
161
|
+
return stored_vecs, query_vec
|
|
155
162
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
163
|
+
def compute_similarities(
|
|
164
|
+
self, stored_vecs: np.ndarray, query_vec: np.ndarray
|
|
165
|
+
) -> np.ndarray:
|
|
166
|
+
"""Compute cosine similarities between stored vectors and query vector.
|
|
159
167
|
|
|
160
|
-
|
|
161
|
-
|
|
168
|
+
Args:
|
|
169
|
+
stored_vecs: Array of stored embedding vectors
|
|
170
|
+
query_vec: Query embedding vector
|
|
162
171
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
172
|
+
Returns:
|
|
173
|
+
Array of similarity scores
|
|
174
|
+
|
|
175
|
+
"""
|
|
176
|
+
stored_norms = np.linalg.norm(stored_vecs, axis=1)
|
|
177
|
+
query_norm = np.linalg.norm(query_vec)
|
|
178
|
+
return np.dot(stored_vecs, query_vec) / (stored_norms * query_norm)
|
|
179
|
+
|
|
180
|
+
def get_top_k_results(
|
|
181
|
+
self,
|
|
182
|
+
similarities: np.ndarray,
|
|
183
|
+
embeddings: list[tuple[int, list[float]]],
|
|
184
|
+
top_k: int,
|
|
185
|
+
) -> list[tuple[int, float]]:
|
|
186
|
+
"""Get top-k results by similarity score.
|
|
187
|
+
|
|
188
|
+
Args:
|
|
189
|
+
similarities: Array of similarity scores
|
|
190
|
+
embeddings: List of (snippet_id, embedding) tuples
|
|
191
|
+
top_k: Number of results to return
|
|
192
|
+
|
|
193
|
+
Returns:
|
|
194
|
+
List of (snippet_id, similarity_score) tuples
|
|
195
|
+
|
|
196
|
+
"""
|
|
197
|
+
top_indices = np.argsort(similarities)[::-1][:top_k]
|
|
198
|
+
return [
|
|
199
|
+
(embeddings[i][0], float(similarities[i])) for i in top_indices
|
|
200
|
+
] # Use index 0 to get snippet_id
|
|
201
|
+
|
|
202
|
+
async def list_semantic_results(
|
|
203
|
+
self, embedding_type: EmbeddingType, embedding: list[float], top_k: int = 10
|
|
204
|
+
) -> list[tuple[int, float]]:
|
|
205
|
+
"""List semantic results using cosine similarity.
|
|
206
|
+
|
|
207
|
+
This implementation fetches all embeddings of the given type and computes
|
|
208
|
+
cosine similarity in Python using NumPy for better performance.
|
|
209
|
+
|
|
210
|
+
Args:
|
|
211
|
+
embedding_type: The type of embeddings to search
|
|
212
|
+
embedding: The query embedding vector
|
|
213
|
+
top_k: Number of results to return
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
List of (snippet_id, similarity_score) tuples, sorted by similarity
|
|
217
|
+
|
|
218
|
+
"""
|
|
219
|
+
# Step 1: Fetch embeddings from database
|
|
220
|
+
embeddings = await self.fetch_embeddings(embedding_type)
|
|
221
|
+
if not embeddings:
|
|
222
|
+
return []
|
|
223
|
+
|
|
224
|
+
# Step 2: Convert to numpy arrays
|
|
225
|
+
stored_vecs, query_vec = self.prepare_vectors(embeddings, embedding)
|
|
226
|
+
|
|
227
|
+
# Step 3: Compute similarities
|
|
228
|
+
similarities = self.compute_similarities(stored_vecs, query_vec)
|
|
181
229
|
|
|
182
|
-
|
|
183
|
-
|
|
230
|
+
# Step 4: Get top-k results
|
|
231
|
+
return self.get_top_k_results(similarities, embeddings, top_k)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
|
|
2
2
|
kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
|
|
3
|
-
kodit/_version.py,sha256=
|
|
3
|
+
kodit/_version.py,sha256=xfwL5IZGNNwnNDAQtGFjpvlNxqYn3U9IM9B98Du9pJw,513
|
|
4
4
|
kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
|
|
5
5
|
kodit/cli.py,sha256=qEQy_Sd64cEV5KzYsKlGLyMxFQ4fFi-as4QO8CRrKYo,8978
|
|
6
6
|
kodit/config.py,sha256=hQshTMW_8jpk94zP-1JaxowgmW_LrT534ipHFaRUGMw,3006
|
|
@@ -25,7 +25,7 @@ kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQ
|
|
|
25
25
|
kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
|
|
26
26
|
kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
|
|
27
27
|
kodit/retreival/__init__.py,sha256=33PhJU-3gtsqYq6A1UkaLNKbev_Zee9Lq6dYC59-CsA,69
|
|
28
|
-
kodit/retreival/repository.py,sha256=
|
|
28
|
+
kodit/retreival/repository.py,sha256=XHkkeUsnXSrrcthJOL9FXgivn5kkaPnC9Qci6ebwjZc,7294
|
|
29
29
|
kodit/retreival/service.py,sha256=gGp74jnqhyCDF5vKOrN2dJKDnhlfR4HZaxADSrjTb4s,3778
|
|
30
30
|
kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
|
|
31
31
|
kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
|
|
@@ -37,8 +37,8 @@ kodit/sources/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
|
|
|
37
37
|
kodit/sources/models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
|
|
38
38
|
kodit/sources/repository.py,sha256=mGJrHWH6Uo8YABdoojHFbzaf_jW-2ywJpAHIa1gnc3U,3401
|
|
39
39
|
kodit/sources/service.py,sha256=aV_qiqkU2kMBNPvye5_v4NnZiK-lJ64rQdmFtBtsQaY,9243
|
|
40
|
-
kodit-0.1.
|
|
41
|
-
kodit-0.1.
|
|
42
|
-
kodit-0.1.
|
|
43
|
-
kodit-0.1.
|
|
44
|
-
kodit-0.1.
|
|
40
|
+
kodit-0.1.11.dist-info/METADATA,sha256=yUO645VYUiVrJMRtwNB71O-6qvC94nS7_ILQ8eQEvoY,2288
|
|
41
|
+
kodit-0.1.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
42
|
+
kodit-0.1.11.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
|
|
43
|
+
kodit-0.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
44
|
+
kodit-0.1.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|