kodit-0.1.10-py3-none-any.whl → kodit-0.1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kodit might be problematic.

kodit/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.1.10'
-__version_tuple__ = version_tuple = (0, 1, 10)
+__version__ = version = '0.1.11'
+__version_tuple__ = version_tuple = (0, 1, 11)
kodit/retreival/repository.py CHANGED
@@ -5,21 +5,14 @@ related to searching and retrieving code snippets, including string-based search
 and their associated file information.
 """
 
-import math
-from typing import Any, TypeVar
+from typing import TypeVar
 
+import numpy as np
 import pydantic
 from sqlalchemy import (
-    ColumnElement,
-    Float,
-    cast,
-    desc,
-    func,
-    literal,
     select,
 )
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import Mapped
 
 from kodit.embedding.models import Embedding, EmbeddingType
 from kodit.indexing.models import Snippet
@@ -129,55 +122,110 @@ class RetrievalRepository:
         # Return results in the same order as input IDs
         return [id_to_result[i] for i in ids]
 
-    async def list_semantic_results(
-        self, embedding_type: EmbeddingType, embedding: list[float], top_k: int = 10
-    ) -> list[tuple[int, float]]:
-        """List semantic results."""
-        cosine_similarity = cosine_similarity_json(Embedding.embedding, embedding)
+    async def fetch_embeddings(
+        self, embedding_type: EmbeddingType
+    ) -> list[tuple[int, list[float]]]:
+        """Fetch all embeddings of a given type from the database.
 
-        query = (
-            select(Embedding, cosine_similarity)
-            .where(Embedding.type == embedding_type)
-            .order_by(desc(cosine_similarity))
-            .limit(top_k)
+        Args:
+            embedding_type: The type of embeddings to fetch
+
+        Returns:
+            List of (snippet_id, embedding) tuples
+
+        """
+        # Only select the fields we need and use a more efficient query
+        query = select(Embedding.snippet_id, Embedding.embedding).where(
+            Embedding.type == embedding_type
         )
         rows = await self.session.execute(query)
-        return [(embedding.snippet_id, distance) for embedding, distance in rows.all()]
+        return [tuple(row) for row in rows.all()]  # Convert Row objects to tuples
+
+    def prepare_vectors(
+        self, embeddings: list[tuple[int, list[float]]], query_embedding: list[float]
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Convert embeddings to numpy arrays.
 
+        Args:
+            embeddings: List of (snippet_id, embedding) tuples
+            query_embedding: Query embedding vector
 
-def cosine_similarity_json(
-    col: Mapped[Any], query_vec: list[float]
-) -> ColumnElement[Any]:
-    """Calculate the cosine similarity using pure sqlalchemy.
+        Returns:
+            Tuple of (stored_vectors, query_vector) as numpy arrays
 
-    Works for a *fixed-length* vector stored as a JSON array in SQLite.
-    The calculation is done entirely in SQL using SQLite's JSON functions.
+        """
+        stored_vecs = np.array(
+            [emb[1] for emb in embeddings]
+        )  # Use index 1 to get embedding
+        query_vec = np.array(query_embedding)
+        return stored_vecs, query_vec
 
-    Args:
-        col: The column containing the JSON array of floats
-        query_vec: The query vector to compare against
+    def compute_similarities(
+        self, stored_vecs: np.ndarray, query_vec: np.ndarray
+    ) -> np.ndarray:
+        """Compute cosine similarities between stored vectors and query vector.
 
-    Returns:
-        A SQLAlchemy expression that computes the cosine similarity
+        Args:
+            stored_vecs: Array of stored embedding vectors
+            query_vec: Query embedding vector
 
-    """
-    # Pre-compute query norm
-    q_norm = math.sqrt(sum(x * x for x in query_vec))
-
-    # Calculate dot product using JSON array functions
-    dot = sum(
-        cast(func.json_extract(col, f"$[{i}]"), Float) * literal(float(q))
-        for i, q in enumerate(query_vec)
-    )
-
-    # Calculate row norm on the fly
-    row_norm = func.sqrt(
-        sum(
-            cast(func.json_extract(col, f"$[{i}]"), Float)
-            * cast(func.json_extract(col, f"$[{i}]"), Float)
-            for i in range(len(query_vec))
-        )
-    )
+        Returns:
+            Array of similarity scores
+
+        """
+        stored_norms = np.linalg.norm(stored_vecs, axis=1)
+        query_norm = np.linalg.norm(query_vec)
+        return np.dot(stored_vecs, query_vec) / (stored_norms * query_norm)
+
+    def get_top_k_results(
+        self,
+        similarities: np.ndarray,
+        embeddings: list[tuple[int, list[float]]],
+        top_k: int,
+    ) -> list[tuple[int, float]]:
+        """Get top-k results by similarity score.
+
+        Args:
+            similarities: Array of similarity scores
+            embeddings: List of (snippet_id, embedding) tuples
+            top_k: Number of results to return
+
+        Returns:
+            List of (snippet_id, similarity_score) tuples
+
+        """
+        top_indices = np.argsort(similarities)[::-1][:top_k]
+        return [
+            (embeddings[i][0], float(similarities[i])) for i in top_indices
+        ]  # Use index 0 to get snippet_id
+
+    async def list_semantic_results(
+        self, embedding_type: EmbeddingType, embedding: list[float], top_k: int = 10
+    ) -> list[tuple[int, float]]:
+        """List semantic results using cosine similarity.
+
+        This implementation fetches all embeddings of the given type and computes
+        cosine similarity in Python using NumPy for better performance.
+
+        Args:
+            embedding_type: The type of embeddings to search
+            embedding: The query embedding vector
+            top_k: Number of results to return
+
+        Returns:
+            List of (snippet_id, similarity_score) tuples, sorted by similarity
+
+        """
+        # Step 1: Fetch embeddings from database
+        embeddings = await self.fetch_embeddings(embedding_type)
+        if not embeddings:
+            return []
+
+        # Step 2: Convert to numpy arrays
+        stored_vecs, query_vec = self.prepare_vectors(embeddings, embedding)
+
+        # Step 3: Compute similarities
+        similarities = self.compute_similarities(stored_vecs, query_vec)
 
-    # Calculate cosine similarity
-    return (dot / (row_norm * literal(q_norm))).label("cosine_similarity")
+        # Step 4: Get top-k results
+        return self.get_top_k_results(similarities, embeddings, top_k)
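
The substantive change in this release swaps the SQL-side cosine similarity (an expression built per vector component with SQLite's JSON functions) for a brute-force NumPy ranking computed in Python. Below is a minimal standalone sketch of that ranking, using hypothetical sample data rather than the package's database models:

    import numpy as np

    def rank_by_cosine(
        embeddings: list[tuple[int, list[float]]],
        query: list[float],
        top_k: int = 10,
    ) -> list[tuple[int, float]]:
        """Brute-force cosine ranking, mirroring the new list_semantic_results flow."""
        ids = [snippet_id for snippet_id, _ in embeddings]
        stored = np.array([vec for _, vec in embeddings])  # shape: (n, dim)
        q = np.array(query)
        # cosine(a, b) = (a . b) / (|a| * |b|), computed for all rows at once
        sims = stored @ q / (np.linalg.norm(stored, axis=1) * np.linalg.norm(q))
        top = np.argsort(sims)[::-1][:top_k]  # indices of highest similarities first
        return [(ids[i], float(sims[i])) for i in top]

    # Hypothetical data: three 3-dimensional embeddings keyed by snippet id.
    data = [(1, [1.0, 0.0, 0.0]), (2, [0.7, 0.7, 0.0]), (3, [0.0, 1.0, 0.0])]
    print(rank_by_cosine(data, [1.0, 0.0, 0.0], top_k=2))  # [(1, 1.0), (2, 0.707...)]

One tradeoff worth noting: the new approach loads every embedding of the requested type into memory on each query, trading database-side filtering for vectorized math; for large corpora, np.argpartition would give the top-k without a full sort.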
kodit-0.1.10.dist-info/METADATA → kodit-0.1.11.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kodit
-Version: 0.1.10
+Version: 0.1.11
 Summary: Code indexing for better AI code generation
 Project-URL: Homepage, https://docs.helixml.tech/kodit/
 Project-URL: Documentation, https://docs.helixml.tech/kodit/
kodit-0.1.10.dist-info/RECORD → kodit-0.1.11.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
 kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
 kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
-kodit/_version.py,sha256=HsFzfK6RsoG-sFr1kLh3t-q2dq2wNylFvX6VW_rx5vM,513
+kodit/_version.py,sha256=xfwL5IZGNNwnNDAQtGFjpvlNxqYn3U9IM9B98Du9pJw,513
 kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
 kodit/cli.py,sha256=qEQy_Sd64cEV5KzYsKlGLyMxFQ4fFi-as4QO8CRrKYo,8978
 kodit/config.py,sha256=hQshTMW_8jpk94zP-1JaxowgmW_LrT534ipHFaRUGMw,3006
@@ -25,7 +25,7 @@ kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQ
 kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
 kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
 kodit/retreival/__init__.py,sha256=33PhJU-3gtsqYq6A1UkaLNKbev_Zee9Lq6dYC59-CsA,69
-kodit/retreival/repository.py,sha256=ZXHUYJrsmHCII9PUgYzLfN0EhiyWw7eJ3_rKCvMrSpY,5465
+kodit/retreival/repository.py,sha256=XHkkeUsnXSrrcthJOL9FXgivn5kkaPnC9Qci6ebwjZc,7294
 kodit/retreival/service.py,sha256=gGp74jnqhyCDF5vKOrN2dJKDnhlfR4HZaxADSrjTb4s,3778
 kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
 kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
@@ -37,8 +37,8 @@ kodit/sources/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
 kodit/sources/models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
 kodit/sources/repository.py,sha256=mGJrHWH6Uo8YABdoojHFbzaf_jW-2ywJpAHIa1gnc3U,3401
 kodit/sources/service.py,sha256=aV_qiqkU2kMBNPvye5_v4NnZiK-lJ64rQdmFtBtsQaY,9243
-kodit-0.1.10.dist-info/METADATA,sha256=wi-_Yl0ZPw898Mc1QjtvNQRl5-4xkdfBUlf6isC7Wr0,2288
-kodit-0.1.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kodit-0.1.10.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
-kodit-0.1.10.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-kodit-0.1.10.dist-info/RECORD,,
+kodit-0.1.11.dist-info/METADATA,sha256=yUO645VYUiVrJMRtwNB71O-6qvC94nS7_ILQ8eQEvoY,2288
+kodit-0.1.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kodit-0.1.11.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
+kodit-0.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+kodit-0.1.11.dist-info/RECORD,,
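
For context on the RECORD rows above: each line is path,sha256=<digest>,<size>, where the digest is an unpadded URL-safe base64 encoding of the file's SHA-256 hash, per the wheel spec. A minimal sketch of how such a row can be recomputed (the file path is illustrative):

    import base64
    import hashlib
    from pathlib import Path

    def record_row(path: str) -> str:
        """Build a wheel RECORD-style row: path,sha256=<digest>,<size>."""
        data = Path(path).read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path},sha256={digest.decode()},{len(data)}"

    # Example against an unpacked wheel (hypothetical local file):
    # print(record_row("kodit/_version.py"))

This also makes the scope of the release verifiable: only _version.py, repository.py, METADATA, and RECORD itself differ between the two wheels.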