kodit 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/bm25/keyword_search_factory.py +17 -0
- kodit/bm25/keyword_search_service.py +34 -0
- kodit/bm25/{bm25.py → local_bm25.py} +40 -14
- kodit/bm25/vectorchord_bm25.py +193 -0
- kodit/cli.py +114 -25
- kodit/config.py +9 -2
- kodit/database.py +4 -2
- kodit/embedding/embedding_factory.py +44 -0
- kodit/embedding/embedding_provider/__init__.py +1 -0
- kodit/embedding/embedding_provider/embedding_provider.py +60 -0
- kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
- kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
- kodit/embedding/embedding_provider/openai_embedding_provider.py +75 -0
- kodit/{search/search_repository.py → embedding/embedding_repository.py} +61 -33
- kodit/embedding/local_vector_search_service.py +50 -0
- kodit/embedding/vector_search_service.py +38 -0
- kodit/embedding/vectorchord_vector_search_service.py +154 -0
- kodit/enrichment/__init__.py +1 -0
- kodit/enrichment/enrichment_factory.py +23 -0
- kodit/enrichment/enrichment_provider/__init__.py +1 -0
- kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
- kodit/enrichment/enrichment_service.py +33 -0
- kodit/indexing/fusion.py +67 -0
- kodit/indexing/indexing_repository.py +44 -4
- kodit/indexing/indexing_service.py +142 -31
- kodit/mcp.py +31 -18
- kodit/snippets/languages/go.scm +26 -0
- kodit/source/source_service.py +9 -3
- kodit/util/__init__.py +1 -0
- kodit/util/spinner.py +59 -0
- {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/METADATA +4 -1
- kodit-0.1.16.dist-info/RECORD +64 -0
- kodit/embedding/embedding.py +0 -203
- kodit/search/__init__.py +0 -1
- kodit/search/search_service.py +0 -147
- kodit-0.1.14.dist-info/RECORD +0 -44
- {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/WHEEL +0 -0
- {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/entry_points.txt +0 -0
- {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/licenses/LICENSE +0 -0
kodit/source/source_service.py
CHANGED
|
@@ -109,6 +109,8 @@ class SourceService:
|
|
|
109
109
|
uri_or_path_like = uri_or_path_like + ".git"
|
|
110
110
|
try:
|
|
111
111
|
return await self._create_git_source(uri_or_path_like)
|
|
112
|
+
except git.GitCommandError:
|
|
113
|
+
raise
|
|
112
114
|
except ValueError:
|
|
113
115
|
pass
|
|
114
116
|
|
|
@@ -197,11 +199,14 @@ class SourceService:
|
|
|
197
199
|
clone_path.mkdir(parents=True, exist_ok=True)
|
|
198
200
|
|
|
199
201
|
try:
|
|
200
|
-
|
|
202
|
+
self.log.info("Cloning repository", uri=uri, clone_path=str(clone_path))
|
|
201
203
|
git.Repo.clone_from(uri, clone_path)
|
|
202
204
|
except git.GitCommandError as e:
|
|
203
|
-
|
|
204
|
-
|
|
205
|
+
if "already exists and is not an empty directory" in str(e):
|
|
206
|
+
self.log.info("Repository already exists, reusing...", uri=uri)
|
|
207
|
+
else:
|
|
208
|
+
msg = f"Failed to clone repository: {e}"
|
|
209
|
+
raise ValueError(msg) from e
|
|
205
210
|
|
|
206
211
|
source = await self.repository.create_source(
|
|
207
212
|
Source(uri=uri, cloned_path=str(clone_path)),
|
|
@@ -212,6 +217,7 @@ class SourceService:
|
|
|
212
217
|
file_count = sum(1 for _ in clone_path.rglob("*") if _.is_file())
|
|
213
218
|
|
|
214
219
|
# Process each file in the source directory
|
|
220
|
+
self.log.info("Inspecting files", source_id=source.id)
|
|
215
221
|
for path in tqdm(clone_path.rglob("*"), total=file_count, leave=False):
|
|
216
222
|
await self._process_file(source.id, path.absolute())
|
|
217
223
|
|
kodit/util/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Utility functions and classes."""
|
kodit/util/spinner.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Spinner for long-running tasks."""
|
|
2
|
+
|
|
3
|
+
import itertools
|
|
4
|
+
import sys
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Spinner:
    """Console spinner shown while a long-running task executes.

    Intended to be used as a context manager. When ``sys.stdout`` is a TTY, a
    background thread redraws one of ``- / | \\`` every ``delay`` seconds until
    the ``with`` block exits. When stdout is not a TTY no thread is started and
    only a carriage return is written on exit.
    """

    def __init__(self, delay: float = 0.1) -> None:
        """Initialize the spinner.

        Args:
            delay: Seconds to wait between two spinner frames.

        """
        self.spinner = itertools.cycle(["-", "/", "|", "\\"])
        self.delay = delay
        self.busy = False
        self.spinner_visible = False
        # Created eagerly so write_next()/remove_spinner() never hit a missing
        # attribute when the context manager was not entered on a TTY.
        self._screen_lock = threading.Lock()
        self.thread: threading.Thread | None = None

    def write_next(self) -> None:
        """Write the next character of the spinner if none is currently shown."""
        with self._screen_lock:
            if not self.spinner_visible:
                sys.stdout.write(next(self.spinner))
                self.spinner_visible = True
                sys.stdout.flush()

    def remove_spinner(self, cleanup: bool = False) -> None:  # noqa: FBT001, FBT002
        """Remove the spinner character if one is currently shown.

        Args:
            cleanup: When true, additionally blank out the character and return
                the cursor to the start of the line.

        """
        with self._screen_lock:
            if self.spinner_visible:
                sys.stdout.write("\b")  # step back onto the spinner character
                self.spinner_visible = False
                if cleanup:
                    sys.stdout.write(" ")  # overwrite spinner with blank
                    sys.stdout.write("\r")  # return cursor to start of line
            sys.stdout.flush()

    def spinner_task(self) -> None:
        """Redraw the spinner until ``busy`` becomes false.

        The previous frame is erased at the start of each iteration rather than
        at the end, so the last frame is still marked visible when the loop
        stops and ``__exit__`` can blank it out deterministically.
        """
        while self.busy:
            self.remove_spinner()
            self.write_next()
            time.sleep(self.delay)

    def __enter__(self) -> "Spinner":
        """Enter the context manager, starting the spinner thread on a TTY."""
        if sys.stdout.isatty():
            self.busy = True
            # Daemon thread: a crash in the main thread must not hang on exit.
            self.thread = threading.Thread(target=self.spinner_task, daemon=True)
            self.thread.start()
        return self

    def __exit__(self, exception: object, value: object, tb: object) -> None:
        """Exit the context manager, stopping the thread and clearing the spinner."""
        if self.thread is not None:
            self.busy = False
            # Wait for the worker so it cannot draw another frame after cleanup.
            self.thread.join()
            self.thread = None
            self.remove_spinner(cleanup=True)
        else:
            sys.stdout.write("\r")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kodit
|
|
3
|
-
Version: 0.1.14
|
|
3
|
+
Version: 0.1.16
|
|
4
4
|
Summary: Code indexing for better AI code generation
|
|
5
5
|
Project-URL: Homepage, https://docs.helixml.tech/kodit/
|
|
6
6
|
Project-URL: Documentation, https://docs.helixml.tech/kodit/
|
|
@@ -15,12 +15,14 @@ Keywords: ai,indexing,mcp,rag
|
|
|
15
15
|
Classifier: Development Status :: 2 - Pre-Alpha
|
|
16
16
|
Classifier: Intended Audience :: Developers
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
19
|
Classifier: Topic :: Software Development :: Code Generators
|
|
19
20
|
Requires-Python: >=3.12
|
|
20
21
|
Requires-Dist: aiofiles>=24.1.0
|
|
21
22
|
Requires-Dist: aiosqlite>=0.20.0
|
|
22
23
|
Requires-Dist: alembic>=1.15.2
|
|
23
24
|
Requires-Dist: asgi-correlation-id>=4.3.4
|
|
25
|
+
Requires-Dist: asyncpg>=0.30.0
|
|
24
26
|
Requires-Dist: better-exceptions>=0.3.3
|
|
25
27
|
Requires-Dist: bm25s[core]>=0.2.12
|
|
26
28
|
Requires-Dist: click>=8.1.8
|
|
@@ -41,6 +43,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.40
|
|
|
41
43
|
Requires-Dist: structlog>=25.3.0
|
|
42
44
|
Requires-Dist: tdqm>=0.0.1
|
|
43
45
|
Requires-Dist: tiktoken>=0.9.0
|
|
46
|
+
Requires-Dist: transformers>=4.51.3
|
|
44
47
|
Requires-Dist: tree-sitter-language-pack>=0.7.3
|
|
45
48
|
Requires-Dist: tree-sitter>=0.24.0
|
|
46
49
|
Requires-Dist: uritools>=5.0.0
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
|
|
2
|
+
kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
|
|
3
|
+
kodit/_version.py,sha256=VYJNWHISWEW-KD_clKUYcTY_Z30r993Sjws4URJIL0g,513
|
|
4
|
+
kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
|
|
5
|
+
kodit/cli.py,sha256=i7eEt0FdIQGEfXKFte-8fBcZZGE8BPXBp40aGwJDQGI,11323
|
|
6
|
+
kodit/config.py,sha256=2W2u5J8j-Mbt-C4xzOuK-PeuDCx0S_rnCXPhBwvfLT4,4353
|
|
7
|
+
kodit/database.py,sha256=WB1KpVxUYPgiJGU0gJa2hqytYB8wJEJ5z3WayhWzNMU,2403
|
|
8
|
+
kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
|
|
9
|
+
kodit/mcp.py,sha256=QruyPskWB0_x59pkfj5BBeXuR13GMny5TAZEa2j4U9s,5752
|
|
10
|
+
kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
|
|
11
|
+
kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
|
|
12
|
+
kodit/bm25/keyword_search_factory.py,sha256=rp-wx3DJsc2KlELK1V337EyeYvmwnMQwUqOo1WVPSmg,631
|
|
13
|
+
kodit/bm25/keyword_search_service.py,sha256=aBbWQKgQmi2re3EIHdXFS00n7Wj3b2D0pZsLZ4qmHfE,754
|
|
14
|
+
kodit/bm25/local_bm25.py,sha256=AAbFhbQDqyL3d7jsPL7W4HsLxdoYctaDsREUXOLy6jM,3260
|
|
15
|
+
kodit/bm25/vectorchord_bm25.py,sha256=_nGrkUReYLLV-L8RIuIVLwjuhSYZl9T532n5OVf0kWs,6393
|
|
16
|
+
kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
|
|
17
|
+
kodit/embedding/embedding_factory.py,sha256=UGnFRyyQXazSUOwyW4Hg7Vq2-kfAoDj9lD4CTLu8x04,1630
|
|
18
|
+
kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
|
|
19
|
+
kodit/embedding/embedding_repository.py,sha256=-ux3scpBzel8c0pMH9fNOEsSXFIzl-IfgaWrkTb1szo,6907
|
|
20
|
+
kodit/embedding/local_vector_search_service.py,sha256=hkF0qlfzjyGt400qIX9Mr6B7b7i8WvYIYWN2Z2C_pcs,1907
|
|
21
|
+
kodit/embedding/vector_search_service.py,sha256=pQJ129QjGrAWOXzqkywmgtDRpy8_gtzYgkivyqF9Vrs,1009
|
|
22
|
+
kodit/embedding/vectorchord_vector_search_service.py,sha256=KSs0IMFHHIllwq2d3A0LGqGGZDqO1Ht6K-BCfBBWW0Y,5051
|
|
23
|
+
kodit/embedding/embedding_provider/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
|
|
24
|
+
kodit/embedding/embedding_provider/embedding_provider.py,sha256=Tf3bwUsUMzAgoyLFM5qBtOLqPp1qr03TzrwGczkDvy0,1835
|
|
25
|
+
kodit/embedding/embedding_provider/hash_embedding_provider.py,sha256=nAhlhh8j8PqqCCbhVl26Y8ntFBm2vJBCtB4X04g5Wwg,2638
|
|
26
|
+
kodit/embedding/embedding_provider/local_embedding_provider.py,sha256=4ER-UPq506Y0TWU6qcs0nUqw6bSKQkSrdog-DhNQWM8,1906
|
|
27
|
+
kodit/embedding/embedding_provider/openai_embedding_provider.py,sha256=V_jdUXiaGdslplwxMlfgFc4_hAVS2eaJXMTs2C7RiLI,2666
|
|
28
|
+
kodit/enrichment/__init__.py,sha256=vBEolHpKaHUhfINX0dSGyAPlvgpLNAer9YzFtdvCB24,18
|
|
29
|
+
kodit/enrichment/enrichment_factory.py,sha256=vKjkUTdhj74IW2S4GENDWdWMJx6BwUSZjJGDC0i7DSk,787
|
|
30
|
+
kodit/enrichment/enrichment_service.py,sha256=87Sd3gGbEMJYb_wVrHG8L1yGIZmQNR7foUS4_y94azI,977
|
|
31
|
+
kodit/enrichment/enrichment_provider/__init__.py,sha256=klf8iuLVWX4iRz-DZQauFFNAoJC5CByczh48TBZPW-o,27
|
|
32
|
+
kodit/enrichment/enrichment_provider/enrichment_provider.py,sha256=E0H5rq3OENM0yYbA8K_3nSnj5lUHCpoIOqpWLo-2MVU,413
|
|
33
|
+
kodit/enrichment/enrichment_provider/local_enrichment_provider.py,sha256=bR6HR1gH7wtZdMLOwaKdASjvllRo1FlNW9GyZC11zAM,2164
|
|
34
|
+
kodit/enrichment/enrichment_provider/openai_enrichment_provider.py,sha256=gYuFTAeIVdQNlCUvNSPgRoiRwCvRD0C8419h8ubyABA,2725
|
|
35
|
+
kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
|
|
36
|
+
kodit/indexing/fusion.py,sha256=TZb4fPAedXdEUXzwzOofW98QIOymdbclBOP1KOijuEk,1674
|
|
37
|
+
kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
|
|
38
|
+
kodit/indexing/indexing_repository.py,sha256=GYHoACUWYKQdVTwP7tfik_TMUD1WUK76nywH88eCSwg,7006
|
|
39
|
+
kodit/indexing/indexing_service.py,sha256=tKcZpi0pzsmF6OpqnqF0Q5HfSXxi5iLTysrVSou4JiQ,10579
|
|
40
|
+
kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
|
|
41
|
+
kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
|
|
42
|
+
kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
|
|
43
|
+
kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
|
|
44
|
+
kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
|
|
45
|
+
kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
|
|
46
|
+
kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
|
|
47
|
+
kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
|
|
48
|
+
kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
|
|
49
|
+
kodit/snippets/snippets.py,sha256=mwN0bM1Msu8ZeEsUHyQ7tx3Hj3vZsm8G7Wu4eWSkLY8,1539
|
|
50
|
+
kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
|
|
51
|
+
kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
|
|
52
|
+
kodit/snippets/languages/go.scm,sha256=SEX9mTOrhP2KiQW7oflDKkd21u5dK56QbJ4LvTDxY8A,533
|
|
53
|
+
kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
|
|
54
|
+
kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
|
|
55
|
+
kodit/source/source_models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
|
|
56
|
+
kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
|
|
57
|
+
kodit/source/source_service.py,sha256=u_GaH07ewakThQJRfT8O_yZ54A52qLtJuM1bF3xUT2A,9633
|
|
58
|
+
kodit/util/__init__.py,sha256=bPu6CtqDWCRGU7VgW2_aiQrCBi8G89FS6k1PjvDajJ0,37
|
|
59
|
+
kodit/util/spinner.py,sha256=R9bzrHtBiIH6IfLbmsIVHL53s8vg-tqW4lwGGALu4dw,1932
|
|
60
|
+
kodit-0.1.16.dist-info/METADATA,sha256=1lR4ZSTiRBzUv9Gj8FPspv4GU2vWGQU6HSiffWgU2Do,2467
|
|
61
|
+
kodit-0.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
62
|
+
kodit-0.1.16.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
|
|
63
|
+
kodit-0.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
64
|
+
kodit-0.1.16.dist-info/RECORD,,
|
kodit/embedding/embedding.py
DELETED
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
"""Embedding service."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import os
|
|
5
|
-
from abc import ABC, abstractmethod
|
|
6
|
-
from collections.abc import AsyncGenerator
|
|
7
|
-
from typing import NamedTuple
|
|
8
|
-
|
|
9
|
-
import structlog
|
|
10
|
-
import tiktoken
|
|
11
|
-
from openai import AsyncOpenAI
|
|
12
|
-
from sentence_transformers import SentenceTransformer
|
|
13
|
-
|
|
14
|
-
# Short aliases that can be given in place of a full model name.
TINY = "tiny"
CODE = "code"
TEST = "test"

# Maps each alias to the model name it stands for; names that are not listed
# here are used verbatim by the local embedder.
COMMON_EMBEDDING_MODELS: dict[str, str] = {
    TINY: "ibm-granite/granite-embedding-30m-english",
    CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
    TEST: "minishlab/potion-base-4M",
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class EmbeddingInput(NamedTuple):
    """A piece of text to embed, tagged with the id used to match its output."""

    # Caller-chosen identifier, echoed back on the matching EmbeddingOutput.
    id: int
    # Raw text handed to the embedding model.
    text: str
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class EmbeddingOutput(NamedTuple):
    """An embedding vector paired with the id of the input it was computed from."""

    # Identifier copied from the originating EmbeddingInput.
    id: int
    # The embedding vector as plain Python floats.
    embedding: list[float]
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class Embedder(ABC):
    """Abstract interface that every embedding backend implements."""

    @abstractmethod
    def embed(
        self, data: list[EmbeddingInput]
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed a (possibly very large) list of id/text pairs.

        Callers pass the complete list in one go. Implementations decide how to
        batch and parallelize the work to suit their embedding service.

        Because of that parallelism, outputs may arrive in a different order
        than the inputs; use the ``id`` on each output to match it to its input.
        """

    @abstractmethod
    def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed query strings, yielding each resulting vector as a list of floats."""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def embedding_factory(openai_client: AsyncOpenAI | None = None) -> Embedder:
    """Build the embedder to use.

    Returns an ``OpenAIEmbedder`` wrapping ``openai_client`` when a client is
    supplied, otherwise a ``LocalEmbedder`` configured with the ``TINY`` alias.
    """
    if openai_client is None:
        return LocalEmbedder(model_name=TINY)
    return OpenAIEmbedder(openai_client)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class LocalEmbedder(Embedder):
    """Embedder backed by a ``SentenceTransformer`` model running on the CPU."""

    def __init__(self, model_name: str) -> None:
        """Record the model to use; the model itself is loaded on first use.

        Args:
            model_name: Either a key of ``COMMON_EMBEDDING_MODELS`` or a model
                name that is passed through unchanged.

        """
        self.log = structlog.get_logger(__name__)
        self.log.info("Creating local embedder", model_name=model_name)
        self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
        self.embedding_model = None
        # Token counts for batching use this tokenizer, not the local model's own.
        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")

    def _model(self) -> SentenceTransformer:
        """Return the cached model, constructing it on the first call."""
        if self.embedding_model is not None:
            return self.embedding_model

        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
        self.embedding_model = SentenceTransformer(
            self.model_name,
            trust_remote_code=True,
            device="cpu",  # Force CPU so we don't have to install accelerate, etc.
        )
        return self.embedding_model

    async def embed(
        self, data: list[EmbeddingInput]
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed the documents batch by batch, yielding one output per input."""
        model = self._model()

        for batch in _split_sub_batches(self.encoding, data):
            texts = [item.text for item in batch]
            vectors = model.encode(texts, show_progress_bar=False, batch_size=4)
            for item, vector in zip(batch, vectors, strict=False):
                yield EmbeddingOutput(item.id, [float(value) for value in vector])

    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Encode the query strings and yield each vector as a list of floats."""
        model = self._model()
        vectors = model.encode(data, show_progress_bar=False, batch_size=4)
        for vector in vectors:
            yield [float(value) for value in vector]
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
# Upper bound on the token count of a single item and of a whole sub-batch.
OPENAI_MAX_EMBEDDING_SIZE = 8192
# Maximum number of embedding requests allowed in flight at once.
OPENAI_NUM_PARALLEL_TASKS = 10


def _split_sub_batches(
    encoding: tiktoken.Encoding, data: list[EmbeddingInput]
) -> list[list[EmbeddingInput]]:
    """Split embedding inputs into sub-batches that respect the token limit.

    Items are visited once, in order. Items whose text is empty or
    whitespace-only are dropped. Items that on their own exceed
    ``OPENAI_MAX_EMBEDDING_SIZE`` tokens are skipped with a warning. The
    remaining items are packed greedily, starting a new sub-batch whenever
    adding the next item would push the running total past the limit.

    Args:
        encoding: Tokenizer whose ``encode`` result length is the token count.
        data: Items to split; the list itself is not modified.

    Returns:
        Non-empty sub-batches, preserving the input order of the kept items.

    """
    log = structlog.get_logger(__name__)
    result: list[list[EmbeddingInput]] = []
    next_batch: list[EmbeddingInput] = []
    current_tokens = 0

    for item in data:
        if not item.text.strip():  # Filter out empty strings
            continue

        # Tokenize each item exactly once, even when it opens a new batch.
        item_tokens = len(encoding.encode(item.text))

        if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
            log.warning("Skipping too long snippet", snippet=item)
            continue

        if current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
            # The running batch is necessarily non-empty here, because a lone
            # item that fits the limit can never overflow an empty batch.
            result.append(next_batch)
            next_batch = []
            current_tokens = 0

        next_batch.append(item)
        current_tokens += item_tokens

    if next_batch:
        result.append(next_batch)

    return result
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
class OpenAIEmbedder(Embedder):
    """Embedder that calls the embeddings endpoint of an ``AsyncOpenAI`` client."""

    def __init__(
        self, openai_client: AsyncOpenAI, model_name: str = "text-embedding-3-small"
    ) -> None:
        """Initialize the OpenAI embedder.

        Args:
            openai_client: Client used to issue the embedding requests.
            model_name: Model used both for token counting and for the
                embedding requests themselves.

        """
        self.log = structlog.get_logger(__name__)
        self.log.info("Creating OpenAI embedder", model_name=model_name)
        self.openai_client = openai_client
        # Kept so requests use the configured model, not a hard-coded one.
        self.model_name = model_name
        self.encoding = tiktoken.encoding_for_model(model_name)

    async def embed(
        self,
        data: list[EmbeddingInput],
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed a list of documents.

        The inputs are split into token-bounded sub-batches which are sent
        concurrently, at most ``OPENAI_NUM_PARALLEL_TASKS`` at a time. Outputs
        are yielded as batches complete, so they may be out of input order. A
        batch whose request fails is logged and contributes no outputs.
        """
        # First split the list into a list of list where each sublist has fewer than
        # max tokens.
        batched_data = _split_sub_batches(self.encoding, data)

        # Process batches in parallel with a semaphore to limit concurrent requests
        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

        async def process_batch(batch: list[EmbeddingInput]) -> list[EmbeddingOutput]:
            async with sem:
                try:
                    response = await self.openai_client.embeddings.create(
                        model=self.model_name,
                        input=[i.text for i in batch],
                    )
                    return [
                        EmbeddingOutput(i.id, x.embedding)
                        for i, x in zip(batch, response.data, strict=False)
                    ]
                except Exception as e:
                    # Deliberate best effort: one failed batch must not abort
                    # the batches that succeed.
                    self.log.exception("Error embedding batch", error=str(e))
                    return []

        # Create tasks for all batches
        tasks = [process_batch(batch) for batch in batched_data]

        # Process all batches and yield results as they complete
        for task in asyncio.as_completed(tasks):
            embeddings = await task
            for e in embeddings:
                yield e

    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Query the embedding model.

        Delegates to ``embed`` using each string's list index as its id, and
        yields only the vectors, in the order ``embed`` produces them.
        """
        async for e in self.embed(
            [EmbeddingInput(i, text) for i, text in enumerate(data)]
        ):
            yield e.embedding
|
kodit/search/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Search for relevant snippets."""
|
kodit/search/search_service.py
DELETED
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
"""Search service."""
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pydantic
|
|
6
|
-
import structlog
|
|
7
|
-
|
|
8
|
-
from kodit.bm25.bm25 import BM25Service
|
|
9
|
-
from kodit.embedding.embedding import Embedder
|
|
10
|
-
from kodit.embedding.embedding_models import EmbeddingType
|
|
11
|
-
from kodit.search.search_repository import SearchRepository
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class SearchRequest(pydantic.BaseModel):
    """Parameters of a single search call."""

    # Text embedded for the semantic part of the search; skipped when unset.
    code_query: str | None = None
    # Keywords looked up one by one with BM25; skipped when unset or empty.
    keywords: list[str] | None = None
    # Maximum number of results requested from each ranker and returned overall.
    top_k: int = 10
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class SearchResult(pydantic.BaseModel):
    """Data transfer object describing one search hit.

    Each instance carries the id and content of a matching snippet together
    with the URI of the file it belongs to.
    """

    # Id of the matching snippet.
    id: int
    # URI of the file the snippet was found in.
    uri: str
    # Text content of the snippet.
    content: str
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
class Snippet(pydantic.BaseModel):
    """A snippet's content together with the path of its file."""

    content: str
    file_path: str
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
class SearchService:
    """Hybrid snippet search combining BM25 keyword hits with embedding hits."""

    def __init__(
        self,
        repository: SearchRepository,
        data_dir: Path,
        embedding_service: Embedder,
    ) -> None:
        """Wire up the repository, the BM25 index and the code embedder.

        Args:
            repository: Data access used to list and load snippets.
            data_dir: Directory handed to ``BM25Service``.
            embedding_service: Embedder used for the semantic code query.

        """
        self.repository = repository
        self.log = structlog.get_logger(__name__)
        self.bm25 = BM25Service(data_dir)
        self.code_embedding_service = embedding_service

    async def _keyword_ranking(self, keywords: list[str], top_k: int) -> list[int]:
        """Return snippet ids ranked by BM25 score across all keywords."""
        snippet_ids = await self.repository.list_snippet_ids()

        # Collect the hits of every keyword into one scored list.
        scored_hits: list[tuple[int, float]] = []
        for keyword in keywords:
            scored_hits.extend(self.bm25.retrieve(snippet_ids, keyword, top_k))

        # Best score first.
        scored_hits.sort(key=lambda hit: hit[1], reverse=True)
        self.log.debug("Search results (BM25)", results=scored_hits)
        return [hit[0] for hit in scored_hits]

    async def _semantic_ranking(self, code_query: str, top_k: int) -> list[int]:
        """Return snippet ids ranked by the semantic results for ``code_query``."""
        query_embedding = await anext(self.code_embedding_service.query([code_query]))
        scored_hits = await self.repository.list_semantic_results(
            EmbeddingType.CODE, query_embedding, top_k=top_k
        )

        # Best score first, then keep only the snippet ids.
        scored_hits.sort(key=lambda hit: hit[1], reverse=True)
        return [hit[0] for hit in scored_hits]

    async def search(self, request: SearchRequest) -> list[SearchResult]:
        """Run the requested rankers, fuse their rankings and load the snippets.

        Returns an empty list when the request has neither keywords nor a code
        query. Otherwise the rankings are merged with reciprocal rank fusion
        and at most ``request.top_k`` fused ids are loaded from the repository.
        """
        rankings: list[list[int]] = []

        if request.keywords:
            rankings.append(
                await self._keyword_ranking(request.keywords, request.top_k)
            )

        if request.code_query:
            rankings.append(
                await self._semantic_ranking(request.code_query, request.top_k)
            )

        if not rankings:
            return []

        # Combine all results together with RRF and keep just the ids.
        fused = reciprocal_rank_fusion(rankings, k=60)
        fused_ids = [entry[0] for entry in fused]

        # Get snippets from database (up to top_k)
        rows = await self.repository.list_snippets_by_ids(fused_ids[: request.top_k])

        hits = []
        for file, snippet in rows:
            hits.append(
                SearchResult(id=snippet.id, uri=file.uri, content=snippet.content)
            )
        return hits
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def reciprocal_rank_fusion(
    rankings: list[list[int]], k: float = 60
) -> list[tuple[int, float]]:
    """Fuse several rankings into one using reciprocal rank fusion (RRF).

    Every appearance of an id contributes ``1 / (k + position)`` to its score,
    where ``position`` is the id's zero-based index in that ranking. Ids that
    rank highly in several rankings therefore end up on top.

    Args:
        rankings: List of rankers, each containing a list of document ids. Top of the
            list is considered to be the best result.
        k: Parameter for RRF.

    Returns:
        List of ``(id, score)`` tuples sorted by descending score. Ids with
        equal scores keep the order in which they were first seen.

    """
    scores: dict[int, float] = {}
    for ranker in rankings:
        for position, doc_id in enumerate(ranker):
            # dict.get supplies the zero start, so no separate init pass is needed.
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + position)

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
|
kodit-0.1.14.dist-info/RECORD
DELETED
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
|
|
2
|
-
kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
|
|
3
|
-
kodit/_version.py,sha256=O_r2EWoixTKREu-RyeL8e93UHfqprj1LCIlwiWXfHcg,513
|
|
4
|
-
kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
|
|
5
|
-
kodit/cli.py,sha256=VLoXFS1xJnQ0TLy3_cO8-B9tCb4NJHiYPfzZtHxpgRY,7784
|
|
6
|
-
kodit/config.py,sha256=TDcLt6fiJn9cI1PoO5AqBqsL_Bxmm9JV5GqRxhj1tLw,4202
|
|
7
|
-
kodit/database.py,sha256=kekSdyEATdb47jxzQemkSOXMNOwnUwmVVTpn9hYaDK8,2356
|
|
8
|
-
kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
|
|
9
|
-
kodit/mcp.py,sha256=ot5CIH240mSXK3sJcxTf4lBfthq0tcMS8XBGTaHY-n8,5088
|
|
10
|
-
kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
|
|
11
|
-
kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
|
|
12
|
-
kodit/bm25/bm25.py,sha256=JtgJfsHz-2SHx96zxWjkPFSH7fXkahFMp01cDwl4YBg,2298
|
|
13
|
-
kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
|
|
14
|
-
kodit/embedding/embedding.py,sha256=EMJpHK8ICZk_FjiO9Aqr2IO20qkGOmj_PfA1hyfI7Vk,6745
|
|
15
|
-
kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
|
|
16
|
-
kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
|
|
17
|
-
kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
|
|
18
|
-
kodit/indexing/indexing_repository.py,sha256=7bkAiBwtr3qlkdhNIalwMwbxezVz_RQGOhLVWPKHwNk,5506
|
|
19
|
-
kodit/indexing/indexing_service.py,sha256=VGfKgbkYEAYP_gIubvhMxo3yThT20ndS5xdg2LxwRgA,6685
|
|
20
|
-
kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
|
|
21
|
-
kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
|
|
22
|
-
kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
|
|
23
|
-
kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
|
|
24
|
-
kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
|
|
25
|
-
kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
|
|
26
|
-
kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
|
|
27
|
-
kodit/search/__init__.py,sha256=4QbdjbrlhNKMovmuKHxJnUeZT7KNjTTFU0GdnuwUHdQ,36
|
|
28
|
-
kodit/search/search_repository.py,sha256=r1fkV6-cy9BKsy5J4WTHaY_FcjMaT1PV5qqqq0gvjZw,5833
|
|
29
|
-
kodit/search/search_service.py,sha256=KePkqCAc3CUcrpNsbDc5DqbF6W2m0TG6TDa9-VSJZS0,4227
|
|
30
|
-
kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
|
|
31
|
-
kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
|
|
32
|
-
kodit/snippets/snippets.py,sha256=mwN0bM1Msu8ZeEsUHyQ7tx3Hj3vZsm8G7Wu4eWSkLY8,1539
|
|
33
|
-
kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
|
|
34
|
-
kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
|
|
35
|
-
kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
|
|
36
|
-
kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
|
|
37
|
-
kodit/source/source_models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
|
|
38
|
-
kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
|
|
39
|
-
kodit/source/source_service.py,sha256=qBV9FCFQbJppeFrVo4uMgvC_mzWRIKldymp5yqLx9pw,9255
|
|
40
|
-
kodit-0.1.14.dist-info/METADATA,sha256=acFpcf0ODyUSnA1hg4BPlLexpOEh-0yuaqsaWUNopOs,2349
|
|
41
|
-
kodit-0.1.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
42
|
-
kodit-0.1.14.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
|
|
43
|
-
kodit-0.1.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
44
|
-
kodit-0.1.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|