kodit 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/bm25/keyword_search_factory.py +17 -0
- kodit/bm25/keyword_search_service.py +34 -0
- kodit/bm25/{bm25.py → local_bm25.py} +40 -14
- kodit/bm25/vectorchord_bm25.py +193 -0
- kodit/cli.py +14 -11
- kodit/config.py +9 -2
- kodit/database.py +4 -2
- kodit/embedding/embedding_factory.py +44 -0
- kodit/embedding/embedding_provider/__init__.py +1 -0
- kodit/embedding/embedding_provider/embedding_provider.py +53 -0
- kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
- kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
- kodit/embedding/embedding_provider/openai_embedding_provider.py +63 -0
- kodit/embedding/embedding_repository.py +206 -0
- kodit/embedding/local_vector_search_service.py +50 -0
- kodit/embedding/vector_search_service.py +38 -0
- kodit/embedding/vectorchord_vector_search_service.py +145 -0
- kodit/indexing/indexing_repository.py +24 -4
- kodit/indexing/indexing_service.py +25 -30
- kodit/mcp.py +28 -7
- kodit/search/search_repository.py +0 -121
- kodit/search/search_service.py +12 -24
- kodit/source/source_service.py +9 -3
- kodit/util/__init__.py +1 -0
- kodit/util/spinner.py +59 -0
- {kodit-0.1.13.dist-info → kodit-0.1.15.dist-info}/METADATA +2 -1
- kodit-0.1.15.dist-info/RECORD +58 -0
- kodit/embedding/embedding.py +0 -203
- kodit-0.1.13.dist-info/RECORD +0 -44
- {kodit-0.1.13.dist-info → kodit-0.1.15.dist-info}/WHEEL +0 -0
- {kodit-0.1.13.dist-info → kodit-0.1.15.dist-info}/entry_points.txt +0 -0
- {kodit-0.1.13.dist-info → kodit-0.1.15.dist-info}/licenses/LICENSE +0 -0
kodit/embedding/embedding.py
DELETED
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
"""Embedding service."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import os
|
|
5
|
-
from abc import ABC, abstractmethod
|
|
6
|
-
from collections.abc import AsyncGenerator
|
|
7
|
-
from typing import NamedTuple
|
|
8
|
-
|
|
9
|
-
import structlog
|
|
10
|
-
import tiktoken
|
|
11
|
-
from openai import AsyncOpenAI
|
|
12
|
-
from sentence_transformers import SentenceTransformer
|
|
13
|
-
|
|
14
|
-
# Short aliases that callers may pass instead of a full model identifier.
TINY = "tiny"
CODE = "code"
TEST = "test"

# Alias -> model identifier. Names that are not listed here are used verbatim
# by LocalEmbedder (it looks the name up with ``.get(name, name)``).
COMMON_EMBEDDING_MODELS = {
    TINY: "ibm-granite/granite-embedding-30m-english",
    CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
    TEST: "minishlab/potion-base-4M",
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class EmbeddingInput(NamedTuple):
    """A single piece of text to embed, tagged with a caller-chosen id."""

    # Identifier echoed back on the matching EmbeddingOutput.
    id: int
    # The text to embed.
    text: str
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class EmbeddingOutput(NamedTuple):
    """An embedding vector paired with the id of the input it was built from."""

    # Identifier copied from the originating EmbeddingInput.
    id: int
    # The embedding vector as plain Python floats.
    embedding: list[float]
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class Embedder(ABC):
    """Abstract interface that every embedding backend implements."""

    @abstractmethod
    def embed(
        self, data: list[EmbeddingInput]
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed many (id, text) pairs.

        Callers hand over the full list at once; each implementation decides
        how to batch and parallelise the work for its backend.

        Results may arrive in a different order than the inputs, which is why
        every output carries the id of the input it belongs to.
        """

    @abstractmethod
    def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed plain query strings, yielding one vector per result."""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def embedding_factory(openai_client: AsyncOpenAI | None = None) -> Embedder:
    """Build the embedder to use.

    Returns an OpenAIEmbedder wrapping ``openai_client`` when one is given,
    otherwise a LocalEmbedder using the ``TINY`` model alias.
    """
    if openai_client is None:
        return LocalEmbedder(model_name=TINY)
    return OpenAIEmbedder(openai_client)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class LocalEmbedder(Embedder):
    """Embedder backed by a SentenceTransformer model running in-process."""

    def __init__(self, model_name: str) -> None:
        """Record which model to use; the model is only loaded on first use.

        ``model_name`` is either an alias from COMMON_EMBEDDING_MODELS or a
        full model identifier, which is kept as given.
        """
        self.log = structlog.get_logger(__name__)
        self.log.info("Creating local embedder", model_name=model_name)
        self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
        self.embedding_model = None
        # Tokenizer handed to _split_sub_batches to size the batches.
        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")

    def _model(self) -> SentenceTransformer:
        """Return the SentenceTransformer, constructing it on the first call."""
        if self.embedding_model is not None:
            return self.embedding_model

        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
        self.embedding_model = SentenceTransformer(
            self.model_name,
            trust_remote_code=True,
            device="cpu",  # Force CPU so we don't have to install accelerate, etc.
        )
        return self.embedding_model

    async def embed(
        self, data: list[EmbeddingInput]
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed ``data`` batch by batch, yielding one output per input kept.

        Inputs removed by _split_sub_batches produce no output.
        """
        model = self._model()

        for batch in _split_sub_batches(self.encoding, data):
            texts = [item.text for item in batch]
            vectors = model.encode(texts, show_progress_bar=False, batch_size=4)
            for item, vector in zip(batch, vectors, strict=False):
                yield EmbeddingOutput(item.id, [float(value) for value in vector])

    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed the query strings in one call and yield each vector in turn."""
        vectors = self._model().encode(data, show_progress_bar=False, batch_size=4)
        for vector in vectors:
            yield [float(value) for value in vector]
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
# Token budget used by _split_sub_batches: a single item above this is
# skipped, and a batch is closed before its running total would exceed it.
OPENAI_MAX_EMBEDDING_SIZE = 8192
# Size of the semaphore that caps concurrent embedding requests.
OPENAI_NUM_PARALLEL_TASKS = 10
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def _split_sub_batches(
    encoding: tiktoken.Encoding, data: list[EmbeddingInput]
) -> list[list[EmbeddingInput]]:
    """Group inputs into batches that each fit the token budget.

    Items are taken in order and packed greedily: a batch is closed as soon as
    adding the next item would push its token total above
    OPENAI_MAX_EMBEDDING_SIZE. Items whose text is empty or whitespace-only
    are dropped silently; items that exceed the budget on their own are
    dropped with a warning.

    Args:
        encoding: Tokenizer used to count the tokens of each item's text.
        data: Inputs to batch. The list is not modified.

    Returns:
        The batches, in input order. Empty when nothing survives filtering.

    """
    log = structlog.get_logger(__name__)
    result: list[list[EmbeddingInput]] = []
    batch: list[EmbeddingInput] = []
    batch_tokens = 0

    # Single pass: every item is tokenised exactly once and nothing is popped
    # from the front of a list, so the work is linear in len(data).
    for item in data:
        if not item.text.strip():  # Filter out empty strings
            continue

        item_tokens = len(encoding.encode(item.text))

        if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
            log.warning("Skipping too long snippet", snippet=item)
            continue

        # Close the current batch first if this item would overflow it. The
        # item always fits in a fresh batch because it is within the budget.
        if batch and batch_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
            result.append(batch)
            batch = []
            batch_tokens = 0

        batch.append(item)
        batch_tokens += item_tokens

    if batch:
        result.append(batch)

    return result
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
class OpenAIEmbedder(Embedder):
    """Embedder that sends batches to the OpenAI embeddings endpoint."""

    def __init__(
        self, openai_client: AsyncOpenAI, model_name: str = "text-embedding-3-small"
    ) -> None:
        """Initialize the OpenAI embedder.

        Args:
            openai_client: Client used to issue the embedding requests.
            model_name: Model requested from the API; also selects the
                tokenizer used to size batches.

        """
        self.log = structlog.get_logger(__name__)
        self.log.info("Creating OpenAI embedder", model_name=model_name)
        self.openai_client = openai_client
        # Kept so requests use the configured model rather than a literal.
        self.model_name = model_name
        self.encoding = tiktoken.encoding_for_model(model_name)

    async def embed(
        self,
        data: list[EmbeddingInput],
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed a list of documents.

        The inputs are split into token-bounded batches which are sent
        concurrently, at most OPENAI_NUM_PARALLEL_TASKS at a time. Outputs are
        yielded as batches finish, so they may not follow input order; use
        the ``id`` on each output to match it to its input. A batch whose
        request fails is logged and contributes no outputs.
        """
        # First split the list into a list of list where each sublist has fewer than
        # max tokens.
        batched_data = _split_sub_batches(self.encoding, data)

        # Process batches in parallel with a semaphore to limit concurrent requests
        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

        async def process_batch(batch: list[EmbeddingInput]) -> list[EmbeddingOutput]:
            async with sem:
                try:
                    response = await self.openai_client.embeddings.create(
                        model=self.model_name,
                        input=[i.text for i in batch],
                    )
                    return [
                        EmbeddingOutput(i.id, x.embedding)
                        for i, x in zip(batch, response.data, strict=False)
                    ]
                except Exception as e:
                    # Deliberate best-effort: one failed batch must not abort
                    # the remaining ones.
                    self.log.exception("Error embedding batch", error=str(e))
                    return []

        # Create tasks for all batches
        tasks = [process_batch(batch) for batch in batched_data]

        # Process all batches and yield results as they complete
        for task in asyncio.as_completed(tasks):
            embeddings = await task
            for e in embeddings:
                yield e

    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed query strings and yield their vectors in input order.

        Each string is tagged with its position, embedded through ``embed``,
        and the results are sorted by that position before being yielded,
        because ``embed`` may return batches out of order. Strings that
        ``embed`` drops (empty, too long, or in a failed batch) yield nothing.
        """
        inputs = [EmbeddingInput(i, text) for i, text in enumerate(data)]
        results = [e async for e in self.embed(inputs)]
        for e in sorted(results, key=lambda r: r.id):
            yield e.embedding
|
kodit-0.1.13.dist-info/RECORD
DELETED
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
|
|
2
|
-
kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
|
|
3
|
-
kodit/_version.py,sha256=Ln0urWB3R3JaxFwIIvoej0v08KbDCO89NUBxWx-zj0U,513
|
|
4
|
-
kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
|
|
5
|
-
kodit/cli.py,sha256=VLoXFS1xJnQ0TLy3_cO8-B9tCb4NJHiYPfzZtHxpgRY,7784
|
|
6
|
-
kodit/config.py,sha256=TDcLt6fiJn9cI1PoO5AqBqsL_Bxmm9JV5GqRxhj1tLw,4202
|
|
7
|
-
kodit/database.py,sha256=kekSdyEATdb47jxzQemkSOXMNOwnUwmVVTpn9hYaDK8,2356
|
|
8
|
-
kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
|
|
9
|
-
kodit/mcp.py,sha256=I_ZFzQOR0gyS8LO8td-q-utPZpqiOnIkn7O-SIBUi0g,4384
|
|
10
|
-
kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
|
|
11
|
-
kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
|
|
12
|
-
kodit/bm25/bm25.py,sha256=JtgJfsHz-2SHx96zxWjkPFSH7fXkahFMp01cDwl4YBg,2298
|
|
13
|
-
kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
|
|
14
|
-
kodit/embedding/embedding.py,sha256=EMJpHK8ICZk_FjiO9Aqr2IO20qkGOmj_PfA1hyfI7Vk,6745
|
|
15
|
-
kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
|
|
16
|
-
kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
|
|
17
|
-
kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
|
|
18
|
-
kodit/indexing/indexing_repository.py,sha256=7bkAiBwtr3qlkdhNIalwMwbxezVz_RQGOhLVWPKHwNk,5506
|
|
19
|
-
kodit/indexing/indexing_service.py,sha256=VGfKgbkYEAYP_gIubvhMxo3yThT20ndS5xdg2LxwRgA,6685
|
|
20
|
-
kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
|
|
21
|
-
kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
|
|
22
|
-
kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
|
|
23
|
-
kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
|
|
24
|
-
kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
|
|
25
|
-
kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
|
|
26
|
-
kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
|
|
27
|
-
kodit/search/__init__.py,sha256=4QbdjbrlhNKMovmuKHxJnUeZT7KNjTTFU0GdnuwUHdQ,36
|
|
28
|
-
kodit/search/search_repository.py,sha256=r1fkV6-cy9BKsy5J4WTHaY_FcjMaT1PV5qqqq0gvjZw,5833
|
|
29
|
-
kodit/search/search_service.py,sha256=KePkqCAc3CUcrpNsbDc5DqbF6W2m0TG6TDa9-VSJZS0,4227
|
|
30
|
-
kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
|
|
31
|
-
kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
|
|
32
|
-
kodit/snippets/snippets.py,sha256=mwN0bM1Msu8ZeEsUHyQ7tx3Hj3vZsm8G7Wu4eWSkLY8,1539
|
|
33
|
-
kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
|
|
34
|
-
kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
|
|
35
|
-
kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
|
|
36
|
-
kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
|
|
37
|
-
kodit/source/source_models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
|
|
38
|
-
kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
|
|
39
|
-
kodit/source/source_service.py,sha256=qBV9FCFQbJppeFrVo4uMgvC_mzWRIKldymp5yqLx9pw,9255
|
|
40
|
-
kodit-0.1.13.dist-info/METADATA,sha256=Od1OTG0tkd0Cf82juR2DGKBQ8l1RwHQ5VLgtiIW5qeA,2349
|
|
41
|
-
kodit-0.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
42
|
-
kodit-0.1.13.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
|
|
43
|
-
kodit-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
44
|
-
kodit-0.1.13.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|