kodit 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (33) hide show
  1. kodit/_version.py +2 -2
  2. kodit/bm25/keyword_search_factory.py +17 -0
  3. kodit/bm25/keyword_search_service.py +34 -0
  4. kodit/bm25/{bm25.py → local_bm25.py} +40 -14
  5. kodit/bm25/vectorchord_bm25.py +193 -0
  6. kodit/cli.py +14 -11
  7. kodit/config.py +9 -2
  8. kodit/database.py +4 -2
  9. kodit/embedding/embedding_factory.py +44 -0
  10. kodit/embedding/embedding_provider/__init__.py +1 -0
  11. kodit/embedding/embedding_provider/embedding_provider.py +53 -0
  12. kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
  13. kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
  14. kodit/embedding/embedding_provider/openai_embedding_provider.py +63 -0
  15. kodit/embedding/embedding_repository.py +206 -0
  16. kodit/embedding/local_vector_search_service.py +50 -0
  17. kodit/embedding/vector_search_service.py +38 -0
  18. kodit/embedding/vectorchord_vector_search_service.py +145 -0
  19. kodit/indexing/indexing_repository.py +24 -4
  20. kodit/indexing/indexing_service.py +25 -30
  21. kodit/mcp.py +7 -3
  22. kodit/search/search_repository.py +0 -121
  23. kodit/search/search_service.py +12 -24
  24. kodit/source/source_service.py +9 -3
  25. kodit/util/__init__.py +1 -0
  26. kodit/util/spinner.py +59 -0
  27. {kodit-0.1.14.dist-info → kodit-0.1.15.dist-info}/METADATA +2 -1
  28. kodit-0.1.15.dist-info/RECORD +58 -0
  29. kodit/embedding/embedding.py +0 -203
  30. kodit-0.1.14.dist-info/RECORD +0 -44
  31. {kodit-0.1.14.dist-info → kodit-0.1.15.dist-info}/WHEEL +0 -0
  32. {kodit-0.1.14.dist-info → kodit-0.1.15.dist-info}/entry_points.txt +0 -0
  33. {kodit-0.1.14.dist-info → kodit-0.1.15.dist-info}/licenses/LICENSE +0 -0
@@ -1,203 +0,0 @@
1
- """Embedding service."""
2
-
3
- import asyncio
4
- import os
5
- from abc import ABC, abstractmethod
6
- from collections.abc import AsyncGenerator
7
- from typing import NamedTuple
8
-
9
- import structlog
10
- import tiktoken
11
- from openai import AsyncOpenAI
12
- from sentence_transformers import SentenceTransformer
13
-
14
# Short aliases for commonly used embedding models.
TINY = "tiny"
CODE = "code"
TEST = "test"

# Maps each alias to the full model identifier it stands for. Names that are
# not aliases are expected to be looked up with a fallback to the name itself.
COMMON_EMBEDDING_MODELS = {
    TINY: "ibm-granite/granite-embedding-30m-english",
    CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
    TEST: "minishlab/potion-base-4M",
}
23
-
24
-
25
class EmbeddingInput(NamedTuple):
    """A single piece of text to embed, tagged with a caller-supplied id."""

    # Identifier echoed back on the matching EmbeddingOutput so that results
    # can be paired with their inputs even when they arrive out of order.
    id: int
    # The text to embed.
    text: str
30
-
31
-
32
class EmbeddingOutput(NamedTuple):
    """The embedding vector produced for one input, tagged with that input's id."""

    # Identifier copied from the EmbeddingInput this vector was computed for.
    id: int
    # The embedding vector as plain Python floats.
    embedding: list[float]
37
-
38
-
39
class Embedder(ABC):
    """Abstract interface that every embedding backend implements."""

    @abstractmethod
    def embed(
        self, data: list[EmbeddingInput]
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed a list of documents.

        Callers may hand over a very large list of (id, text) pairs. The
        implementation is responsible for batching the requests and running
        them in parallel in whatever way suits the underlying embedding
        service.

        Because of that parallelism, results may be yielded in a different
        order from the inputs; the id on each output identifies the input it
        belongs to.
        """

    @abstractmethod
    def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed query strings, yielding one embedding vector per result."""
59
-
60
-
61
def embedding_factory(openai_client: AsyncOpenAI | None = None) -> Embedder:
    """Build the embedder to use.

    Returns an ``OpenAIEmbedder`` wrapping ``openai_client`` when one is
    supplied; otherwise falls back to a ``LocalEmbedder`` using the ``TINY``
    model alias.
    """
    if openai_client is None:
        return LocalEmbedder(model_name=TINY)
    return OpenAIEmbedder(openai_client)
66
-
67
-
68
class LocalEmbedder(Embedder):
    """Embedder that runs a sentence-transformers model in-process."""

    def __init__(self, model_name: str) -> None:
        """Record the model to use; the model itself is loaded on first use.

        ``model_name`` may be one of the aliases in ``COMMON_EMBEDDING_MODELS``
        or a full model identifier, which is used as given.
        """
        self.log = structlog.get_logger(__name__)
        self.log.info("Creating local embedder", model_name=model_name)
        self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
        self.embedding_model = None
        # Only used to count tokens when splitting inputs into sub-batches.
        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")

    def _model(self) -> SentenceTransformer:
        """Return the embedding model, loading it the first time it is needed."""
        if self.embedding_model is not None:
            return self.embedding_model

        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
        # NOTE(review): trust_remote_code=True presumably lets the model
        # repository run its own code at load time - confirm this is intended
        # for arbitrary user-supplied model names.
        self.embedding_model = SentenceTransformer(
            self.model_name,
            trust_remote_code=True,
            device="cpu",  # Force CPU so we don't have to install accelerate, etc.
        )
        return self.embedding_model

    async def embed(
        self, data: list[EmbeddingInput]
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed the given documents, one sub-batch at a time.

        Yields one ``EmbeddingOutput`` per embedded input, carrying the id of
        the input it was computed from.
        """
        model = self._model()

        for sub_batch in _split_sub_batches(self.encoding, data):
            texts = [item.text for item in sub_batch]
            vectors = model.encode(texts, show_progress_bar=False, batch_size=4)
            for item, vector in zip(sub_batch, vectors, strict=False):
                yield EmbeddingOutput(item.id, list(map(float, vector)))

    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed the query strings and yield each vector as a list of floats."""
        model = self._model()
        vectors = model.encode(data, show_progress_bar=False, batch_size=4)
        for vector in vectors:
            yield list(map(float, vector))
111
-
112
-
113
# Token budget: the largest single input that will be embedded, and also the
# cap on the combined token count of one sub-batch.
OPENAI_MAX_EMBEDDING_SIZE = 8192
# Upper bound on the number of embedding requests in flight at once.
OPENAI_NUM_PARALLEL_TASKS = 10
115
-
116
-
117
def _split_sub_batches(
    encoding: tiktoken.Encoding, data: list[EmbeddingInput]
) -> list[list[EmbeddingInput]]:
    """Group inputs into sub-batches that each fit within the token budget.

    Inputs are packed greedily, in their original order, so that the summed
    token count of each sub-batch does not exceed
    ``OPENAI_MAX_EMBEDDING_SIZE``.

    Args:
        encoding: Tokenizer used to count the tokens of each input's text.
        data: Inputs to split. The list itself is not modified.

    Returns:
        The sub-batches, in order. Inputs whose text is empty or
        whitespace-only are dropped, and inputs that on their own exceed
        ``OPENAI_MAX_EMBEDDING_SIZE`` tokens are skipped with a warning.

    """
    log = structlog.get_logger(__name__)
    result: list[list[EmbeddingInput]] = []
    current_batch: list[EmbeddingInput] = []
    current_tokens = 0

    # Single pass: each item is tokenized exactly once and never popped from
    # the front of a list, so the work is linear in the number of inputs.
    for item in data:
        if not item.text.strip():
            continue  # Filter out empty strings

        item_tokens = len(encoding.encode(item.text))

        if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
            log.warning("Skipping too long snippet", snippet=item)
            continue

        # Close the current sub-batch when this item would push it over budget;
        # the item then becomes the first entry of the next sub-batch.
        if current_batch and current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
            result.append(current_batch)
            current_batch = []
            current_tokens = 0

        current_batch.append(item)
        current_tokens += item_tokens

    if current_batch:
        result.append(current_batch)

    return result
147
-
148
-
149
class OpenAIEmbedder(Embedder):
    """Embedder backed by the OpenAI embeddings API."""

    def __init__(
        self, openai_client: AsyncOpenAI, model_name: str = "text-embedding-3-small"
    ) -> None:
        """Initialize the OpenAI embedder.

        Args:
            openai_client: Async OpenAI client used to issue embedding requests.
            model_name: Embedding model to request. It also selects the
                tokenizer used to size sub-batches.

        """
        self.log = structlog.get_logger(__name__)
        self.log.info("Creating OpenAI embedder", model_name=model_name)
        self.openai_client = openai_client
        # Kept so that requests use the same model the tokenizer was chosen for.
        self.model_name = model_name
        self.encoding = tiktoken.encoding_for_model(model_name)

    async def embed(
        self,
        data: list[EmbeddingInput],
    ) -> AsyncGenerator[EmbeddingOutput, None]:
        """Embed a list of documents.

        The inputs are split into token-bounded sub-batches, which are sent
        concurrently (at most ``OPENAI_NUM_PARALLEL_TASKS`` at a time). Results
        are yielded as sub-batches complete, so they may arrive in a different
        order from the inputs; use the id on each output to match them up.

        A sub-batch whose request fails is logged and contributes no outputs;
        the remaining sub-batches are still processed.
        """
        # First split the list into a list of lists where each sublist has
        # fewer than max tokens.
        batched_data = _split_sub_batches(self.encoding, data)

        # Process batches in parallel with a semaphore to limit concurrent requests
        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)

        async def process_batch(batch: list[EmbeddingInput]) -> list[EmbeddingOutput]:
            async with sem:
                try:
                    response = await self.openai_client.embeddings.create(
                        model=self.model_name,
                        input=[i.text for i in batch],
                    )
                    return [
                        EmbeddingOutput(i.id, x.embedding)
                        for i, x in zip(batch, response.data, strict=False)
                    ]
                except Exception as e:
                    # Deliberate best-effort: one failed batch must not abort
                    # the whole run.
                    self.log.exception("Error embedding batch", error=str(e))
                    return []

        # Create tasks for all batches
        tasks = [process_batch(batch) for batch in batched_data]

        # Process all batches and yield results as they complete
        for task in asyncio.as_completed(tasks):
            embeddings = await task
            for e in embeddings:
                yield e

    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
        """Embed query strings, yielding vectors in the order of ``data``.

        Each string is tagged with its position before being embedded, and the
        results are sorted by that position, because ``embed`` yields in
        completion order. Strings that produced no embedding (blank text,
        over-long text, or a failed request) are absent from the output.
        """
        inputs = [EmbeddingInput(i, text) for i, text in enumerate(data)]
        outputs = [e async for e in self.embed(inputs)]
        for output in sorted(outputs, key=lambda o: o.id):
            yield output.embedding
@@ -1,44 +0,0 @@
1
- kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
2
- kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
3
- kodit/_version.py,sha256=O_r2EWoixTKREu-RyeL8e93UHfqprj1LCIlwiWXfHcg,513
4
- kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
5
- kodit/cli.py,sha256=VLoXFS1xJnQ0TLy3_cO8-B9tCb4NJHiYPfzZtHxpgRY,7784
6
- kodit/config.py,sha256=TDcLt6fiJn9cI1PoO5AqBqsL_Bxmm9JV5GqRxhj1tLw,4202
7
- kodit/database.py,sha256=kekSdyEATdb47jxzQemkSOXMNOwnUwmVVTpn9hYaDK8,2356
8
- kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
9
- kodit/mcp.py,sha256=ot5CIH240mSXK3sJcxTf4lBfthq0tcMS8XBGTaHY-n8,5088
10
- kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
11
- kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
12
- kodit/bm25/bm25.py,sha256=JtgJfsHz-2SHx96zxWjkPFSH7fXkahFMp01cDwl4YBg,2298
13
- kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
14
- kodit/embedding/embedding.py,sha256=EMJpHK8ICZk_FjiO9Aqr2IO20qkGOmj_PfA1hyfI7Vk,6745
15
- kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
16
- kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
17
- kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
18
- kodit/indexing/indexing_repository.py,sha256=7bkAiBwtr3qlkdhNIalwMwbxezVz_RQGOhLVWPKHwNk,5506
19
- kodit/indexing/indexing_service.py,sha256=VGfKgbkYEAYP_gIubvhMxo3yThT20ndS5xdg2LxwRgA,6685
20
- kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
21
- kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
22
- kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
23
- kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
24
- kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
25
- kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
26
- kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
27
- kodit/search/__init__.py,sha256=4QbdjbrlhNKMovmuKHxJnUeZT7KNjTTFU0GdnuwUHdQ,36
28
- kodit/search/search_repository.py,sha256=r1fkV6-cy9BKsy5J4WTHaY_FcjMaT1PV5qqqq0gvjZw,5833
29
- kodit/search/search_service.py,sha256=KePkqCAc3CUcrpNsbDc5DqbF6W2m0TG6TDa9-VSJZS0,4227
30
- kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
31
- kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
32
- kodit/snippets/snippets.py,sha256=mwN0bM1Msu8ZeEsUHyQ7tx3Hj3vZsm8G7Wu4eWSkLY8,1539
33
- kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
34
- kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
35
- kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
36
- kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
37
- kodit/source/source_models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
38
- kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
39
- kodit/source/source_service.py,sha256=qBV9FCFQbJppeFrVo4uMgvC_mzWRIKldymp5yqLx9pw,9255
40
- kodit-0.1.14.dist-info/METADATA,sha256=acFpcf0ODyUSnA1hg4BPlLexpOEh-0yuaqsaWUNopOs,2349
41
- kodit-0.1.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
42
- kodit-0.1.14.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
43
- kodit-0.1.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- kodit-0.1.14.dist-info/RECORD,,
File without changes