kodit: 0.1.14-py3-none-any.whl → 0.1.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)
  1. kodit/_version.py +2 -2
  2. kodit/bm25/keyword_search_factory.py +17 -0
  3. kodit/bm25/keyword_search_service.py +34 -0
  4. kodit/bm25/{bm25.py → local_bm25.py} +40 -14
  5. kodit/bm25/vectorchord_bm25.py +193 -0
  6. kodit/cli.py +114 -25
  7. kodit/config.py +9 -2
  8. kodit/database.py +4 -2
  9. kodit/embedding/embedding_factory.py +44 -0
  10. kodit/embedding/embedding_provider/__init__.py +1 -0
  11. kodit/embedding/embedding_provider/embedding_provider.py +60 -0
  12. kodit/embedding/embedding_provider/hash_embedding_provider.py +77 -0
  13. kodit/embedding/embedding_provider/local_embedding_provider.py +58 -0
  14. kodit/embedding/embedding_provider/openai_embedding_provider.py +75 -0
  15. kodit/{search/search_repository.py → embedding/embedding_repository.py} +61 -33
  16. kodit/embedding/local_vector_search_service.py +50 -0
  17. kodit/embedding/vector_search_service.py +38 -0
  18. kodit/embedding/vectorchord_vector_search_service.py +154 -0
  19. kodit/enrichment/__init__.py +1 -0
  20. kodit/enrichment/enrichment_factory.py +23 -0
  21. kodit/enrichment/enrichment_provider/__init__.py +1 -0
  22. kodit/enrichment/enrichment_provider/enrichment_provider.py +16 -0
  23. kodit/enrichment/enrichment_provider/local_enrichment_provider.py +63 -0
  24. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +77 -0
  25. kodit/enrichment/enrichment_service.py +33 -0
  26. kodit/indexing/fusion.py +67 -0
  27. kodit/indexing/indexing_repository.py +44 -4
  28. kodit/indexing/indexing_service.py +142 -31
  29. kodit/mcp.py +31 -18
  30. kodit/snippets/languages/go.scm +26 -0
  31. kodit/source/source_service.py +9 -3
  32. kodit/util/__init__.py +1 -0
  33. kodit/util/spinner.py +59 -0
  34. {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/METADATA +4 -1
  35. kodit-0.1.16.dist-info/RECORD +64 -0
  36. kodit/embedding/embedding.py +0 -203
  37. kodit/search/__init__.py +0 -1
  38. kodit/search/search_service.py +0 -147
  39. kodit-0.1.14.dist-info/RECORD +0 -44
  40. {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/WHEEL +0 -0
  41. {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/entry_points.txt +0 -0
  42. {kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/licenses/LICENSE +0 -0
kodit/source/source_service.py CHANGED
@@ -109,6 +109,8 @@ class SourceService:
             uri_or_path_like = uri_or_path_like + ".git"
         try:
             return await self._create_git_source(uri_or_path_like)
+        except git.GitCommandError:
+            raise
         except ValueError:
             pass
 
@@ -197,11 +199,14 @@ class SourceService:
         clone_path.mkdir(parents=True, exist_ok=True)
 
         try:
-            # Clone the repository
+            self.log.info("Cloning repository", uri=uri, clone_path=str(clone_path))
             git.Repo.clone_from(uri, clone_path)
         except git.GitCommandError as e:
-            msg = f"Failed to clone repository: {e}"
-            raise ValueError(msg) from e
+            if "already exists and is not an empty directory" in str(e):
+                self.log.info("Repository already exists, reusing...", uri=uri)
+            else:
+                msg = f"Failed to clone repository: {e}"
+                raise ValueError(msg) from e
 
         source = await self.repository.create_source(
             Source(uri=uri, cloned_path=str(clone_path)),
@@ -212,6 +217,7 @@
         file_count = sum(1 for _ in clone_path.rglob("*") if _.is_file())
 
         # Process each file in the source directory
+        self.log.info("Inspecting files", source_id=source.id)
         for path in tqdm(clone_path.rglob("*"), total=file_count, leave=False):
             await self._process_file(source.id, path.absolute())
 
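The net effect of the clone hunks is that re-indexing a source whose clone directory is already populated no longer raises. A minimal standalone sketch of the same pattern, assuming GitPython is installed (clone_or_reuse and the example URI/path are illustrative, not part of kodit):

from pathlib import Path

import git


def clone_or_reuse(uri: str, clone_path: Path) -> None:
    """Clone uri into clone_path, silently reusing an existing non-empty clone."""
    clone_path.mkdir(parents=True, exist_ok=True)
    try:
        git.Repo.clone_from(uri, clone_path)
    except git.GitCommandError as e:
        # git itself reports this when the target directory already has content.
        if "already exists and is not an empty directory" in str(e):
            return  # reuse the existing clone
        msg = f"Failed to clone repository: {e}"
        raise ValueError(msg) from e


clone_or_reuse("https://github.com/helixml/kodit.git", Path("/tmp/kodit-clone"))
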
kodit/util/__init__.py ADDED
@@ -0,0 +1 @@
+"""Utility functions and classes."""
kodit/util/spinner.py ADDED
@@ -0,0 +1,59 @@
+"""Spinner for long-running tasks."""
+
+import itertools
+import sys
+import threading
+import time
+
+
+class Spinner:
+    """Spinner for long-running tasks."""
+
+    def __init__(self, delay: float = 0.1) -> None:
+        """Initialize the spinner."""
+        self.spinner = itertools.cycle(["-", "/", "|", "\\"])
+        self.delay = delay
+        self.busy = False
+        self.spinner_visible = False
+
+    def write_next(self) -> None:
+        """Write the next character of the spinner."""
+        with self._screen_lock:
+            if not self.spinner_visible:
+                sys.stdout.write(next(self.spinner))
+                self.spinner_visible = True
+                sys.stdout.flush()
+
+    def remove_spinner(self, cleanup: bool = False) -> None:  # noqa: FBT001, FBT002
+        """Remove the spinner."""
+        with self._screen_lock:
+            if self.spinner_visible:
+                sys.stdout.write("\b")
+                self.spinner_visible = False
+                if cleanup:
+                    sys.stdout.write(" ")  # overwrite spinner with blank
+                    sys.stdout.write("\r")  # move to next line
+                sys.stdout.flush()
+
+    def spinner_task(self) -> None:
+        """Task that runs the spinner."""
+        while self.busy:
+            self.write_next()
+            time.sleep(self.delay)
+            self.remove_spinner()
+
+    def __enter__(self) -> None:
+        """Enter the context manager."""
+        if sys.stdout.isatty():
+            self._screen_lock = threading.Lock()
+            self.busy = True
+            self.thread = threading.Thread(target=self.spinner_task)
+            self.thread.start()
+
+    def __exit__(self, exception: object, value: object, tb: object) -> None:
+        """Exit the context manager."""
+        if sys.stdout.isatty():
+            self.busy = False
+            self.remove_spinner(cleanup=True)
+        else:
+            sys.stdout.write("\r")
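Spinner is used as a context manager and only animates when stdout is a TTY, so piped or redirected output is unaffected. A minimal usage sketch (the sleep stands in for any long-running call):

import time

from kodit.util.spinner import Spinner

with Spinner():
    time.sleep(5)  # e.g. cloning, indexing, or embedding work
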
{kodit-0.1.14.dist-info → kodit-0.1.16.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kodit
-Version: 0.1.14
+Version: 0.1.16
 Summary: Code indexing for better AI code generation
 Project-URL: Homepage, https://docs.helixml.tech/kodit/
 Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -15,12 +15,14 @@ Keywords: ai,indexing,mcp,rag
 Classifier: Development Status :: 2 - Pre-Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Code Generators
 Requires-Python: >=3.12
 Requires-Dist: aiofiles>=24.1.0
 Requires-Dist: aiosqlite>=0.20.0
 Requires-Dist: alembic>=1.15.2
 Requires-Dist: asgi-correlation-id>=4.3.4
+Requires-Dist: asyncpg>=0.30.0
 Requires-Dist: better-exceptions>=0.3.3
 Requires-Dist: bm25s[core]>=0.2.12
 Requires-Dist: click>=8.1.8
@@ -41,6 +43,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.40
 Requires-Dist: structlog>=25.3.0
 Requires-Dist: tdqm>=0.0.1
 Requires-Dist: tiktoken>=0.9.0
+Requires-Dist: transformers>=4.51.3
 Requires-Dist: tree-sitter-language-pack>=0.7.3
 Requires-Dist: tree-sitter>=0.24.0
 Requires-Dist: uritools>=5.0.0
kodit-0.1.16.dist-info/RECORD ADDED
@@ -0,0 +1,64 @@
+kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
+kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
+kodit/_version.py,sha256=VYJNWHISWEW-KD_clKUYcTY_Z30r993Sjws4URJIL0g,513
+kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
+kodit/cli.py,sha256=i7eEt0FdIQGEfXKFte-8fBcZZGE8BPXBp40aGwJDQGI,11323
+kodit/config.py,sha256=2W2u5J8j-Mbt-C4xzOuK-PeuDCx0S_rnCXPhBwvfLT4,4353
+kodit/database.py,sha256=WB1KpVxUYPgiJGU0gJa2hqytYB8wJEJ5z3WayhWzNMU,2403
+kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
+kodit/mcp.py,sha256=QruyPskWB0_x59pkfj5BBeXuR13GMny5TAZEa2j4U9s,5752
+kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
+kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
+kodit/bm25/keyword_search_factory.py,sha256=rp-wx3DJsc2KlELK1V337EyeYvmwnMQwUqOo1WVPSmg,631
+kodit/bm25/keyword_search_service.py,sha256=aBbWQKgQmi2re3EIHdXFS00n7Wj3b2D0pZsLZ4qmHfE,754
+kodit/bm25/local_bm25.py,sha256=AAbFhbQDqyL3d7jsPL7W4HsLxdoYctaDsREUXOLy6jM,3260
+kodit/bm25/vectorchord_bm25.py,sha256=_nGrkUReYLLV-L8RIuIVLwjuhSYZl9T532n5OVf0kWs,6393
+kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
+kodit/embedding/embedding_factory.py,sha256=UGnFRyyQXazSUOwyW4Hg7Vq2-kfAoDj9lD4CTLu8x04,1630
+kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
+kodit/embedding/embedding_repository.py,sha256=-ux3scpBzel8c0pMH9fNOEsSXFIzl-IfgaWrkTb1szo,6907
+kodit/embedding/local_vector_search_service.py,sha256=hkF0qlfzjyGt400qIX9Mr6B7b7i8WvYIYWN2Z2C_pcs,1907
+kodit/embedding/vector_search_service.py,sha256=pQJ129QjGrAWOXzqkywmgtDRpy8_gtzYgkivyqF9Vrs,1009
+kodit/embedding/vectorchord_vector_search_service.py,sha256=KSs0IMFHHIllwq2d3A0LGqGGZDqO1Ht6K-BCfBBWW0Y,5051
+kodit/embedding/embedding_provider/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
+kodit/embedding/embedding_provider/embedding_provider.py,sha256=Tf3bwUsUMzAgoyLFM5qBtOLqPp1qr03TzrwGczkDvy0,1835
+kodit/embedding/embedding_provider/hash_embedding_provider.py,sha256=nAhlhh8j8PqqCCbhVl26Y8ntFBm2vJBCtB4X04g5Wwg,2638
+kodit/embedding/embedding_provider/local_embedding_provider.py,sha256=4ER-UPq506Y0TWU6qcs0nUqw6bSKQkSrdog-DhNQWM8,1906
+kodit/embedding/embedding_provider/openai_embedding_provider.py,sha256=V_jdUXiaGdslplwxMlfgFc4_hAVS2eaJXMTs2C7RiLI,2666
+kodit/enrichment/__init__.py,sha256=vBEolHpKaHUhfINX0dSGyAPlvgpLNAer9YzFtdvCB24,18
+kodit/enrichment/enrichment_factory.py,sha256=vKjkUTdhj74IW2S4GENDWdWMJx6BwUSZjJGDC0i7DSk,787
+kodit/enrichment/enrichment_service.py,sha256=87Sd3gGbEMJYb_wVrHG8L1yGIZmQNR7foUS4_y94azI,977
+kodit/enrichment/enrichment_provider/__init__.py,sha256=klf8iuLVWX4iRz-DZQauFFNAoJC5CByczh48TBZPW-o,27
+kodit/enrichment/enrichment_provider/enrichment_provider.py,sha256=E0H5rq3OENM0yYbA8K_3nSnj5lUHCpoIOqpWLo-2MVU,413
+kodit/enrichment/enrichment_provider/local_enrichment_provider.py,sha256=bR6HR1gH7wtZdMLOwaKdASjvllRo1FlNW9GyZC11zAM,2164
+kodit/enrichment/enrichment_provider/openai_enrichment_provider.py,sha256=gYuFTAeIVdQNlCUvNSPgRoiRwCvRD0C8419h8ubyABA,2725
+kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
+kodit/indexing/fusion.py,sha256=TZb4fPAedXdEUXzwzOofW98QIOymdbclBOP1KOijuEk,1674
+kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
+kodit/indexing/indexing_repository.py,sha256=GYHoACUWYKQdVTwP7tfik_TMUD1WUK76nywH88eCSwg,7006
+kodit/indexing/indexing_service.py,sha256=tKcZpi0pzsmF6OpqnqF0Q5HfSXxi5iLTysrVSou4JiQ,10579
+kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
+kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
+kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
+kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
+kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
+kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
+kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
+kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
+kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
+kodit/snippets/snippets.py,sha256=mwN0bM1Msu8ZeEsUHyQ7tx3Hj3vZsm8G7Wu4eWSkLY8,1539
+kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
+kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
+kodit/snippets/languages/go.scm,sha256=SEX9mTOrhP2KiQW7oflDKkd21u5dK56QbJ4LvTDxY8A,533
+kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
+kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
+kodit/source/source_models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
+kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
+kodit/source/source_service.py,sha256=u_GaH07ewakThQJRfT8O_yZ54A52qLtJuM1bF3xUT2A,9633
+kodit/util/__init__.py,sha256=bPu6CtqDWCRGU7VgW2_aiQrCBi8G89FS6k1PjvDajJ0,37
+kodit/util/spinner.py,sha256=R9bzrHtBiIH6IfLbmsIVHL53s8vg-tqW4lwGGALu4dw,1932
+kodit-0.1.16.dist-info/METADATA,sha256=1lR4ZSTiRBzUv9Gj8FPspv4GU2vWGQU6HSiffWgU2Do,2467
+kodit-0.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kodit-0.1.16.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
+kodit-0.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+kodit-0.1.16.dist-info/RECORD,,
kodit/embedding/embedding.py DELETED
@@ -1,203 +0,0 @@
-"""Embedding service."""
-
-import asyncio
-import os
-from abc import ABC, abstractmethod
-from collections.abc import AsyncGenerator
-from typing import NamedTuple
-
-import structlog
-import tiktoken
-from openai import AsyncOpenAI
-from sentence_transformers import SentenceTransformer
-
-TINY = "tiny"
-CODE = "code"
-TEST = "test"
-
-COMMON_EMBEDDING_MODELS = {
-    TINY: "ibm-granite/granite-embedding-30m-english",
-    CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
-    TEST: "minishlab/potion-base-4M",
-}
-
-
-class EmbeddingInput(NamedTuple):
-    """Input for embedding."""
-
-    id: int
-    text: str
-
-
-class EmbeddingOutput(NamedTuple):
-    """Output for embedding."""
-
-    id: int
-    embedding: list[float]
-
-
-class Embedder(ABC):
-    """Embedder interface."""
-
-    @abstractmethod
-    def embed(
-        self, data: list[EmbeddingInput]
-    ) -> AsyncGenerator[EmbeddingOutput, None]:
-        """Embed a list of documents.
-
-        The embedding service accepts a massive list of id,strings to embed. Behind the
-        scenes it batches up requests and parallelizes them for performance according to
-        the specifics of the embedding service.
-
-        The id reference is required because the parallelization may return results out
-        of order.
-        """
-
-    @abstractmethod
-    def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
-        """Query the embedding model."""
-
-
-def embedding_factory(openai_client: AsyncOpenAI | None = None) -> Embedder:
-    """Create an embedding service."""
-    if openai_client is not None:
-        return OpenAIEmbedder(openai_client)
-    return LocalEmbedder(model_name=TINY)
-
-
-class LocalEmbedder(Embedder):
-    """Local embedder."""
-
-    def __init__(self, model_name: str) -> None:
-        """Initialize the local embedder."""
-        self.log = structlog.get_logger(__name__)
-        self.log.info("Creating local embedder", model_name=model_name)
-        self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
-        self.embedding_model = None
-        self.encoding = tiktoken.encoding_for_model("text-embedding-3-small")
-
-    def _model(self) -> SentenceTransformer:
-        """Get the embedding model."""
-        if self.embedding_model is None:
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
-            self.embedding_model = SentenceTransformer(
-                self.model_name,
-                trust_remote_code=True,
-                device="cpu",  # Force CPU so we don't have to install accelerate, etc.
-            )
-        return self.embedding_model
-
-    async def embed(
-        self, data: list[EmbeddingInput]
-    ) -> AsyncGenerator[EmbeddingOutput, None]:
-        """Embed a list of documents."""
-        model = self._model()
-
-        batched_data = _split_sub_batches(self.encoding, data)
-
-        for batch in batched_data:
-            embeddings = model.encode(
-                [i.text for i in batch], show_progress_bar=False, batch_size=4
-            )
-            for i, x in zip(batch, embeddings, strict=False):
-                yield EmbeddingOutput(i.id, [float(y) for y in x])
-
-    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
-        """Query the embedding model."""
-        model = self._model()
-        embeddings = model.encode(data, show_progress_bar=False, batch_size=4)
-        for embedding in embeddings:
-            yield [float(x) for x in embedding]
-
-
-OPENAI_MAX_EMBEDDING_SIZE = 8192
-OPENAI_NUM_PARALLEL_TASKS = 10
-
-
-def _split_sub_batches(
-    encoding: tiktoken.Encoding, data: list[EmbeddingInput]
-) -> list[list[EmbeddingInput]]:
-    """Split a list of strings into smaller sub-batches."""
-    log = structlog.get_logger(__name__)
-    result = []
-    data_to_process = [s for s in data if s.text.strip()]  # Filter out empty strings
-
-    while data_to_process:
-        next_batch = []
-        current_tokens = 0
-
-        while data_to_process:
-            next_item = data_to_process[0]
-            item_tokens = len(encoding.encode(next_item.text))
-
-            if item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
-                log.warning("Skipping too long snippet", snippet=data_to_process.pop(0))
-                continue
-
-            if current_tokens + item_tokens > OPENAI_MAX_EMBEDDING_SIZE:
-                break
-
-            next_batch.append(data_to_process.pop(0))
-            current_tokens += item_tokens
-
-        if next_batch:
-            result.append(next_batch)
-
-    return result
-
-
-class OpenAIEmbedder(Embedder):
-    """OpenAI embedder."""
-
-    def __init__(
-        self, openai_client: AsyncOpenAI, model_name: str = "text-embedding-3-small"
-    ) -> None:
-        """Initialize the OpenAI embedder."""
-        self.log = structlog.get_logger(__name__)
-        self.log.info("Creating OpenAI embedder", model_name=model_name)
-        self.openai_client = openai_client
-        self.encoding = tiktoken.encoding_for_model(model_name)
-        self.log = structlog.get_logger(__name__)
-
-    async def embed(
-        self,
-        data: list[EmbeddingInput],
-    ) -> AsyncGenerator[EmbeddingOutput, None]:
-        """Embed a list of documents."""
-        # First split the list into a list of list where each sublist has fewer than
-        # max tokens.
-        batched_data = _split_sub_batches(self.encoding, data)
-
-        # Process batches in parallel with a semaphore to limit concurrent requests
-        sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
-
-        async def process_batch(batch: list[EmbeddingInput]) -> list[EmbeddingOutput]:
-            async with sem:
-                try:
-                    response = await self.openai_client.embeddings.create(
-                        model="text-embedding-3-small",
-                        input=[i.text for i in batch],
-                    )
-                    return [
-                        EmbeddingOutput(i.id, x.embedding)
-                        for i, x in zip(batch, response.data, strict=False)
-                    ]
-                except Exception as e:
-                    self.log.exception("Error embedding batch", error=str(e))
-                    return []
-
-        # Create tasks for all batches
-        tasks = [process_batch(batch) for batch in batched_data]
-
-        # Process all batches and yield results as they complete
-        for task in asyncio.as_completed(tasks):
-            embeddings = await task
-            for e in embeddings:
-                yield e
-
-    async def query(self, data: list[str]) -> AsyncGenerator[list[float], None]:
-        """Query the embedding model."""
-        async for e in self.embed(
-            [EmbeddingInput(i, text) for i, text in enumerate(data)]
-        ):
-            yield e.embedding
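For reference, this removed module was driven through embedding_factory() and the async embed() generator; its responsibilities move to the new kodit/embedding/embedding_provider/ modules in 0.1.16 per the file list. A minimal sketch of how the 0.1.14 API was consumed (the snippet texts are illustrative):

import asyncio

from kodit.embedding.embedding import EmbeddingInput, embedding_factory


async def main() -> None:
    # No OpenAI client passed, so the local SentenceTransformer path is used.
    embedder = embedding_factory()
    inputs = [
        EmbeddingInput(0, "def add(a, b):\n    return a + b"),
        EmbeddingInput(1, "def sub(a, b):\n    return a - b"),
    ]
    # Results carry the input id because batches may complete out of order.
    async for output in embedder.embed(inputs):
        print(output.id, len(output.embedding))


asyncio.run(main())
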
kodit/search/__init__.py DELETED
@@ -1 +0,0 @@
-"""Search for relevant snippets."""
kodit/search/search_service.py DELETED
@@ -1,147 +0,0 @@
-"""Search service."""
-
-from pathlib import Path
-
-import pydantic
-import structlog
-
-from kodit.bm25.bm25 import BM25Service
-from kodit.embedding.embedding import Embedder
-from kodit.embedding.embedding_models import EmbeddingType
-from kodit.search.search_repository import SearchRepository
-
-
-class SearchRequest(pydantic.BaseModel):
-    """Request for a search."""
-
-    code_query: str | None = None
-    keywords: list[str] | None = None
-    top_k: int = 10
-
-
-class SearchResult(pydantic.BaseModel):
-    """Data transfer object for search results.
-
-    This model represents a single search result, containing both the file path
-    and the matching snippet content.
-    """
-
-    id: int
-    uri: str
-    content: str
-
-
-class Snippet(pydantic.BaseModel):
-    """Snippet model."""
-
-    content: str
-    file_path: str
-
-
-class SearchService:
-    """Service for searching for relevant data."""
-
-    def __init__(
-        self,
-        repository: SearchRepository,
-        data_dir: Path,
-        embedding_service: Embedder,
-    ) -> None:
-        """Initialize the search service."""
-        self.repository = repository
-        self.log = structlog.get_logger(__name__)
-        self.bm25 = BM25Service(data_dir)
-        self.code_embedding_service = embedding_service
-
-    async def search(self, request: SearchRequest) -> list[SearchResult]:
-        """Search for relevant data."""
-        fusion_list = []
-        if request.keywords:
-            snippet_ids = await self.repository.list_snippet_ids()
-
-            # Gather results for each keyword
-            result_ids: list[tuple[int, float]] = []
-            for keyword in request.keywords:
-                results = self.bm25.retrieve(snippet_ids, keyword, request.top_k)
-                result_ids.extend(results)
-
-            # Sort results by score
-            result_ids.sort(key=lambda x: x[1], reverse=True)
-
-            self.log.debug("Search results (BM25)", results=result_ids)
-
-            bm25_results = [x[0] for x in result_ids]
-            fusion_list.append(bm25_results)
-
-        # Compute embedding for semantic query
-        semantic_results = []
-        if request.code_query:
-            query_embedding = await anext(
-                self.code_embedding_service.query([request.code_query])
-            )
-
-            query_results = await self.repository.list_semantic_results(
-                EmbeddingType.CODE, query_embedding, top_k=request.top_k
-            )
-
-            # Sort results by score
-            query_results.sort(key=lambda x: x[1], reverse=True)
-
-            # Extract the snippet ids from the query results
-            semantic_results = [x[0] for x in query_results]
-            fusion_list.append(semantic_results)
-
-        if len(fusion_list) == 0:
-            return []
-
-        # Combine all results together with RFF if required
-        final_results = reciprocal_rank_fusion(fusion_list, k=60)
-
-        # Extract ids from final results
-        final_ids = [x[0] for x in final_results]
-
-        # Get snippets from database (up to top_k)
-        search_results = await self.repository.list_snippets_by_ids(
-            final_ids[: request.top_k]
-        )
-
-        return [
-            SearchResult(
-                id=snippet.id,
-                uri=file.uri,
-                content=snippet.content,
-            )
-            for file, snippet in search_results
-        ]
-
-
-def reciprocal_rank_fusion(
-    rankings: list[list[int]], k: float = 60
-) -> list[tuple[int, float]]:
-    """RRF prioritises results that are present in all results.
-
-    Args:
-        rankings: List of rankers, each containing a list of document ids. Top of the
-            list is considered to be the best result.
-        k: Parameter for RRF.
-
-    Returns:
-        Dictionary of ids and their scores.
-
-    """
-    scores = {}
-    for ranker in rankings:
-        for rank in ranker:
-            scores[rank] = float(0)
-
-    for ranker in rankings:
-        for i, rank in enumerate(ranker):
-            scores[rank] += 1.0 / (k + i)
-
-    # Create a list of tuples of ids and their scores
-    results = [(rank, scores[rank]) for rank in scores]
-
-    # Sort results by score
-    results.sort(key=lambda x: x[1], reverse=True)
-
-    return results
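The removed search path fused the BM25 and semantic rankings with reciprocal_rank_fusion, which scores each id by summing 1/(k + rank) across the ranked lists; per the file list, this fusion logic now lives in kodit/indexing/fusion.py. A small worked example using the function shown above (snippet ids are illustrative):

bm25_ranking = [3, 1, 7]      # best-first ids from keyword search
semantic_ranking = [1, 9, 3]  # best-first ids from vector search

fused = reciprocal_rank_fusion([bm25_ranking, semantic_ranking], k=60)
# ids 1 and 3 appear in both rankings, so they outscore the single-list ids:
# [(1, ~0.0331), (3, ~0.0328), (9, ~0.0164), (7, ~0.0161)]
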
kodit-0.1.14.dist-info/RECORD DELETED
@@ -1,44 +0,0 @@
-kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
-kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
-kodit/_version.py,sha256=O_r2EWoixTKREu-RyeL8e93UHfqprj1LCIlwiWXfHcg,513
-kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
-kodit/cli.py,sha256=VLoXFS1xJnQ0TLy3_cO8-B9tCb4NJHiYPfzZtHxpgRY,7784
-kodit/config.py,sha256=TDcLt6fiJn9cI1PoO5AqBqsL_Bxmm9JV5GqRxhj1tLw,4202
-kodit/database.py,sha256=kekSdyEATdb47jxzQemkSOXMNOwnUwmVVTpn9hYaDK8,2356
-kodit/log.py,sha256=HU1OmuxO4FcVw61k4WW7Y4WM7BrDaeplw1PcBHhuIZY,5434
-kodit/mcp.py,sha256=ot5CIH240mSXK3sJcxTf4lBfthq0tcMS8XBGTaHY-n8,5088
-kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
-kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
-kodit/bm25/bm25.py,sha256=JtgJfsHz-2SHx96zxWjkPFSH7fXkahFMp01cDwl4YBg,2298
-kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
-kodit/embedding/embedding.py,sha256=EMJpHK8ICZk_FjiO9Aqr2IO20qkGOmj_PfA1hyfI7Vk,6745
-kodit/embedding/embedding_models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
-kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
-kodit/indexing/indexing_models.py,sha256=6NX9HVcj6Pu9ePwHC7n-PWSyAgukpJq0nCNmUIigtbo,1282
-kodit/indexing/indexing_repository.py,sha256=7bkAiBwtr3qlkdhNIalwMwbxezVz_RQGOhLVWPKHwNk,5506
-kodit/indexing/indexing_service.py,sha256=VGfKgbkYEAYP_gIubvhMxo3yThT20ndS5xdg2LxwRgA,6685
-kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
-kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
-kodit/migrations/env.py,sha256=w1M7OZh-ZeR2dPHS0ByXAUxQjfZQ8xIzMseWuzLDTWw,2469
-kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
-kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
-kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
-kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
-kodit/search/__init__.py,sha256=4QbdjbrlhNKMovmuKHxJnUeZT7KNjTTFU0GdnuwUHdQ,36
-kodit/search/search_repository.py,sha256=r1fkV6-cy9BKsy5J4WTHaY_FcjMaT1PV5qqqq0gvjZw,5833
-kodit/search/search_service.py,sha256=KePkqCAc3CUcrpNsbDc5DqbF6W2m0TG6TDa9-VSJZS0,4227
-kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
-kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
-kodit/snippets/snippets.py,sha256=mwN0bM1Msu8ZeEsUHyQ7tx3Hj3vZsm8G7Wu4eWSkLY8,1539
-kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
-kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
-kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
-kodit/source/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
-kodit/source/source_models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
-kodit/source/source_repository.py,sha256=0EksMpoLzdkfe8S4eeCm4Sf7TuxsOzOzaF4BBsMYo-4,3163
-kodit/source/source_service.py,sha256=qBV9FCFQbJppeFrVo4uMgvC_mzWRIKldymp5yqLx9pw,9255
-kodit-0.1.14.dist-info/METADATA,sha256=acFpcf0ODyUSnA1hg4BPlLexpOEh-0yuaqsaWUNopOs,2349
-kodit-0.1.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kodit-0.1.14.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
-kodit-0.1.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-kodit-0.1.14.dist-info/RECORD,,