kodit 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic; consult the registry's advisory page for more details.

kodit/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.2.1'
21
- __version_tuple__ = version_tuple = (0, 2, 1)
20
+ __version__ = version = '0.2.3'
21
+ __version_tuple__ = version_tuple = (0, 2, 3)
kodit/app.py CHANGED
@@ -21,6 +21,12 @@ async def root() -> dict[str, str]:
21
21
  return {"message": "Hello, World!"}
22
22
 
23
23
 
24
+ @app.get("/healthz")
25
+ async def healthz() -> dict[str, str]:
26
+ """Return a health check for the kodit API."""
27
+ return {"status": "ok"}
28
+
29
+
24
30
  # Add mcp routes last, otherwise previous routes aren't added
25
31
  app.mount("", mcp_app)
26
32
 
kodit/bm25/local_bm25.py CHANGED
@@ -64,6 +64,10 @@ class BM25Service(KeywordSearchProvider):
64
64
  async def index(self, corpus: list[BM25Document]) -> None:
65
65
  """Index a new corpus."""
66
66
  self.log.debug("Indexing corpus")
67
+ if not corpus or len(corpus) == 0:
68
+ self.log.warning("Corpus is empty, skipping bm25 index")
69
+ return
70
+
67
71
  vocab = self._tokenize([doc.text for doc in corpus])
68
72
  self._retriever().index(vocab, show_progress=False)
69
73
  self._retriever().save(self.index_path)
@@ -77,6 +81,10 @@ class BM25Service(KeywordSearchProvider):
77
81
  self.log.warning("Top k is 0, returning empty list")
78
82
  return []
79
83
 
84
+ # Check that the index has data
85
+ if not hasattr(self._retriever(), "scores"):
86
+ return []
87
+
80
88
  # Get the number of documents in the index
81
89
  num_docs = self._retriever().scores["num_docs"]
82
90
  if num_docs == 0:
@@ -2,6 +2,7 @@
2
2
 
3
3
  from typing import Any
4
4
 
5
+ import structlog
5
6
  from sqlalchemy import Result, TextClause, bindparam, text
6
7
  from sqlalchemy.ext.asyncio import AsyncSession
7
8
 
@@ -93,6 +94,7 @@ class VectorChordBM25(KeywordSearchProvider):
93
94
  """Initialize the VectorChord BM25."""
94
95
  self.__session = session
95
96
  self._initialized = False
97
+ self.log = structlog.get_logger(__name__)
96
98
 
97
99
  async def _initialize(self) -> None:
98
100
  """Initialize the VectorChord environment."""
@@ -149,7 +151,8 @@ class VectorChordBM25(KeywordSearchProvider):
149
151
  if doc.snippet_id is not None and doc.text is not None and doc.text != ""
150
152
  ]
151
153
 
152
- if not corpus:
154
+ if not corpus or len(corpus) == 0:
155
+ self.log.warning("Corpus is empty, skipping bm25 index")
153
156
  return
154
157
 
155
158
  # Execute inserts
kodit/cli.py CHANGED
@@ -81,6 +81,7 @@ async def index(
81
81
  )
82
82
 
83
83
  if not sources:
84
+ log_event("kodit.cli.index.list")
84
85
  # No source specified, list all indexes
85
86
  indexes = await service.list_indexes()
86
87
  headers: list[str | Cell] = [
@@ -108,7 +109,8 @@ async def index(
108
109
  msg = "File indexing is not implemented yet"
109
110
  raise click.UsageError(msg)
110
111
 
111
- # Index directory
112
+ # Index source
113
+ log_event("kodit.cli.index.create")
112
114
  s = await source_service.create(source)
113
115
  index = await service.create(s.id)
114
116
  await service.run(index.id)
@@ -134,6 +136,7 @@ async def code(
134
136
 
135
137
  This works best if your query is code.
136
138
  """
139
+ log_event("kodit.cli.search.code")
137
140
  source_repository = SourceRepository(session)
138
141
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
139
142
  repository = IndexRepository(session)
@@ -177,6 +180,7 @@ async def keyword(
177
180
  top_k: int,
178
181
  ) -> None:
179
182
  """Search for snippets using keyword search."""
183
+ log_event("kodit.cli.search.keyword")
180
184
  source_repository = SourceRepository(session)
181
185
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
182
186
  repository = IndexRepository(session)
@@ -223,6 +227,7 @@ async def text(
223
227
 
224
228
  This works best if your query is text.
225
229
  """
230
+ log_event("kodit.cli.search.text")
226
231
  source_repository = SourceRepository(session)
227
232
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
228
233
  repository = IndexRepository(session)
@@ -270,6 +275,7 @@ async def hybrid( # noqa: PLR0913
270
275
  text: str,
271
276
  ) -> None:
272
277
  """Search for snippets using hybrid search."""
278
+ log_event("kodit.cli.search.hybrid")
273
279
  source_repository = SourceRepository(session)
274
280
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
275
281
  repository = IndexRepository(session)
@@ -321,7 +327,7 @@ def serve(
321
327
  """Start the kodit server, which hosts the MCP server and the kodit API."""
322
328
  log = structlog.get_logger(__name__)
323
329
  log.info("Starting kodit server", host=host, port=port)
324
- log_event("kodit_server_started")
330
+ log_event("kodit.cli.serve")
325
331
 
326
332
  # Configure uvicorn with graceful shutdown
327
333
  config = uvicorn.Config(
kodit/config.py CHANGED
@@ -14,7 +14,6 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
14
14
  if TYPE_CHECKING:
15
15
  from collections.abc import Callable, Coroutine
16
16
 
17
- from openai import AsyncOpenAI
18
17
 
19
18
  from kodit.database import Database
20
19
 
@@ -25,13 +24,16 @@ DEFAULT_LOG_FORMAT = "pretty"
25
24
  DEFAULT_DISABLE_TELEMETRY = False
26
25
  T = TypeVar("T")
27
26
 
27
+ EndpointType = Literal["openai"]
28
+
28
29
 
29
30
  class Endpoint(BaseModel):
30
31
  """Endpoint provides configuration for an AI service."""
31
32
 
32
- type: Literal["openai"] = Field(default="openai")
33
- api_key: str | None = None
33
+ type: EndpointType | None = None
34
34
  base_url: str | None = None
35
+ model: str | None = None
36
+ api_key: str | None = None
35
37
 
36
38
 
37
39
  class Search(BaseModel):
@@ -57,15 +59,20 @@ class AppContext(BaseSettings):
57
59
  log_format: str = Field(default=DEFAULT_LOG_FORMAT)
58
60
  disable_telemetry: bool = Field(default=DEFAULT_DISABLE_TELEMETRY)
59
61
  default_endpoint: Endpoint | None = Field(
60
- default=Endpoint(
61
- type="openai",
62
- base_url="https://api.openai.com/v1",
63
- ),
62
+ default=None,
64
63
  description=(
65
64
  "Default endpoint to use for all AI interactions "
66
65
  "(can be overridden by task-specific configuration)."
67
66
  ),
68
67
  )
68
+ embedding_endpoint: Endpoint | None = Field(
69
+ default=None,
70
+ description="Endpoint to use for embedding.",
71
+ )
72
+ enrichment_endpoint: Endpoint | None = Field(
73
+ default=None,
74
+ description="Endpoint to use for enrichment.",
75
+ )
69
76
  default_search: Search = Field(
70
77
  default=Search(),
71
78
  )
@@ -95,23 +102,6 @@ class AppContext(BaseSettings):
95
102
  await self._db.run_migrations(self.db_url)
96
103
  return self._db
97
104
 
98
- def get_default_openai_client(self) -> AsyncOpenAI | None:
99
- """Get the default OpenAI client, if it is configured."""
100
- from openai import AsyncOpenAI
101
-
102
- endpoint = self.default_endpoint
103
- if not (
104
- endpoint
105
- and endpoint.type == "openai"
106
- and endpoint.api_key
107
- and endpoint.base_url
108
- ):
109
- return None
110
- return AsyncOpenAI(
111
- api_key=endpoint.api_key,
112
- base_url=endpoint.base_url,
113
- )
114
-
115
105
 
116
106
  with_app_context = click.make_pass_decorator(AppContext)
117
107
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
4
 
5
- from kodit.config import AppContext
5
+ from kodit.config import AppContext, Endpoint
6
6
  from kodit.embedding.embedding_provider.local_embedding_provider import (
7
7
  CODE,
8
8
  LocalEmbeddingProvider,
@@ -16,25 +16,44 @@ from kodit.embedding.vector_search_service import (
16
16
  VectorSearchService,
17
17
  )
18
18
  from kodit.embedding.vectorchord_vector_search_service import (
19
+ TaskName,
19
20
  VectorChordVectorSearchService,
20
21
  )
22
+ from kodit.log import log_event
23
+
24
+
25
+ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
26
+ """Get the endpoint configuration for the embedding service."""
27
+ return app_context.embedding_endpoint or app_context.default_endpoint or None
21
28
 
22
29
 
23
30
  def embedding_factory(
24
- task_name: str, app_context: AppContext, session: AsyncSession
31
+ task_name: TaskName, app_context: AppContext, session: AsyncSession
25
32
  ) -> VectorSearchService:
26
33
  """Create an embedding service."""
27
34
  embedding_repository = EmbeddingRepository(session=session)
28
- embedding_provider = None
29
- openai_client = app_context.get_default_openai_client()
30
- if openai_client is not None:
31
- embedding_provider = OpenAIEmbeddingProvider(openai_client=openai_client)
35
+ endpoint = _get_endpoint_configuration(app_context)
36
+
37
+ if endpoint and endpoint.type == "openai":
38
+ log_event("kodit.embedding", {"provider": "openai"})
39
+ from openai import AsyncOpenAI
40
+
41
+ embedding_provider = OpenAIEmbeddingProvider(
42
+ openai_client=AsyncOpenAI(
43
+ api_key=endpoint.api_key or "default",
44
+ base_url=endpoint.base_url or "https://api.openai.com/v1",
45
+ ),
46
+ model_name=endpoint.model or "text-embedding-3-small",
47
+ )
32
48
  else:
49
+ log_event("kodit.embedding", {"provider": "local"})
33
50
  embedding_provider = LocalEmbeddingProvider(CODE)
34
51
 
35
52
  if app_context.default_search.provider == "vectorchord":
53
+ log_event("kodit.database", {"provider": "vectorchord"})
36
54
  return VectorChordVectorSearchService(task_name, session, embedding_provider)
37
55
  if app_context.default_search.provider == "sqlite":
56
+ log_event("kodit.database", {"provider": "sqlite"})
38
57
  return LocalVectorSearchService(
39
58
  embedding_repository=embedding_repository,
40
59
  embedding_provider=embedding_provider,
@@ -39,14 +39,14 @@ def split_sub_batches(
39
39
 
40
40
  while data_to_process:
41
41
  next_item = data_to_process[0]
42
- item_tokens = len(encoding.encode(next_item))
42
+ item_tokens = len(encoding.encode(next_item, disallowed_special=()))
43
43
 
44
44
  if item_tokens > max_context_window:
45
45
  # Loop around trying to truncate the snippet until it fits in the max
46
46
  # embedding size
47
47
  while item_tokens > max_context_window:
48
48
  next_item = next_item[:-1]
49
- item_tokens = len(encoding.encode(next_item))
49
+ item_tokens = len(encoding.encode(next_item, disallowed_special=()))
50
50
 
51
51
  data_to_process[0] = next_item
52
52
 
@@ -27,7 +27,9 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
27
27
  self.log = structlog.get_logger(__name__)
28
28
  self.openai_client = openai_client
29
29
  self.model_name = model_name
30
- self.encoding = tiktoken.encoding_for_model(model_name)
30
+ self.encoding = tiktoken.encoding_for_model(
31
+ "text-embedding-3-small"
32
+ ) # Sensible default
31
33
 
32
34
  async def embed(self, data: list[str]) -> list[Vector]:
33
35
  """Embed a list of documents."""
@@ -29,6 +29,10 @@ class LocalVectorSearchService(VectorSearchService):
29
29
 
30
30
  async def index(self, data: list[VectorSearchRequest]) -> None:
31
31
  """Embed a list of documents."""
32
+ if not data or len(data) == 0:
33
+ self.log.warning("Embedding data is empty, skipping embedding")
34
+ return
35
+
32
36
  embeddings = await self.embedding_provider.embed([i.text for i in data])
33
37
  for i, x in zip(data, embeddings, strict=False):
34
38
  await self.embedding_repository.create_embedding(
@@ -1,7 +1,8 @@
1
1
  """Vectorchord vector search."""
2
2
 
3
- from typing import Any
3
+ from typing import Any, Literal
4
4
 
5
+ import structlog
5
6
  from sqlalchemy import Result, TextClause, text
6
7
  from sqlalchemy.ext.asyncio import AsyncSession
7
8
 
@@ -51,13 +52,15 @@ ORDER BY score ASC
51
52
  LIMIT :top_k;
52
53
  """
53
54
 
55
+ TaskName = Literal["code", "text"]
56
+
54
57
 
55
58
  class VectorChordVectorSearchService(VectorSearchService):
56
59
  """VectorChord vector search."""
57
60
 
58
61
  def __init__(
59
62
  self,
60
- task_name: str,
63
+ task_name: TaskName,
61
64
  session: AsyncSession,
62
65
  embedding_provider: EmbeddingProvider,
63
66
  ) -> None:
@@ -67,6 +70,7 @@ class VectorChordVectorSearchService(VectorSearchService):
67
70
  self._initialized = False
68
71
  self.table_name = f"vectorchord_{task_name}_embeddings"
69
72
  self.index_name = f"{self.table_name}_idx"
73
+ self.log = structlog.get_logger(__name__)
70
74
 
71
75
  async def _initialize(self) -> None:
72
76
  """Initialize the VectorChord environment."""
@@ -128,6 +132,10 @@ class VectorChordVectorSearchService(VectorSearchService):
128
132
 
129
133
  async def index(self, data: list[VectorSearchRequest]) -> None:
130
134
  """Embed a list of documents."""
135
+ if not data or len(data) == 0:
136
+ self.log.warning("Embedding data is empty, skipping embedding")
137
+ return
138
+
131
139
  embeddings = await self.embedding_provider.embed([doc.text for doc in data])
132
140
  # Execute inserts
133
141
  await self._execute(
@@ -1,6 +1,6 @@
1
1
  """Embedding service."""
2
2
 
3
- from kodit.config import AppContext
3
+ from kodit.config import AppContext, Endpoint
4
4
  from kodit.enrichment.enrichment_provider.local_enrichment_provider import (
5
5
  LocalEnrichmentProvider,
6
6
  )
@@ -11,13 +11,32 @@ from kodit.enrichment.enrichment_service import (
11
11
  EnrichmentService,
12
12
  LLMEnrichmentService,
13
13
  )
14
+ from kodit.log import log_event
15
+
16
+
17
+ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
18
+ """Get the endpoint configuration for the enrichment service."""
19
+ return app_context.enrichment_endpoint or app_context.default_endpoint or None
14
20
 
15
21
 
16
22
  def enrichment_factory(app_context: AppContext) -> EnrichmentService:
17
- """Create an embedding service."""
18
- openai_client = app_context.get_default_openai_client()
19
- if openai_client is not None:
20
- enrichment_provider = OpenAIEnrichmentProvider(openai_client=openai_client)
21
- return LLMEnrichmentService(enrichment_provider)
23
+ """Create an enrichment service."""
24
+ endpoint = _get_endpoint_configuration(app_context)
25
+ endpoint = app_context.enrichment_endpoint or app_context.default_endpoint or None
26
+
27
+ if endpoint and endpoint.type == "openai":
28
+ log_event("kodit.enrichment", {"provider": "openai"})
29
+ from openai import AsyncOpenAI
30
+
31
+ enrichment_provider = OpenAIEnrichmentProvider(
32
+ openai_client=AsyncOpenAI(
33
+ api_key=endpoint.api_key or "default",
34
+ base_url=endpoint.base_url or "https://api.openai.com/v1",
35
+ ),
36
+ model_name=endpoint.model or "gpt-4o-mini",
37
+ )
38
+ else:
39
+ log_event("kodit.enrichment", {"provider": "local"})
40
+ enrichment_provider = LocalEnrichmentProvider()
22
41
 
23
- return LLMEnrichmentService(LocalEnrichmentProvider())
42
+ return LLMEnrichmentService(enrichment_provider=enrichment_provider)
@@ -34,6 +34,10 @@ class LocalEnrichmentProvider(EnrichmentProvider):
34
34
 
35
35
  async def enrich(self, data: list[str]) -> list[str]:
36
36
  """Enrich a list of strings."""
37
+ if not data or len(data) == 0:
38
+ self.log.warning("Data is empty, skipping enrichment")
39
+ return []
40
+
37
41
  from transformers.models.auto.modeling_auto import (
38
42
  AutoModelForCausalLM,
39
43
  )
@@ -27,10 +27,14 @@ class OpenAIEnrichmentProvider(EnrichmentProvider):
27
27
  self.log = structlog.get_logger(__name__)
28
28
  self.openai_client = openai_client
29
29
  self.model_name = model_name
30
- self.encoding = tiktoken.encoding_for_model(model_name)
30
+ self.encoding = tiktoken.encoding_for_model("gpt-4o-mini") # Approximation
31
31
 
32
32
  async def enrich(self, data: list[str]) -> list[str]:
33
33
  """Enrich a list of documents."""
34
+ if not data or len(data) == 0:
35
+ self.log.warning("Data is empty, skipping enrichment")
36
+ return []
37
+
34
38
  # Process batches in parallel with a semaphore to limit concurrent requests
35
39
  sem = asyncio.Semaphore(OPENAI_NUM_PARALLEL_TASKS)
36
40
 
@@ -26,6 +26,7 @@ from kodit.enrichment.enrichment_service import EnrichmentService
26
26
  from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
27
27
  from kodit.indexing.indexing_models import Snippet
28
28
  from kodit.indexing.indexing_repository import IndexRepository
29
+ from kodit.log import log_event
29
30
  from kodit.snippets.snippets import SnippetService
30
31
  from kodit.source.source_service import SourceService
31
32
  from kodit.util.spinner import Spinner
@@ -45,7 +46,7 @@ class IndexView(pydantic.BaseModel):
45
46
  created_at: datetime
46
47
  updated_at: datetime | None = None
47
48
  source: str | None = None
48
- num_snippets: int | None = None
49
+ num_snippets: int
49
50
 
50
51
 
51
52
  class SearchRequest(pydantic.BaseModel):
@@ -119,6 +120,8 @@ class IndexService:
119
120
  ValueError: If the source doesn't exist or already has an index.
120
121
 
121
122
  """
123
+ log_event("kodit.index.create")
124
+
122
125
  # Check if the source exists
123
126
  source = await self.source_service.get(source_id)
124
127
 
@@ -129,6 +132,8 @@ class IndexService:
129
132
  return IndexView(
130
133
  id=index.id,
131
134
  created_at=index.created_at,
135
+ num_snippets=await self.repository.num_snippets_for_index(index.id),
136
+ source=source.uri,
132
137
  )
133
138
 
134
139
  async def list_indexes(self) -> list[IndexView]:
@@ -142,19 +147,33 @@ class IndexService:
142
147
  indexes = await self.repository.list_indexes()
143
148
 
144
149
  # Transform database results into DTOs
145
- return [
150
+ indexes = [
146
151
  IndexView(
147
152
  id=index.id,
148
153
  created_at=index.created_at,
149
154
  updated_at=index.updated_at,
150
- num_snippets=await self.repository.num_snippets_for_index(index.id),
155
+ num_snippets=await self.repository.num_snippets_for_index(index.id)
156
+ or 0,
151
157
  source=source.uri,
152
158
  )
153
159
  for index, source in indexes
154
160
  ]
155
161
 
162
+ # Help Kodit by measuring how much people are using indexes
163
+ log_event(
164
+ "kodit.index.list",
165
+ {
166
+ "num_indexes": len(indexes),
167
+ "num_snippets": sum([index.num_snippets for index in indexes]),
168
+ },
169
+ )
170
+
171
+ return indexes
172
+
156
173
  async def run(self, index_id: int) -> None:
157
174
  """Run the indexing process for a specific index."""
175
+ log_event("kodit.index.run")
176
+
158
177
  # Get and validate index
159
178
  index = await self.repository.get_by_id(index_id)
160
179
  if not index:
@@ -218,6 +237,8 @@ class IndexService:
218
237
 
219
238
  async def search(self, request: SearchRequest) -> list[SearchResult]:
220
239
  """Search for relevant data."""
240
+ log_event("kodit.index.search")
241
+
221
242
  fusion_list: list[list[FusionRequest]] = []
222
243
  if request.keywords:
223
244
  # Gather results for each keyword
@@ -289,6 +310,10 @@ class IndexService:
289
310
 
290
311
  """
291
312
  files = await self.repository.files_for_index(index_id)
313
+ if not files:
314
+ self.log.warning("No files to create snippets for")
315
+ return
316
+
292
317
  for file in tqdm(files, total=len(files), leave=False):
293
318
  # Skip unsupported file types
294
319
  if file.mime_type in MIME_BLACKLIST: