kodit 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kodit might be problematic.

Files changed (34)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +39 -19
  3. kodit/{infrastructure/indexing → application/services}/auto_indexing_service.py +9 -1
  4. kodit/application/services/code_indexing_application_service.py +16 -0
  5. kodit/application/services/sync_scheduler.py +4 -1
  6. kodit/config.py +22 -1
  7. kodit/domain/entities.py +5 -0
  8. kodit/domain/protocols.py +4 -0
  9. kodit/domain/services/index_query_service.py +5 -1
  10. kodit/domain/services/index_service.py +11 -0
  11. kodit/infrastructure/api/__init__.py +1 -0
  12. kodit/infrastructure/api/middleware/__init__.py +1 -0
  13. kodit/infrastructure/api/middleware/auth.py +34 -0
  14. kodit/infrastructure/api/v1/__init__.py +5 -0
  15. kodit/infrastructure/api/v1/dependencies.py +70 -0
  16. kodit/infrastructure/api/v1/routers/__init__.py +6 -0
  17. kodit/infrastructure/api/v1/routers/indexes.py +114 -0
  18. kodit/infrastructure/api/v1/routers/search.py +74 -0
  19. kodit/infrastructure/api/v1/schemas/__init__.py +25 -0
  20. kodit/infrastructure/api/v1/schemas/context.py +11 -0
  21. kodit/infrastructure/api/v1/schemas/index.py +101 -0
  22. kodit/infrastructure/api/v1/schemas/search.py +219 -0
  23. kodit/infrastructure/bm25/local_bm25_repository.py +4 -4
  24. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +4 -1
  25. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +2 -9
  26. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +4 -10
  27. kodit/infrastructure/sqlalchemy/index_repository.py +29 -0
  28. kodit/infrastructure/ui/progress.py +43 -0
  29. kodit/utils/dump_openapi.py +37 -0
  30. {kodit-0.3.10.dist-info → kodit-0.3.12.dist-info}/METADATA +16 -1
  31. {kodit-0.3.10.dist-info → kodit-0.3.12.dist-info}/RECORD +34 -21
  32. {kodit-0.3.10.dist-info → kodit-0.3.12.dist-info}/WHEEL +0 -0
  33. {kodit-0.3.10.dist-info → kodit-0.3.12.dist-info}/entry_points.txt +0 -0
  34. {kodit-0.3.10.dist-info → kodit-0.3.12.dist-info}/licenses/LICENSE +0 -0

kodit/infrastructure/api/v1/schemas/index.py
@@ -0,0 +1,101 @@
+"""JSON:API schemas for index operations."""
+
+from datetime import datetime
+
+from pydantic import BaseModel, Field
+
+
+class IndexAttributes(BaseModel):
+    """Index attributes for JSON:API responses."""
+
+    created_at: datetime
+    updated_at: datetime
+    uri: str
+
+
+class SnippetData(BaseModel):
+    """Snippet data for JSON:API relationships."""
+
+    type: str = "snippet"
+    id: str
+
+
+class IndexData(BaseModel):
+    """Index data for JSON:API responses."""
+
+    type: str = "index"
+    id: str
+    attributes: IndexAttributes
+
+
+class IndexResponse(BaseModel):
+    """JSON:API response for single index."""
+
+    data: IndexData
+
+
+class IndexListResponse(BaseModel):
+    """JSON:API response for index list."""
+
+    data: list[IndexData]
+
+
+class IndexCreateAttributes(BaseModel):
+    """Attributes for creating an index."""
+
+    uri: str = Field(..., description="URI of the source to index")
+
+
+class IndexCreateData(BaseModel):
+    """Data for creating an index."""
+
+    type: str = "index"
+    attributes: IndexCreateAttributes
+
+
+class IndexCreateRequest(BaseModel):
+    """JSON:API request for creating an index."""
+
+    data: IndexCreateData
+
+
+class AuthorData(BaseModel):
+    """Author data for JSON:API relationships."""
+
+    type: str = "author"
+    id: str
+
+
+class AuthorsRelationship(BaseModel):
+    """Authors relationship for JSON:API."""
+
+    data: list[AuthorData]
+
+
+class FileRelationships(BaseModel):
+    """File relationships for JSON:API."""
+
+    authors: AuthorsRelationship
+
+
+class FileAttributes(BaseModel):
+    """File attributes for JSON:API included resources."""
+
+    uri: str
+    sha256: str
+    mime_type: str
+    created_at: datetime
+    updated_at: datetime
+
+
+class AuthorAttributes(BaseModel):
+    """Author attributes for JSON:API included resources."""
+
+    name: str
+    email: str
+
+
+class IndexDetailResponse(BaseModel):
+    """JSON:API response for index details with included resources."""
+
+    data: IndexData
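
For orientation, here is a minimal sketch (not part of the diff) of how the new index schemas could be used to build a JSON:API create-index body. The import path follows the new module in the file list above; Pydantic v2's `model_dump_json()` and the example URI are assumptions.

```python
# Hypothetical usage sketch of the new index schemas (assumes Pydantic v2).
from kodit.infrastructure.api.v1.schemas.index import (
    IndexCreateAttributes,
    IndexCreateData,
    IndexCreateRequest,
)

# Build the JSON:API body for creating an index from a source URI.
request = IndexCreateRequest(
    data=IndexCreateData(
        attributes=IndexCreateAttributes(uri="https://github.com/helixml/kodit")
    )
)

# Serialises to: {"data": {"type": "index", "attributes": {"uri": "..."}}}
print(request.model_dump_json())
```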

kodit/infrastructure/api/v1/schemas/search.py
@@ -0,0 +1,219 @@
+"""JSON:API schemas for search operations."""
+
+from datetime import datetime
+
+from pydantic import BaseModel, Field
+
+
+class SearchFilters(BaseModel):
+    """Search filters for JSON:API requests."""
+
+    languages: list[str] | None = Field(
+        None, description="Programming languages to filter by"
+    )
+    authors: list[str] | None = Field(None, description="Authors to filter by")
+    start_date: datetime | None = Field(
+        None, description="Filter snippets created after this date"
+    )
+    end_date: datetime | None = Field(
+        None, description="Filter snippets created before this date"
+    )
+    sources: list[str] | None = Field(
+        None, description="Source repositories to filter by"
+    )
+    file_patterns: list[str] | None = Field(
+        None, description="File path patterns to filter by"
+    )
+
+
+class SearchAttributes(BaseModel):
+    """Search attributes for JSON:API requests."""
+
+    keywords: list[str] | None = Field(None, description="Search keywords")
+    code: str | None = Field(None, description="Code search query")
+    text: str | None = Field(None, description="Text search query")
+    limit: int | None = Field(10, description="Maximum number of results to return")
+    filters: SearchFilters | None = Field(None, description="Search filters")
+
+
+class SearchData(BaseModel):
+    """Search data for JSON:API requests."""
+
+    type: str = "search"
+    attributes: SearchAttributes
+
+
+class SearchRequest(BaseModel):
+    """JSON:API request for searching snippets."""
+
+    data: SearchData
+
+    @property
+    def limit(self) -> int | None:
+        """Get the limit from the search request."""
+        return self.data.attributes.limit
+
+    @property
+    def languages(self) -> list[str] | None:
+        """Get the languages from the search request."""
+        return (
+            self.data.attributes.filters.languages
+            if self.data.attributes.filters
+            else None
+        )
+
+    @property
+    def authors(self) -> list[str] | None:
+        """Get the authors from the search request."""
+        return (
+            self.data.attributes.filters.authors
+            if self.data.attributes.filters
+            else None
+        )
+
+    @property
+    def start_date(self) -> datetime | None:
+        """Get the start date from the search request."""
+        return (
+            self.data.attributes.filters.start_date
+            if self.data.attributes.filters
+            else None
+        )
+
+    @property
+    def end_date(self) -> datetime | None:
+        """Get the end date from the search request."""
+        return (
+            self.data.attributes.filters.end_date
+            if self.data.attributes.filters
+            else None
+        )
+
+    @property
+    def sources(self) -> list[str] | None:
+        """Get the sources from the search request."""
+        return (
+            self.data.attributes.filters.sources
+            if self.data.attributes.filters
+            else None
+        )
+
+    @property
+    def file_patterns(self) -> list[str] | None:
+        """Get the file patterns from the search request."""
+        return (
+            self.data.attributes.filters.file_patterns
+            if self.data.attributes.filters
+            else None
+        )
+
+
+class SnippetAttributes(BaseModel):
+    """Snippet attributes for JSON:API responses."""
+
+    content: str
+    created_at: datetime
+    updated_at: datetime
+    original_scores: list[float]
+    source_uri: str
+    relative_path: str
+    language: str
+    authors: list[str]
+    summary: str
+
+
+class SnippetData(BaseModel):
+    """Snippet data for JSON:API responses."""
+
+    type: str = "snippet"
+    id: int
+    attributes: SnippetAttributes
+
+
+class SearchResponse(BaseModel):
+    """JSON:API response for search results."""
+
+    data: list[SnippetData]
+
+
+class FileAttributes(BaseModel):
+    """File attributes for JSON:API included resources."""
+
+    uri: str
+    sha256: str
+    mime_type: str
+    created_at: datetime
+    updated_at: datetime
+
+
+class AuthorData(BaseModel):
+    """Author data for JSON:API relationships."""
+
+    type: str = "author"
+    id: int
+
+
+class AuthorsRelationship(BaseModel):
+    """Authors relationship for JSON:API."""
+
+    data: list[AuthorData]
+
+
+class FileRelationships(BaseModel):
+    """File relationships for JSON:API."""
+
+    authors: AuthorsRelationship
+
+
+class FileDataWithRelationships(BaseModel):
+    """File data with relationships for JSON:API included resources."""
+
+    type: str = "file"
+    id: int
+    attributes: FileAttributes
+    relationships: FileRelationships
+
+
+class AuthorAttributes(BaseModel):
+    """Author attributes for JSON:API included resources."""
+
+    name: str
+    email: str
+
+
+class AuthorDataWithAttributes(BaseModel):
+    """Author data with attributes for JSON:API included resources."""
+
+    type: str = "author"
+    id: int
+    attributes: AuthorAttributes
+
+
+class SearchResponseWithIncluded(BaseModel):
+    """JSON:API response for search results with included resources."""
+
+    data: list[SnippetData]
+    included: list[FileDataWithRelationships | AuthorDataWithAttributes] | None = None
+
+
+class SnippetDetailAttributes(BaseModel):
+    """Snippet detail attributes for JSON:API responses."""
+
+    created_at: datetime
+    updated_at: datetime
+    original_content: dict
+    summary_content: dict
+
+
+class SnippetDetailData(BaseModel):
+    """Snippet detail data for JSON:API responses."""
+
+    type: str = "snippet"
+    id: str
+    attributes: SnippetDetailAttributes
+
+
+class SnippetDetailResponse(BaseModel):
+    """JSON:API response for snippet details."""
+
+    data: SnippetDetailData
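
Similarly, a rough sketch (not from the package) of how a JSON:API search payload maps onto the convenience properties that `SearchRequest` adds above; Pydantic v2 keyword construction is assumed.

```python
# Hypothetical usage sketch of the new search schemas.
from kodit.infrastructure.api.v1.schemas.search import (
    SearchAttributes,
    SearchData,
    SearchFilters,
    SearchRequest,
)

request = SearchRequest(
    data=SearchData(
        attributes=SearchAttributes(
            code="async def main",
            limit=5,
            filters=SearchFilters(languages=["python"]),
        )
    )
)

# The properties flatten the nested JSON:API structure for callers.
assert request.limit == 5
assert request.languages == ["python"]
assert request.authors is None  # filters are set, but no authors were given
```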

kodit/infrastructure/bm25/local_bm25_repository.py
@@ -66,6 +66,7 @@ class LocalBM25Repository(BM25Repository):
             stemmer=self.stemmer,
             return_ids=False,
             show_progress=True,
+            lower=True,
         )

     async def index_documents(self, request: IndexRequest) -> None:
@@ -78,9 +79,8 @@ class LocalBM25Repository(BM25Repository):
         vocab = self._tokenize([doc.text for doc in request.documents])
         self._retriever().index(vocab, show_progress=False)
         self._retriever().save(self.index_path)
-        self.snippet_ids = self.snippet_ids + [
-            doc.snippet_id for doc in request.documents
-        ]
+        # Replace snippet_ids instead of appending, since the BM25 index is rebuilt
+        self.snippet_ids = [doc.snippet_id for doc in request.documents]
         async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
             await f.write(json.dumps(self.snippet_ids))

@@ -120,7 +120,7 @@ class LocalBM25Repository(BM25Repository):

         # Filter results by snippet_ids if provided
         filtered_results = []
-        for result, score in zip(results[0], scores[0], strict=False):
+        for result, score in zip(results[0], scores[0], strict=True):
             snippet_id = int(result)
             if score > 0.0 and (
                 request.snippet_ids is None or snippet_id in request.snippet_ids

kodit/infrastructure/bm25/vectorchord_bm25_repository.py
@@ -70,6 +70,9 @@ UPDATE_QUERY = f"""
 UPDATE {TABLE_NAME}
 SET embedding = tokenize(passage, '{TOKENIZER_NAME}')
 """ # noqa: S608
+# https://github.com/tensorchord/VectorChord-bm25:
+# We intentionally make it negative so that you can use the
+# default order by to get the most relevant documents first.
 SEARCH_QUERY = f"""
 SELECT
     snippet_id,
@@ -185,7 +188,7 @@ class VectorChordBM25Repository(BM25Repository):

     async def search(self, request: SearchRequest) -> list[SearchResult]:
         """Search documents using BM25."""
-        if not request.query or request.query == "":
+        if not request.query or request.query.strip() == "":
             return []

         if request.snippet_ids is not None:

kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py
@@ -112,15 +112,8 @@ class LocalEmbeddingProvider(EmbeddingProvider):

         except Exception as e:
             self.log.exception("Error generating embeddings", error=str(e))
-            # Return zero embeddings on error
-            responses = [
-                EmbeddingResponse(
-                    snippet_id=item.snippet_id,
-                    embedding=[0.0] * 1536, # Default embedding size
-                )
-                for item in batch
-            ]
-            yield responses
+            # Return no embeddings for this batch if there was an error
+            yield []

     def _split_sub_batches(
         self, encoding: "Encoding", data: list[EmbeddingRequest]

kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py
@@ -2,10 +2,10 @@

 import asyncio
 from collections.abc import AsyncGenerator
-from typing import Any

 import structlog
 import tiktoken
+from openai import AsyncOpenAI
 from tiktoken import Encoding

 from kodit.domain.services.embedding_service import EmbeddingProvider
@@ -25,7 +25,7 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
     """OpenAI embedding provider that uses OpenAI's embedding API."""

     def __init__(
-        self, openai_client: Any, model_name: str = "text-embedding-3-small"
+        self, openai_client: AsyncOpenAI, model_name: str = "text-embedding-3-small"
     ) -> None:
         """Initialize the OpenAI embedding provider.

@@ -99,14 +99,8 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
                 ]
             except Exception as e:
                 self.log.exception("Error embedding batch", error=str(e))
-                # Fall back to zero embeddings so pipeline can continue
-                return [
-                    EmbeddingResponse(
-                        snippet_id=item.snippet_id,
-                        embedding=[0.0] * 1536, # Default OpenAI dim
-                    )
-                    for item in batch
-                ]
+                # Return no embeddings for this batch if there was an error
+                return []

         tasks = [_process_batch(batch) for batch in batched_data]
         for task in asyncio.as_completed(tasks):

kodit/infrastructure/sqlalchemy/index_repository.py
@@ -577,3 +577,32 @@ class SqlAlchemyIndexRepository(IndexRepository):
                 domain_snippet, index.id
             )
             self._session.add(db_snippet)
+
+    async def delete(self, index: domain_entities.Index) -> None:
+        """Delete everything related to an index."""
+        # Delete all snippets and embeddings
+        await self.delete_snippets(index.id)
+
+        # Delete all author file mappings
+        stmt = delete(db_entities.AuthorFileMapping).where(
+            db_entities.AuthorFileMapping.file_id.in_(
+                [file.id for file in index.source.working_copy.files]
+            )
+        )
+        await self._session.execute(stmt)
+
+        # Delete all files
+        stmt = delete(db_entities.File).where(
+            db_entities.File.source_id == index.source.id
+        )
+        await self._session.execute(stmt)
+
+        # Delete the source
+        stmt = delete(db_entities.Source).where(
+            db_entities.Source.id == index.source.id
+        )
+        await self._session.execute(stmt)
+
+        # Delete the index
+        stmt = delete(db_entities.Index).where(db_entities.Index.id == index.id)
+        await self._session.execute(stmt)

kodit/infrastructure/ui/progress.py
@@ -2,6 +2,7 @@

 from collections.abc import Callable

+import structlog
 from tqdm import tqdm # type: ignore[import-untyped]

 from kodit.domain.interfaces import ProgressCallback
@@ -42,6 +43,43 @@ class TQDMProgressCallback(ProgressCallback):
         # TQDM will handle cleanup with leave=False


+class LogProgressCallback(ProgressCallback):
+    """Log-based progress callback for server environments."""
+
+    def __init__(self, milestone_interval: int = 10) -> None:
+        """Initialize with milestone logging interval.
+
+        Args:
+            milestone_interval: Percentage interval for logging (default: 10%)
+
+        """
+        self.milestone_interval = milestone_interval
+        self._last_logged_percentage = -1
+        self.log = structlog.get_logger()
+
+    async def on_progress(self, event: ProgressEvent) -> None:
+        """Log progress at milestone intervals."""
+        percentage = int(event.percentage)
+
+        # Log at milestone intervals (0%, 10%, 20%, etc.)
+        milestone = (percentage // self.milestone_interval) * self.milestone_interval
+
+        if milestone > self._last_logged_percentage and milestone <= percentage:
+            self.log.info(
+                "Progress milestone reached",
+                operation=event.operation,
+                percentage=milestone,
+                current=event.current,
+                total=event.total,
+                message=event.message,
+            )
+            self._last_logged_percentage = milestone
+
+    async def on_complete(self, operation: str) -> None:
+        """Log completion of the operation."""
+        self.log.info("Operation completed", operation=operation)
+
+
 class LazyProgressCallback(ProgressCallback):
     """Progress callback that only shows progress when there's actual work to do."""

@@ -125,3 +163,8 @@ def create_multi_stage_progress_callback() -> MultiStageProgressCallback:
     return MultiStageProgressCallback(
         lambda operation: create_progress_bar(operation, "items")
     )
+
+
+def create_log_progress_callback(milestone_interval: int = 10) -> LogProgressCallback:
+    """Create a log-based progress callback for server environments."""
+    return LogProgressCallback(milestone_interval=milestone_interval)
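
The milestone arithmetic in `LogProgressCallback.on_progress` can be illustrated in isolation. This standalone sketch (not part of the package) mirrors the bucketing logic above.

```python
# Standalone illustration of the milestone bucketing used by LogProgressCallback:
# a message is emitted only when progress crosses into a new interval bucket.
def milestones_logged(percentages: list[int], interval: int = 10) -> list[int]:
    logged: list[int] = []
    last = -1
    for pct in percentages:
        milestone = (pct // interval) * interval
        if last < milestone <= pct:
            logged.append(milestone)
            last = milestone
    return logged


# Progress reports of 3%, 12%, 13%, 27%, 45% log the 0%, 10%, 20% and 40% milestones.
assert milestones_logged([3, 12, 13, 27, 45]) == [0, 10, 20, 40]
```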

kodit/utils/dump_openapi.py
@@ -0,0 +1,37 @@
+"""Dump the OpenAPI json schema to a file."""
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from openapi_markdown.generator import to_markdown # type: ignore[import-untyped]
+from uvicorn.importer import import_from_string
+
+parser = argparse.ArgumentParser(prog="dump-openapi.py")
+parser.add_argument(
+    "app", help='App import string. Eg. "kodit.app:app"', default="kodit.app:app"
+)
+parser.add_argument("--out-dir", help="Output directory", default="docs/reference/api")
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    app = import_from_string(args.app)
+    openapi = app.openapi()
+    version = openapi.get("openapi", "unknown version")
+
+    # Remove any dev tags from the version by retaining only the semver part
+    git_tag = openapi["info"]["version"].split(".")[:3]
+    openapi["info"]["version"] = ".".join(git_tag)
+
+    output_json_file = Path(args.out_dir) / "openapi.json"
+
+    with output_json_file.open("w") as f:
+        json.dump(openapi, f, indent=2)
+
+    output_md_file = Path(args.out_dir) / "index.md"
+    templates_dir = Path(args.out_dir) / "templates"
+    options: dict[str, Any] = {}
+
+    to_markdown(str(output_json_file), str(output_md_file), str(templates_dir), options)
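
The script is argparse-driven, so it can presumably be run as `python -m kodit.utils.dump_openapi kodit.app:app --out-dir docs/reference/api`. The same schema can also be pulled programmatically; a small sketch, assuming the kodit FastAPI app is importable in the current environment:

```python
# Hypothetical programmatic equivalent of the dump script's first steps.
from uvicorn.importer import import_from_string

app = import_from_string("kodit.app:app")  # same default app string as the script
schema = app.openapi()                     # FastAPI generates the OpenAPI document
print(schema["info"]["title"], schema["info"]["version"])
```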

{kodit-0.3.10.dist-info → kodit-0.3.12.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kodit
-Version: 0.3.10
+Version: 0.3.12
 Summary: Code indexing for better AI code generation
 Project-URL: Homepage, https://docs.helixml.tech/kodit/
 Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -72,6 +72,8 @@ Kodit connects your AI coding assistant to external codebases to provide accurat

 </div>

+:star: _Help us reach more developers and grow the Helix community. Star this repo!_
+
 **Helix Kodit** is an **MCP server** that connects your AI coding assistant to external codebases. It can:

 - Improve your AI-assisted code by providing canonical examples direct from the source
@@ -120,6 +122,19 @@ intent. Kodit has been tested to work well with:
 - **New in 0.3**: Hybrid search combining BM25 keyword search with semantic search
 - **New in 0.4**: Enhanced MCP tools with rich context parameters and metadata

+### Hosted MCP Server
+
+**New in 0.4**: Try Kodit instantly with our hosted MCP server at [https://kodit.helix.ml/mcp](https://kodit.helix.ml/mcp)! No installation required - just add it to your AI coding assistant and start searching popular codebases immediately.
+
+The hosted server provides:
+
+- Pre-indexed popular open source repositories
+- Zero configuration - works out of the box
+- Same powerful search capabilities as self-hosted Kodit
+- Perfect for trying Kodit before setting up your own instance
+
+Find out more in the [hosted Kodit documentation](https://docs.helix.ml/kodit/reference/hosted-kodit/).
+
 ### Enterprise Ready

 Out of the box, Kodit works with a local SQLite database and very small, local models.