kodit 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (100)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +59 -24
  3. kodit/application/factories/reporting_factory.py +16 -7
  4. kodit/application/factories/server_factory.py +311 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +543 -0
  7. kodit/application/services/indexing_worker_service.py +13 -46
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +70 -54
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -763
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +3 -96
  14. kodit/database.py +38 -1
  15. kodit/domain/entities/__init__.py +276 -0
  16. kodit/domain/entities/git.py +190 -0
  17. kodit/domain/factories/__init__.py +1 -0
  18. kodit/domain/factories/git_repo_factory.py +76 -0
  19. kodit/domain/protocols.py +270 -46
  20. kodit/domain/services/bm25_service.py +5 -1
  21. kodit/domain/services/embedding_service.py +3 -0
  22. kodit/domain/services/git_repository_service.py +429 -0
  23. kodit/domain/services/git_service.py +300 -0
  24. kodit/domain/services/task_status_query_service.py +19 -0
  25. kodit/domain/value_objects.py +113 -147
  26. kodit/infrastructure/api/client/__init__.py +0 -2
  27. kodit/infrastructure/api/v1/__init__.py +0 -4
  28. kodit/infrastructure/api/v1/dependencies.py +105 -44
  29. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  30. kodit/infrastructure/api/v1/routers/commits.py +271 -0
  31. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  32. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  33. kodit/infrastructure/api/v1/routers/search.py +31 -14
  34. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  35. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  36. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  37. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  38. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  39. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  40. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  41. kodit/infrastructure/api/v1/schemas/task_status.py +41 -0
  42. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  43. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  44. kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
  45. kodit/infrastructure/cloning/git/working_copy.py +10 -3
  46. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  47. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  48. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  49. kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
  50. kodit/infrastructure/enrichment/local_enrichment_provider.py +41 -30
  51. kodit/infrastructure/indexing/fusion_service.py +1 -1
  52. kodit/infrastructure/mappers/git_mapper.py +193 -0
  53. kodit/infrastructure/mappers/snippet_mapper.py +106 -0
  54. kodit/infrastructure/mappers/task_mapper.py +5 -44
  55. kodit/infrastructure/mappers/task_status_mapper.py +85 -0
  56. kodit/infrastructure/reporting/db_progress.py +23 -0
  57. kodit/infrastructure/reporting/log_progress.py +13 -38
  58. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  59. kodit/infrastructure/slicing/slicer.py +32 -31
  60. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  61. kodit/infrastructure/sqlalchemy/entities.py +428 -131
  62. kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
  63. kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
  64. kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
  65. kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
  66. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
  67. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  68. kodit/infrastructure/sqlalchemy/task_status_repository.py +91 -0
  69. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  70. kodit/mcp.py +12 -26
  71. kodit/migrations/env.py +1 -1
  72. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  73. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  74. kodit/migrations/versions/b9cd1c3fd762_add_task_status.py +77 -0
  75. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  76. kodit/py.typed +0 -0
  77. kodit/utils/dump_openapi.py +7 -4
  78. kodit/utils/path_utils.py +29 -0
  79. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
  80. kodit-0.5.0.dist-info/RECORD +137 -0
  81. kodit/application/factories/code_indexing_factory.py +0 -193
  82. kodit/application/services/auto_indexing_service.py +0 -103
  83. kodit/application/services/code_indexing_application_service.py +0 -393
  84. kodit/domain/entities.py +0 -323
  85. kodit/domain/services/index_query_service.py +0 -70
  86. kodit/domain/services/index_service.py +0 -267
  87. kodit/infrastructure/api/client/index_client.py +0 -57
  88. kodit/infrastructure/api/v1/routers/indexes.py +0 -119
  89. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  90. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  91. kodit/infrastructure/cloning/__init__.py +0 -1
  92. kodit/infrastructure/cloning/metadata.py +0 -98
  93. kodit/infrastructure/mappers/index_mapper.py +0 -345
  94. kodit/infrastructure/reporting/tdqm_progress.py +0 -73
  95. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  96. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  97. kodit-0.4.2.dist-info/RECORD +0 -119
  98. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
  99. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
  100. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,58 @@
1
+ """Snippet JSON-API schemas."""
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
+ class SnippetContentSchema(BaseModel):
9
+ """Snippet content schema following JSON-API spec."""
10
+
11
+ value: str
12
+ language: str
13
+
14
+
15
+ class GitFileSchema(BaseModel):
16
+ """Git file schema following JSON-API spec."""
17
+
18
+ blob_sha: str
19
+ path: str
20
+ mime_type: str
21
+ size: int
22
+
23
+
24
+ class EnrichmentSchema(BaseModel):
25
+ """Enrichment schema following JSON-API spec."""
26
+
27
+ type: str
28
+ content: str
29
+
30
+
31
+ class SnippetAttributes(BaseModel):
32
+ """Snippet attributes following JSON-API spec."""
33
+
34
+ created_at: datetime | None = None
35
+ updated_at: datetime | None = None
36
+ derives_from: list[GitFileSchema]
37
+ content: SnippetContentSchema
38
+ enrichments: list[EnrichmentSchema]
39
+
40
+
41
+ class SnippetData(BaseModel):
42
+ """Snippet data following JSON-API spec."""
43
+
44
+ type: str = "snippet"
45
+ id: str
46
+ attributes: SnippetAttributes
47
+
48
+
49
+ class SnippetResponse(BaseModel):
50
+ """Single snippet response following JSON-API spec."""
51
+
52
+ data: SnippetData
53
+
54
+
55
+ class SnippetListResponse(BaseModel):
56
+ """Snippet list response following JSON-API spec."""
57
+
58
+ data: list[SnippetData]
@@ -0,0 +1,31 @@
1
+ """Tag JSON-API schemas."""
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ class TagAttributes(BaseModel):
7
+ """Tag attributes following JSON-API spec."""
8
+
9
+ name: str
10
+ target_commit_sha: str
11
+ is_version_tag: bool
12
+
13
+
14
+ class TagData(BaseModel):
15
+ """Tag data following JSON-API spec."""
16
+
17
+ type: str = "tag"
18
+ id: str # The tag name
19
+ attributes: TagAttributes
20
+
21
+
22
+ class TagResponse(BaseModel):
23
+ """Single tag response following JSON-API spec."""
24
+
25
+ data: TagData
26
+
27
+
28
+ class TagListResponse(BaseModel):
29
+ """Tag list response following JSON-API spec."""
30
+
31
+ data: list[TagData]
@@ -0,0 +1,41 @@
1
+ """JSON:API schemas for task status operations."""
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class TaskStatusAttributes(BaseModel):
9
+ """Task status attributes for JSON:API responses."""
10
+
11
+ step: str = Field(..., description="Name of the task/operation")
12
+ state: str = Field(..., description="Current state of the task")
13
+ progress: float = Field(
14
+ default=0.0, ge=0.0, le=100.0, description="Progress percentage (0-100)"
15
+ )
16
+ total: int = Field(default=0, description="Total number of items to process")
17
+ current: int = Field(default=0, description="Current number of items processed")
18
+ created_at: datetime | None = Field(default=None, description="Task start time")
19
+ updated_at: datetime | None = Field(default=None, description="Last update time")
20
+ error: str = Field(default="", description="Error message")
21
+ message: str = Field(default="", description="Message")
22
+
23
+
24
+ class TaskStatusData(BaseModel):
25
+ """Task status data for JSON:API responses."""
26
+
27
+ type: str = "task_status"
28
+ id: str
29
+ attributes: TaskStatusAttributes
30
+
31
+
32
+ class TaskStatusResponse(BaseModel):
33
+ """JSON:API response for single task status."""
34
+
35
+ data: TaskStatusData
36
+
37
+
38
+ class TaskStatusListResponse(BaseModel):
39
+ """JSON:API response for task status list."""
40
+
41
+ data: list[TaskStatusData]
@@ -37,7 +37,7 @@ class LocalBM25Repository(BM25Repository):
37
37
  """
38
38
  self.log = structlog.get_logger(__name__)
39
39
  self.index_path = data_dir / "bm25s_index"
40
- self.snippet_ids: list[int] = []
40
+ self.snippet_ids: list[str] = []
41
41
  self.stemmer = Stemmer.Stemmer("english")
42
42
  self.__retriever: bm25s.BM25 | None = None
43
43
 
@@ -76,11 +76,23 @@ class LocalBM25Repository(BM25Repository):
76
76
  self.log.warning("Corpus is empty, skipping bm25 index")
77
77
  return
78
78
 
79
- vocab = self._tokenize([doc.text for doc in request.documents])
79
+ if not self.snippet_ids and (self.index_path / SNIPPET_IDS_FILE).exists():
80
+ async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE) as f:
81
+ self.snippet_ids = json.loads(await f.read())
82
+
83
+ # Filter out documents that have already been indexed
84
+ new_documents = [
85
+ doc for doc in request.documents if doc.snippet_id not in self.snippet_ids
86
+ ]
87
+ if not new_documents:
88
+ self.log.info("No new documents to index")
89
+ return
90
+
91
+ vocab = self._tokenize([doc.text for doc in new_documents])
80
92
  self._retriever().index(vocab, show_progress=False)
81
93
  self._retriever().save(self.index_path)
82
94
  # Replace snippet_ids instead of appending, since the BM25 index is rebuilt
83
- self.snippet_ids = [doc.snippet_id for doc in request.documents]
95
+ self.snippet_ids = [doc.snippet_id for doc in new_documents]
84
96
  async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
85
97
  await f.write(json.dumps(self.snippet_ids))
86
98
 
@@ -121,7 +133,7 @@ class LocalBM25Repository(BM25Repository):
121
133
  # Filter results by snippet_ids if provided
122
134
  filtered_results = []
123
135
  for result, score in zip(results[0], scores[0], strict=True):
124
- snippet_id = int(result)
136
+ snippet_id = result
125
137
  if score > 0.0 and (
126
138
  request.snippet_ids is None or snippet_id in request.snippet_ids
127
139
  ):
@@ -1,9 +1,9 @@
1
1
  """VectorChord BM25 repository implementation."""
2
2
 
3
- from typing import Any
3
+ from collections.abc import Callable
4
4
 
5
5
  import structlog
6
- from sqlalchemy import Result, TextClause, bindparam, text
6
+ from sqlalchemy import bindparam, text
7
7
  from sqlalchemy.ext.asyncio import AsyncSession
8
8
 
9
9
  from kodit.domain.services.bm25_service import BM25Repository
@@ -13,6 +13,7 @@ from kodit.domain.value_objects import (
13
13
  SearchRequest,
14
14
  SearchResult,
15
15
  )
16
+ from kodit.infrastructure.sqlalchemy.unit_of_work import SqlAlchemyUnitOfWork
16
17
 
17
18
  TABLE_NAME = "vectorchord_bm25_documents"
18
19
  INDEX_NAME = f"{TABLE_NAME}_idx"
@@ -29,13 +30,17 @@ SET search_path TO
29
30
  CREATE_BM25_TABLE = f"""
30
31
  CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
31
32
  id SERIAL PRIMARY KEY,
32
- snippet_id BIGINT NOT NULL,
33
+ snippet_id VARCHAR(255) NOT NULL,
33
34
  passage TEXT NOT NULL,
34
35
  embedding bm25vector,
35
36
  UNIQUE(snippet_id)
36
37
  )
37
38
  """
38
-
39
+ CHECK_EXISTING_IDS = f"""
40
+ SELECT snippet_id
41
+ FROM {TABLE_NAME}
42
+ WHERE snippet_id = ANY(:snippet_ids)
43
+ """ # noqa: S608
39
44
  CREATE_BM25_INDEX = f"""
40
45
  CREATE INDEX IF NOT EXISTS {INDEX_NAME}
41
46
  ON {TABLE_NAME}
@@ -103,14 +108,14 @@ WHERE snippet_id IN :snippet_ids
103
108
  class VectorChordBM25Repository(BM25Repository):
104
109
  """VectorChord BM25 repository implementation."""
105
110
 
106
- def __init__(self, session: AsyncSession) -> None:
111
+ def __init__(self, session_factory: Callable[[], AsyncSession]) -> None:
107
112
  """Initialize the VectorChord BM25 repository.
108
113
 
109
114
  Args:
110
115
  session_factory: Callable returning the SQLAlchemy async session to use for database operations
111
116
 
112
117
  """
113
- self.__session = session
118
+ self.session_factory = session_factory
114
119
  self._initialized = False
115
120
  self.log = structlog.get_logger(__name__)
116
121
 
@@ -127,41 +132,39 @@ class VectorChordBM25Repository(BM25Repository):
127
132
 
128
133
  async def _create_extensions(self) -> None:
129
134
  """Create the necessary extensions."""
130
- await self.__session.execute(text(CREATE_VCHORD_EXTENSION))
131
- await self.__session.execute(text(CREATE_PG_TOKENIZER))
132
- await self.__session.execute(text(CREATE_VCHORD_BM25))
133
- await self.__session.execute(text(SET_SEARCH_PATH))
134
- await self._commit()
135
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
136
+ await session.execute(text(CREATE_VCHORD_EXTENSION))
137
+ await session.execute(text(CREATE_PG_TOKENIZER))
138
+ await session.execute(text(CREATE_VCHORD_BM25))
139
+ await session.execute(text(SET_SEARCH_PATH))
135
140
 
136
141
  async def _create_tokenizer_if_not_exists(self) -> None:
137
142
  """Create the tokenizer if it doesn't exist."""
138
- # Check if tokenizer exists in the catalog
139
- result = await self.__session.execute(text(TOKENIZER_NAME_CHECK_QUERY))
140
- if result.scalar_one_or_none() is None:
141
- # Tokenizer doesn't exist, create it
142
- await self.__session.execute(text(LOAD_TOKENIZER))
143
- await self._commit()
143
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
144
+ # Check if tokenizer exists in the catalog
145
+ result = await session.execute(text(TOKENIZER_NAME_CHECK_QUERY))
146
+ if result.scalar_one_or_none() is None:
147
+ # Tokenizer doesn't exist, create it
148
+ await session.execute(text(LOAD_TOKENIZER))
144
149
 
145
150
  async def _create_tables(self) -> None:
146
151
  """Create the necessary tables in the correct order."""
147
- await self.__session.execute(text(CREATE_BM25_TABLE))
148
- await self.__session.execute(text(CREATE_BM25_INDEX))
149
- await self._commit()
150
-
151
- async def _execute(
152
- self, query: TextClause, param_list: list[Any] | dict[str, Any] | None = None
153
- ) -> Result:
154
- """Execute a query."""
155
- if not self._initialized:
156
- await self._initialize()
157
- return await self.__session.execute(query, param_list)
158
-
159
- async def _commit(self) -> None:
160
- """Commit the session."""
161
- await self.__session.commit()
152
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
153
+ await session.execute(text(CREATE_BM25_TABLE))
154
+ await session.execute(text(CREATE_BM25_INDEX))
155
+
156
+ async def _get_existing_ids(self, snippet_ids: list[str]) -> set[int]:
157
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
158
+ result = await session.execute(
159
+ text(CHECK_EXISTING_IDS), {"snippet_ids": snippet_ids}
160
+ )
161
+ return {row[0] for row in result.fetchall()}
162
162
 
163
163
  async def index_documents(self, request: IndexRequest) -> None:
164
164
  """Index documents for BM25 search."""
165
+ if not self._initialized:
166
+ await self._initialize()
167
+
165
168
  # Filter out any documents that don't have a snippet_id or text
166
169
  valid_documents = [
167
170
  doc
@@ -173,21 +176,35 @@ class VectorChordBM25Repository(BM25Repository):
173
176
  self.log.warning("Corpus is empty, skipping bm25 index")
174
177
  return
175
178
 
176
- # Execute inserts
177
- await self._execute(
178
- text(INSERT_QUERY),
179
- [
180
- {"snippet_id": doc.snippet_id, "passage": doc.text}
181
- for doc in valid_documents
182
- ],
179
+ # Filter out documents that have already been indexed
180
+ existing_ids = await self._get_existing_ids(
181
+ [doc.snippet_id for doc in valid_documents]
183
182
  )
183
+ valid_documents = [
184
+ doc for doc in valid_documents if doc.snippet_id not in existing_ids
185
+ ]
184
186
 
185
- # Tokenize the new documents with schema qualification
186
- await self._execute(text(UPDATE_QUERY))
187
- await self._commit()
187
+ if not valid_documents:
188
+ self.log.info("No new documents to index")
189
+ return
190
+
191
+ # Execute inserts
192
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
193
+ await session.execute(
194
+ text(INSERT_QUERY),
195
+ [
196
+ {"snippet_id": doc.snippet_id, "passage": doc.text}
197
+ for doc in valid_documents
198
+ ],
199
+ )
200
+
201
+ # Tokenize the new documents with schema qualification
202
+ await session.execute(text(UPDATE_QUERY))
188
203
 
189
204
  async def search(self, request: SearchRequest) -> list[SearchResult]:
190
205
  """Search documents using BM25."""
206
+ if not self._initialized:
207
+ await self._initialize()
191
208
  if not request.query or request.query.strip() == "":
192
209
  return []
193
210
 
@@ -203,22 +220,21 @@ class VectorChordBM25Repository(BM25Repository):
203
220
  limit=request.top_k,
204
221
  )
205
222
 
206
- try:
207
- result = await self._execute(sql)
223
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
224
+ result = await session.execute(sql)
208
225
  rows = result.mappings().all()
209
226
 
210
227
  return [
211
228
  SearchResult(snippet_id=row["snippet_id"], score=row["bm25_score"])
212
229
  for row in rows
213
230
  ]
214
- except Exception as e:
215
- msg = f"Error during BM25 search: {e}"
216
- raise RuntimeError(msg) from e
217
231
 
218
232
  async def delete_documents(self, request: DeleteRequest) -> None:
219
233
  """Delete documents from the index."""
220
- await self._execute(
221
- text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
222
- {"snippet_ids": request.snippet_ids},
223
- )
224
- await self._commit()
234
+ if not self._initialized:
235
+ await self._initialize()
236
+ async with SqlAlchemyUnitOfWork(self.session_factory) as session:
237
+ await session.execute(
238
+ text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
239
+ {"snippet_ids": request.snippet_ids},
240
+ )