kodit 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. More details are available on the page this diff was taken from.

Files changed (118):
  1. kodit/_version.py +2 -2
  2. kodit/application/__init__.py +1 -0
  3. kodit/application/commands/__init__.py +1 -0
  4. kodit/application/commands/snippet_commands.py +22 -0
  5. kodit/application/services/__init__.py +1 -0
  6. kodit/application/services/indexing_application_service.py +387 -0
  7. kodit/application/services/snippet_application_service.py +149 -0
  8. kodit/cli.py +118 -82
  9. kodit/database.py +0 -22
  10. kodit/domain/__init__.py +1 -0
  11. kodit/{source/source_models.py → domain/entities.py} +88 -19
  12. kodit/domain/enums.py +9 -0
  13. kodit/domain/errors.py +5 -0
  14. kodit/domain/interfaces.py +27 -0
  15. kodit/domain/repositories.py +95 -0
  16. kodit/domain/services/__init__.py +1 -0
  17. kodit/domain/services/bm25_service.py +124 -0
  18. kodit/domain/services/embedding_service.py +155 -0
  19. kodit/domain/services/enrichment_service.py +48 -0
  20. kodit/domain/services/ignore_service.py +45 -0
  21. kodit/domain/services/indexing_service.py +203 -0
  22. kodit/domain/services/snippet_extraction_service.py +89 -0
  23. kodit/domain/services/source_service.py +85 -0
  24. kodit/domain/value_objects.py +215 -0
  25. kodit/infrastructure/__init__.py +1 -0
  26. kodit/infrastructure/bm25/__init__.py +1 -0
  27. kodit/infrastructure/bm25/bm25_factory.py +28 -0
  28. kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
  29. kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
  30. kodit/infrastructure/cloning/__init__.py +1 -0
  31. kodit/infrastructure/cloning/folder/__init__.py +1 -0
  32. kodit/infrastructure/cloning/folder/factory.py +128 -0
  33. kodit/infrastructure/cloning/folder/working_copy.py +38 -0
  34. kodit/infrastructure/cloning/git/__init__.py +1 -0
  35. kodit/infrastructure/cloning/git/factory.py +147 -0
  36. kodit/infrastructure/cloning/git/working_copy.py +32 -0
  37. kodit/infrastructure/cloning/metadata.py +127 -0
  38. kodit/infrastructure/embedding/__init__.py +1 -0
  39. kodit/infrastructure/embedding/embedding_factory.py +87 -0
  40. kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
  41. kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
  42. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
  43. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
  44. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
  45. kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
  46. kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
  47. kodit/infrastructure/enrichment/__init__.py +1 -0
  48. kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
  49. kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
  50. kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
  51. kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
  52. kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
  53. kodit/infrastructure/git/__init__.py +1 -0
  54. kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
  55. kodit/infrastructure/ignore/__init__.py +1 -0
  56. kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
  57. kodit/infrastructure/indexing/__init__.py +1 -0
  58. kodit/infrastructure/indexing/fusion_service.py +55 -0
  59. kodit/infrastructure/indexing/index_repository.py +291 -0
  60. kodit/infrastructure/indexing/indexing_factory.py +113 -0
  61. kodit/infrastructure/snippet_extraction/__init__.py +1 -0
  62. kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
  63. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
  64. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
  65. kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
  66. kodit/infrastructure/sqlalchemy/__init__.py +1 -0
  67. kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -26
  68. kodit/infrastructure/sqlalchemy/file_repository.py +78 -0
  69. kodit/infrastructure/sqlalchemy/repository.py +133 -0
  70. kodit/infrastructure/sqlalchemy/snippet_repository.py +79 -0
  71. kodit/infrastructure/ui/__init__.py +1 -0
  72. kodit/infrastructure/ui/progress.py +127 -0
  73. kodit/{util → infrastructure/ui}/spinner.py +19 -4
  74. kodit/mcp.py +51 -28
  75. kodit/migrations/env.py +1 -4
  76. kodit/reporting.py +78 -0
  77. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/METADATA +1 -1
  78. kodit-0.2.6.dist-info/RECORD +100 -0
  79. kodit/bm25/__init__.py +0 -1
  80. kodit/bm25/keyword_search_factory.py +0 -17
  81. kodit/bm25/keyword_search_service.py +0 -34
  82. kodit/embedding/__init__.py +0 -1
  83. kodit/embedding/embedding_factory.py +0 -69
  84. kodit/embedding/embedding_models.py +0 -28
  85. kodit/embedding/embedding_provider/__init__.py +0 -1
  86. kodit/embedding/embedding_provider/embedding_provider.py +0 -92
  87. kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
  88. kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
  89. kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
  90. kodit/embedding/local_vector_search_service.py +0 -87
  91. kodit/embedding/vector_search_service.py +0 -55
  92. kodit/enrichment/__init__.py +0 -1
  93. kodit/enrichment/enrichment_provider/__init__.py +0 -1
  94. kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
  95. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
  96. kodit/enrichment/enrichment_service.py +0 -45
  97. kodit/indexing/__init__.py +0 -1
  98. kodit/indexing/fusion.py +0 -67
  99. kodit/indexing/indexing_models.py +0 -43
  100. kodit/indexing/indexing_repository.py +0 -216
  101. kodit/indexing/indexing_service.py +0 -344
  102. kodit/snippets/__init__.py +0 -1
  103. kodit/snippets/languages/__init__.py +0 -53
  104. kodit/snippets/snippets.py +0 -50
  105. kodit/source/__init__.py +0 -1
  106. kodit/source/source_factories.py +0 -356
  107. kodit/source/source_repository.py +0 -169
  108. kodit/source/source_service.py +0 -150
  109. kodit/util/__init__.py +0 -1
  110. kodit-0.2.4.dist-info/RECORD +0 -71
  111. /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
  112. /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
  113. /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
  114. /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
  115. /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
  116. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/WHEEL +0 -0
  117. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/entry_points.txt +0 -0
  118. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,85 @@
1
+ """Source service rewritten to work directly with AsyncSession."""
2
+
3
+ from collections.abc import Callable
4
+ from pathlib import Path
5
+
6
+ import structlog
7
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
8
+
9
+ from kodit.domain.entities import Source
10
+ from kodit.domain.interfaces import ProgressCallback
11
+ from kodit.domain.repositories import SourceRepository
12
+ from kodit.infrastructure.cloning.folder.factory import FolderSourceFactory
13
+ from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider
14
+ from kodit.infrastructure.cloning.git.factory import (
15
+ GitSourceFactory,
16
+ GitWorkingCopyProvider,
17
+ )
18
+ from kodit.infrastructure.git.git_utils import is_valid_clone_target
19
+ from kodit.infrastructure.sqlalchemy.repository import SqlAlchemySourceRepository
20
+
21
+
22
+ class SourceService:
23
+ """Source service."""
24
+
25
+ def __init__(
26
+ self,
27
+ clone_dir: Path,
28
+ session_factory: async_sessionmaker[AsyncSession] | Callable[[], AsyncSession],
29
+ ) -> None:
30
+ """Initialize the source service."""
31
+ self.clone_dir = clone_dir
32
+ self._session_factory = session_factory
33
+ self.log = structlog.get_logger(__name__)
34
+
35
+ async def get(self, source_id: int) -> Source:
36
+ """Get a source."""
37
+ async with self._session_factory() as session:
38
+ repo = SqlAlchemySourceRepository(session)
39
+
40
+ source = await repo.get(source_id)
41
+ if source is None:
42
+ raise ValueError(f"Source not found: {source_id}")
43
+
44
+ return source
45
+
46
+ async def create(
47
+ self, uri_or_path_like: str, progress_callback: ProgressCallback | None = None
48
+ ) -> Source:
49
+ """Create a source."""
50
+ async with self._session_factory() as session:
51
+ repo = SqlAlchemySourceRepository(session)
52
+ git_factory, folder_factory = self._build_factories(repo, session)
53
+
54
+ if is_valid_clone_target(uri_or_path_like):
55
+ source = await git_factory.create(uri_or_path_like, progress_callback)
56
+ elif Path(uri_or_path_like).is_dir():
57
+ source = await folder_factory.create(
58
+ uri_or_path_like, progress_callback
59
+ )
60
+ else:
61
+ raise ValueError(f"Unsupported source: {uri_or_path_like}")
62
+
63
+ # Factories handle their own commits now
64
+ return source
65
+
66
+ def _build_factories(
67
+ self, repository: SourceRepository, session: AsyncSession
68
+ ) -> tuple[GitSourceFactory, FolderSourceFactory]:
69
+ # Git-specific collaborators
70
+ git_wc = GitWorkingCopyProvider(self.clone_dir)
71
+ git_factory = GitSourceFactory(
72
+ repository=repository,
73
+ working_copy=git_wc,
74
+ session=session,
75
+ )
76
+
77
+ # Folder-specific collaborators
78
+ fold_wc = FolderWorkingCopyProvider(self.clone_dir)
79
+ folder_factory = FolderSourceFactory(
80
+ repository=repository,
81
+ working_copy=fold_wc,
82
+ session=session,
83
+ )
84
+
85
+ return git_factory, folder_factory
@@ -0,0 +1,215 @@
1
+ """Domain value objects and DTOs."""
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from pathlib import Path
7
+
8
+ from kodit.domain.enums import SnippetExtractionStrategy
9
+
10
+
11
+ class SearchType(Enum):
12
+ """Type of search to perform."""
13
+
14
+ BM25 = "bm25"
15
+ VECTOR = "vector"
16
+ HYBRID = "hybrid"
17
+
18
+
19
+ @dataclass
20
+ class SnippetExtractionRequest:
21
+ """Domain model for snippet extraction request."""
22
+
23
+ file_path: Path
24
+ strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
25
+
26
+
27
+ @dataclass
28
+ class SnippetExtractionResult:
29
+ """Domain model for snippet extraction result."""
30
+
31
+ snippets: list[str]
32
+ language: str
33
+
34
+
35
+ @dataclass
36
+ class Document:
37
+ """Generic document model for indexing."""
38
+
39
+ snippet_id: int
40
+ text: str
41
+
42
+
43
+ @dataclass
44
+ class SearchResult:
45
+ """Generic search result model."""
46
+
47
+ snippet_id: int
48
+ score: float
49
+
50
+
51
+ @dataclass
52
+ class IndexRequest:
53
+ """Generic indexing request."""
54
+
55
+ documents: list[Document]
56
+
57
+
58
+ @dataclass
59
+ class SimpleSearchRequest:
60
+ """Generic search request (single query string)."""
61
+
62
+ query: str
63
+ top_k: int = 10
64
+ search_type: SearchType = SearchType.BM25
65
+
66
+
67
+ @dataclass
68
+ class DeleteRequest:
69
+ """Generic deletion request."""
70
+
71
+ snippet_ids: list[int]
72
+
73
+
74
+ @dataclass
75
+ class IndexResult:
76
+ """Generic indexing result."""
77
+
78
+ snippet_id: int
79
+
80
+
81
+ # Legacy aliases for backward compatibility
82
+ BM25Document = Document
83
+ BM25SearchResult = SearchResult
84
+ BM25IndexRequest = IndexRequest
85
+ BM25SearchRequest = SimpleSearchRequest
86
+ BM25DeleteRequest = DeleteRequest
87
+
88
+ VectorSearchRequest = Document
89
+ VectorSearchResult = SearchResult
90
+ VectorIndexRequest = IndexRequest
91
+ VectorSearchQueryRequest = SimpleSearchRequest
92
+
93
+
94
+ @dataclass
95
+ class MultiSearchRequest:
96
+ """Domain model for multi-modal search request."""
97
+
98
+ top_k: int = 10
99
+ text_query: str | None = None
100
+ code_query: str | None = None
101
+ keywords: list[str] | None = None
102
+
103
+
104
+ @dataclass
105
+ class MultiSearchResult:
106
+ """Domain model for multi-modal search result."""
107
+
108
+ id: int
109
+ uri: str
110
+ content: str
111
+ original_scores: list[float]
112
+
113
+
114
+ @dataclass
115
+ class FusionRequest:
116
+ """Domain model for fusion request."""
117
+
118
+ id: int
119
+ score: float
120
+
121
+
122
+ @dataclass
123
+ class FusionResult:
124
+ """Domain model for fusion result."""
125
+
126
+ id: int
127
+ score: float
128
+ original_scores: list[float]
129
+
130
+
131
+ @dataclass
132
+ class IndexCreateRequest:
133
+ """Domain model for index creation request."""
134
+
135
+ source_id: int
136
+
137
+
138
+ @dataclass
139
+ class IndexRunRequest:
140
+ """Domain model for index run request."""
141
+
142
+ index_id: int
143
+
144
+
145
+ @dataclass
146
+ class ProgressEvent:
147
+ """Domain model for progress events."""
148
+
149
+ operation: str
150
+ current: int
151
+ total: int
152
+ message: str | None = None
153
+
154
+ @property
155
+ def percentage(self) -> float:
156
+ """Calculate the percentage of completion."""
157
+ return (self.current / self.total * 100) if self.total > 0 else 0.0
158
+
159
+
160
+ @dataclass
161
+ class EmbeddingRequest:
162
+ """Domain model for embedding request."""
163
+
164
+ snippet_id: int
165
+ text: str
166
+
167
+
168
+ @dataclass
169
+ class EmbeddingResponse:
170
+ """Domain model for embedding response."""
171
+
172
+ snippet_id: int
173
+ embedding: list[float]
174
+
175
+
176
+ @dataclass
177
+ class EnrichmentRequest:
178
+ """Domain model for enrichment request."""
179
+
180
+ snippet_id: int
181
+ text: str
182
+
183
+
184
+ @dataclass
185
+ class EnrichmentResponse:
186
+ """Domain model for enrichment response."""
187
+
188
+ snippet_id: int
189
+ text: str
190
+
191
+
192
+ @dataclass
193
+ class EnrichmentIndexRequest:
194
+ """Domain model for enrichment index request."""
195
+
196
+ requests: list[EnrichmentRequest]
197
+
198
+
199
+ @dataclass
200
+ class EnrichmentSearchRequest:
201
+ """Domain model for enrichment search request."""
202
+
203
+ query: str
204
+ top_k: int = 10
205
+
206
+
207
+ @dataclass
208
+ class IndexView:
209
+ """Domain model for index information."""
210
+
211
+ id: int
212
+ created_at: datetime
213
+ num_snippets: int
214
+ updated_at: datetime | None = None
215
+ source: str | None = None
@@ -0,0 +1 @@
1
+ """Infrastructure layer."""
@@ -0,0 +1 @@
1
+ """BM25 infrastructure module."""
@@ -0,0 +1,28 @@
1
+ """Factory for creating BM25 repositories."""
2
+
3
+ from sqlalchemy.ext.asyncio import AsyncSession
4
+
5
+ from kodit.config import AppContext
6
+ from kodit.domain.services.bm25_service import BM25Repository
7
+ from kodit.infrastructure.bm25.local_bm25_repository import LocalBM25Repository
8
+ from kodit.infrastructure.bm25.vectorchord_bm25_repository import (
9
+ VectorChordBM25Repository,
10
+ )
11
+
12
+
13
+ def bm25_repository_factory(
14
+ app_context: AppContext, session: AsyncSession
15
+ ) -> BM25Repository:
16
+ """Create a BM25 repository based on configuration.
17
+
18
+ Args:
19
+ app_context: Application configuration context
20
+ session: SQLAlchemy async session
21
+
22
+ Returns:
23
+ BM25Repository instance
24
+
25
+ """
26
+ if app_context.default_search.provider == "vectorchord":
27
+ return VectorChordBM25Repository(session=session)
28
+ return LocalBM25Repository(data_dir=app_context.get_data_dir())
@@ -1,4 +1,4 @@
1
- """Locally hosted BM25 service primarily for use with SQLite."""
1
+ """Local BM25 repository implementation."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -10,25 +10,31 @@ import aiofiles
10
10
  import Stemmer
11
11
  import structlog
12
12
 
13
- from kodit.bm25.keyword_search_service import (
14
- BM25Document,
15
- BM25Result,
16
- KeywordSearchProvider,
13
+ from kodit.domain.services.bm25_service import BM25Repository
14
+ from kodit.domain.value_objects import (
15
+ BM25DeleteRequest,
16
+ BM25IndexRequest,
17
+ BM25SearchRequest,
18
+ BM25SearchResult,
17
19
  )
18
20
 
19
21
  if TYPE_CHECKING:
20
22
  import bm25s
21
23
  from bm25s.tokenization import Tokenized
22
24
 
23
-
24
25
  SNIPPET_IDS_FILE = "snippet_ids.jsonl"
25
26
 
26
27
 
27
- class BM25Service(KeywordSearchProvider):
28
- """LocalBM25 service."""
28
+ class LocalBM25Repository(BM25Repository):
29
+ """Local BM25 repository implementation."""
29
30
 
30
31
  def __init__(self, data_dir: Path) -> None:
31
- """Initialize the BM25 service."""
32
+ """Initialize the local BM25 repository.
33
+
34
+ Args:
35
+ data_dir: Directory to store BM25 index files
36
+
37
+ """
32
38
  self.log = structlog.get_logger(__name__)
33
39
  self.index_path = data_dir / "bm25s_index"
34
40
  self.snippet_ids: list[int] = []
@@ -51,6 +57,7 @@ class BM25Service(KeywordSearchProvider):
51
57
  return self.__retriever
52
58
 
53
59
  def _tokenize(self, corpus: list[str]) -> list[list[str]] | Tokenized:
60
+ """Tokenize text corpus."""
54
61
  from bm25s import tokenize
55
62
 
56
63
  return tokenize(
@@ -61,23 +68,25 @@ class BM25Service(KeywordSearchProvider):
61
68
  show_progress=True,
62
69
  )
63
70
 
64
- async def index(self, corpus: list[BM25Document]) -> None:
65
- """Index a new corpus."""
71
+ async def index_documents(self, request: BM25IndexRequest) -> None:
72
+ """Index documents for BM25 search."""
66
73
  self.log.debug("Indexing corpus")
67
- if not corpus or len(corpus) == 0:
74
+ if not request.documents:
68
75
  self.log.warning("Corpus is empty, skipping bm25 index")
69
76
  return
70
77
 
71
- vocab = self._tokenize([doc.text for doc in corpus])
78
+ vocab = self._tokenize([doc.text for doc in request.documents])
72
79
  self._retriever().index(vocab, show_progress=False)
73
80
  self._retriever().save(self.index_path)
74
- self.snippet_ids = self.snippet_ids + [doc.snippet_id for doc in corpus]
81
+ self.snippet_ids = self.snippet_ids + [
82
+ doc.snippet_id for doc in request.documents
83
+ ]
75
84
  async with aiofiles.open(self.index_path / SNIPPET_IDS_FILE, "w") as f:
76
85
  await f.write(json.dumps(self.snippet_ids))
77
86
 
78
- async def retrieve(self, query: str, top_k: int = 2) -> list[BM25Result]:
79
- """Retrieve from the index."""
80
- if top_k == 0:
87
+ async def search(self, request: BM25SearchRequest) -> list[BM25SearchResult]:
88
+ """Search documents using BM25."""
89
+ if request.top_k == 0:
81
90
  self.log.warning("Top k is 0, returning empty list")
82
91
  return []
83
92
 
@@ -91,14 +100,14 @@ class BM25Service(KeywordSearchProvider):
91
100
  return []
92
101
 
93
102
  # Adjust top_k to not exceed corpus size
94
- top_k = min(top_k, num_docs)
103
+ top_k = min(request.top_k, num_docs)
95
104
  self.log.debug(
96
105
  "Retrieving from index",
97
- query=query,
106
+ query=request.query,
98
107
  top_k=top_k,
99
108
  )
100
109
 
101
- query_tokens = self._tokenize([query])
110
+ query_tokens = self._tokenize([request.query])
102
111
 
103
112
  self.log.debug("Query tokens", query_tokens=query_tokens)
104
113
 
@@ -109,11 +118,13 @@ class BM25Service(KeywordSearchProvider):
109
118
  )
110
119
  self.log.debug("Raw results", results=results, scores=scores)
111
120
  return [
112
- BM25Result(snippet_id=int(result), score=float(score))
121
+ BM25SearchResult(snippet_id=int(result), score=float(score))
113
122
  for result, score in zip(results[0], scores[0], strict=False)
114
123
  if score > 0.0
115
124
  ]
116
125
 
117
- async def delete(self, snippet_ids: list[int]) -> None: # noqa: ARG002
126
+ async def delete_documents(self, request: BM25DeleteRequest) -> None:
118
127
  """Delete documents from the index."""
128
+ # request parameter is unused as deletion is not supported
129
+ # ruff: noqa: ARG002
119
130
  self.log.warning("Deletion not supported for local BM25 index")
@@ -1,4 +1,4 @@
1
- """VectorChord repository for document operations."""
1
+ """VectorChord BM25 repository implementation."""
2
2
 
3
3
  from typing import Any
4
4
 
@@ -6,10 +6,12 @@ import structlog
6
6
  from sqlalchemy import Result, TextClause, bindparam, text
7
7
  from sqlalchemy.ext.asyncio import AsyncSession
8
8
 
9
- from kodit.bm25.keyword_search_service import (
10
- BM25Document,
11
- BM25Result,
12
- KeywordSearchProvider,
9
+ from kodit.domain.services.bm25_service import BM25Repository
10
+ from kodit.domain.value_objects import (
11
+ BM25DeleteRequest,
12
+ BM25IndexRequest,
13
+ BM25SearchRequest,
14
+ BM25SearchResult,
13
15
  )
14
16
 
15
17
  TABLE_NAME = "vectorchord_bm25_documents"
@@ -84,14 +86,16 @@ WHERE snippet_id IN :snippet_ids
84
86
  """ # noqa: S608
85
87
 
86
88
 
87
- class VectorChordBM25(KeywordSearchProvider):
88
- """BM25 using VectorChord."""
89
+ class VectorChordBM25Repository(BM25Repository):
90
+ """VectorChord BM25 repository implementation."""
89
91
 
90
- def __init__(
91
- self,
92
- session: AsyncSession,
93
- ) -> None:
94
- """Initialize the VectorChord BM25."""
92
+ def __init__(self, session: AsyncSession) -> None:
93
+ """Initialize the VectorChord BM25 repository.
94
+
95
+ Args:
96
+ session: The SQLAlchemy async session to use for database operations
97
+
98
+ """
95
99
  self.__session = session
96
100
  self._initialized = False
97
101
  self.log = structlog.get_logger(__name__)
@@ -142,55 +146,56 @@ class VectorChordBM25(KeywordSearchProvider):
142
146
  """Commit the session."""
143
147
  await self.__session.commit()
144
148
 
145
- async def index(self, corpus: list[BM25Document]) -> None:
146
- """Index a new corpus."""
149
+ async def index_documents(self, request: BM25IndexRequest) -> None:
150
+ """Index documents for BM25 search."""
147
151
  # Filter out any documents that don't have a snippet_id or text
148
- corpus = [
152
+ valid_documents = [
149
153
  doc
150
- for doc in corpus
154
+ for doc in request.documents
151
155
  if doc.snippet_id is not None and doc.text is not None and doc.text != ""
152
156
  ]
153
157
 
154
- if not corpus or len(corpus) == 0:
158
+ if not valid_documents:
155
159
  self.log.warning("Corpus is empty, skipping bm25 index")
156
160
  return
157
161
 
158
162
  # Execute inserts
159
163
  await self._execute(
160
164
  text(INSERT_QUERY),
161
- [{"snippet_id": doc.snippet_id, "passage": doc.text} for doc in corpus],
165
+ [
166
+ {"snippet_id": doc.snippet_id, "passage": doc.text}
167
+ for doc in valid_documents
168
+ ],
162
169
  )
163
170
 
164
171
  # Tokenize the new documents with schema qualification
165
172
  await self._execute(text(UPDATE_QUERY))
166
173
  await self._commit()
167
174
 
168
- async def delete(self, snippet_ids: list[int]) -> None:
169
- """Delete documents from the index."""
170
- await self._execute(
171
- text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
172
- {"snippet_ids": snippet_ids},
173
- )
174
- await self._commit()
175
-
176
- async def retrieve(
177
- self,
178
- query: str,
179
- top_k: int = 10,
180
- ) -> list[BM25Result]:
181
- """Search documents using BM25 similarity."""
182
- if not query or query == "":
175
+ async def search(self, request: BM25SearchRequest) -> list[BM25SearchResult]:
176
+ """Search documents using BM25."""
177
+ if not request.query or request.query == "":
183
178
  return []
184
179
 
185
- sql = text(SEARCH_QUERY).bindparams(query_text=query, limit=top_k)
180
+ sql = text(SEARCH_QUERY).bindparams(
181
+ query_text=request.query, limit=request.top_k
182
+ )
186
183
  try:
187
184
  result = await self._execute(sql)
188
185
  rows = result.mappings().all()
189
186
 
190
187
  return [
191
- BM25Result(snippet_id=row["snippet_id"], score=row["bm25_score"])
188
+ BM25SearchResult(snippet_id=row["snippet_id"], score=row["bm25_score"])
192
189
  for row in rows
193
190
  ]
194
191
  except Exception as e:
195
192
  msg = f"Error during BM25 search: {e}"
196
193
  raise RuntimeError(msg) from e
194
+
195
+ async def delete_documents(self, request: BM25DeleteRequest) -> None:
196
+ """Delete documents from the index."""
197
+ await self._execute(
198
+ text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
199
+ {"snippet_ids": request.snippet_ids},
200
+ )
201
+ await self._commit()
@@ -0,0 +1 @@
1
+ """Cloning infrastructure."""
@@ -0,0 +1 @@
1
+ """Folder cloning infrastructure."""