kodit 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +387 -0
- kodit/application/services/snippet_application_service.py +149 -0
- kodit/cli.py +118 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/errors.py +5 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +85 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +128 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +147 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +291 -0
- kodit/infrastructure/indexing/indexing_factory.py +113 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -26
- kodit/infrastructure/sqlalchemy/file_repository.py +78 -0
- kodit/infrastructure/sqlalchemy/repository.py +133 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +79 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +51 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/METADATA +1 -1
- kodit-0.2.6.dist-info/RECORD +100 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -69
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -92
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
- kodit/embedding/local_vector_search_service.py +0 -87
- kodit/embedding/vector_search_service.py +0 -55
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
- kodit/enrichment/enrichment_service.py +0 -45
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -344
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.4.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/WHEEL +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Source service rewritten to work directly with AsyncSession."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
|
8
|
+
|
|
9
|
+
from kodit.domain.entities import Source
|
|
10
|
+
from kodit.domain.interfaces import ProgressCallback
|
|
11
|
+
from kodit.domain.repositories import SourceRepository
|
|
12
|
+
from kodit.infrastructure.cloning.folder.factory import FolderSourceFactory
|
|
13
|
+
from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider
|
|
14
|
+
from kodit.infrastructure.cloning.git.factory import (
|
|
15
|
+
GitSourceFactory,
|
|
16
|
+
GitWorkingCopyProvider,
|
|
17
|
+
)
|
|
18
|
+
from kodit.infrastructure.git.git_utils import is_valid_clone_target
|
|
19
|
+
from kodit.infrastructure.sqlalchemy.repository import SqlAlchemySourceRepository
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SourceService:
    """Look up and create ``Source`` entities.

    Every public method opens its own session from the injected factory and
    wraps it in a ``SqlAlchemySourceRepository``.
    """

    def __init__(
        self,
        clone_dir: Path,
        session_factory: async_sessionmaker[AsyncSession] | Callable[[], AsyncSession],
    ) -> None:
        """Remember the collaborators and create a logger.

        Args:
            clone_dir: Directory handed to the working-copy providers.
            session_factory: Zero-argument callable; its result is used as an
                async context manager that yields the session.

        """
        self.log = structlog.get_logger(__name__)
        self._session_factory = session_factory
        self.clone_dir = clone_dir

    async def get(self, source_id: int) -> Source:
        """Return the source stored under ``source_id``.

        Raises:
            ValueError: If the repository returns ``None`` for that id.

        """
        async with self._session_factory() as session:
            found = await SqlAlchemySourceRepository(session).get(source_id)
            if found is not None:
                return found
            raise ValueError(f"Source not found: {source_id}")

    async def create(
        self, uri_or_path_like: str, progress_callback: ProgressCallback | None = None
    ) -> Source:
        """Create a source from a clone target or a local directory.

        The clone-target check runs first; only when it fails is the argument
        tried as a directory path.

        Args:
            uri_or_path_like: Clone target or directory path.
            progress_callback: Forwarded unchanged to the chosen factory.

        Raises:
            ValueError: If the argument is neither a valid clone target nor an
                existing directory.

        """
        async with self._session_factory() as session:
            git_factory, folder_factory = self._build_factories(
                SqlAlchemySourceRepository(session), session
            )

            # No commit here: the factories handle their own commits.
            if is_valid_clone_target(uri_or_path_like):
                return await git_factory.create(uri_or_path_like, progress_callback)
            if Path(uri_or_path_like).is_dir():
                return await folder_factory.create(
                    uri_or_path_like, progress_callback
                )
            raise ValueError(f"Unsupported source: {uri_or_path_like}")

    def _build_factories(
        self, repository: SourceRepository, session: AsyncSession
    ) -> tuple[GitSourceFactory, FolderSourceFactory]:
        """Build the git and folder factories around one repository/session.

        Both factories get their own working-copy provider rooted at
        ``self.clone_dir``.
        """
        git_factory = GitSourceFactory(
            repository=repository,
            working_copy=GitWorkingCopyProvider(self.clone_dir),
            session=session,
        )
        folder_factory = FolderSourceFactory(
            repository=repository,
            working_copy=FolderWorkingCopyProvider(self.clone_dir),
            session=session,
        )
        return git_factory, folder_factory
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Domain value objects and DTOs."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from kodit.domain.enums import SnippetExtractionStrategy
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SearchType(Enum):
    """Enumerates the supported kinds of search.

    Members carry lowercase string values, so ``SearchType("bm25")`` resolves
    to ``SearchType.BM25``.
    """

    BM25 = "bm25"
    VECTOR = "vector"
    HYBRID = "hybrid"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class SnippetExtractionRequest:
    """Request to extract snippets from one file.

    Attributes:
        file_path: Path of the file to process.
        strategy: Extraction strategy; defaults to ``METHOD_BASED``.

    """

    file_path: Path
    strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SnippetExtractionResult:
    """Outcome of a snippet extraction.

    Attributes:
        snippets: Extracted snippets, as strings.
        language: Language reported for the extraction, as a string.

    """

    snippets: list[str]
    language: str
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class Document:
    """Text to be indexed, paired with a snippet identifier.

    Attributes:
        snippet_id: Integer identifier of the snippet.
        text: The text itself.

    """

    snippet_id: int
    text: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class SearchResult:
    """A single search hit.

    Attributes:
        snippet_id: Integer identifier of the matching snippet.
        score: Score attached to the hit.

    """

    snippet_id: int
    score: float
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class IndexRequest:
    """Batch of documents submitted for indexing.

    Attributes:
        documents: The ``Document`` items in the batch.

    """

    documents: list[Document]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class SimpleSearchRequest:
    """Search request built around a single query string.

    Attributes:
        query: The query string.
        top_k: Result limit; defaults to 10.
        search_type: Kind of search; defaults to ``SearchType.BM25``.

    """

    query: str
    top_k: int = 10
    search_type: SearchType = SearchType.BM25
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class DeleteRequest:
    """Request to delete entries by snippet identifier.

    Attributes:
        snippet_ids: Integer identifiers of the snippets to delete.

    """

    snippet_ids: list[int]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class IndexResult:
    """Result record of an indexing operation.

    Attributes:
        snippet_id: Integer identifier of the snippet the result refers to.

    """

    snippet_id: int
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Legacy aliases for backward compatibility.
# The BM25-prefixed names resolve to the generic request/result models.
BM25Document = Document
BM25SearchResult = SearchResult
BM25IndexRequest = IndexRequest
BM25SearchRequest = SimpleSearchRequest
BM25DeleteRequest = DeleteRequest

# The vector-search names resolve to the same generic models.
# NOTE(review): VectorSearchRequest points at Document rather than
# SimpleSearchRequest - confirm this is intentional.
VectorSearchRequest = Document
VectorSearchResult = SearchResult
VectorIndexRequest = IndexRequest
VectorSearchQueryRequest = SimpleSearchRequest
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
|
|
95
|
+
class MultiSearchRequest:
|
|
96
|
+
"""Domain model for multi-modal search request."""
|
|
97
|
+
|
|
98
|
+
top_k: int = 10
|
|
99
|
+
text_query: str | None = None
|
|
100
|
+
code_query: str | None = None
|
|
101
|
+
keywords: list[str] | None = None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class MultiSearchResult:
    """One result of a multi-modal search.

    Attributes:
        id: Integer identifier of the result.
        uri: URI string associated with the result.
        content: Content string of the result.
        original_scores: List of float scores kept alongside the result.

    """

    id: int
    uri: str
    content: str
    original_scores: list[float]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class FusionRequest:
    """Input item for result fusion.

    Attributes:
        id: Integer identifier of the item.
        score: Score of the item.

    """

    id: int
    score: float
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass
class FusionResult:
    """Output item of result fusion.

    Attributes:
        id: Integer identifier of the item.
        score: Score of the item.
        original_scores: List of float scores kept alongside ``score``.

    """

    id: int
    score: float
    original_scores: list[float]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@dataclass
class IndexCreateRequest:
    """Request to create an index.

    Attributes:
        source_id: Integer identifier of the source.

    """

    source_id: int
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class IndexRunRequest:
    """Request to run an index.

    Attributes:
        index_id: Integer identifier of the index.

    """

    index_id: int
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@dataclass
|
|
146
|
+
class ProgressEvent:
|
|
147
|
+
"""Domain model for progress events."""
|
|
148
|
+
|
|
149
|
+
operation: str
|
|
150
|
+
current: int
|
|
151
|
+
total: int
|
|
152
|
+
message: str | None = None
|
|
153
|
+
|
|
154
|
+
@property
|
|
155
|
+
def percentage(self) -> float:
|
|
156
|
+
"""Calculate the percentage of completion."""
|
|
157
|
+
return (self.current / self.total * 100) if self.total > 0 else 0.0
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass
class EmbeddingRequest:
    """Text submitted for embedding, paired with a snippet identifier.

    Attributes:
        snippet_id: Integer identifier of the snippet.
        text: The text to embed.

    """

    snippet_id: int
    text: str
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@dataclass
class EmbeddingResponse:
    """Embedding vector paired with a snippet identifier.

    Attributes:
        snippet_id: Integer identifier of the snippet.
        embedding: The embedding, as a list of floats.

    """

    snippet_id: int
    embedding: list[float]
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@dataclass
class EnrichmentRequest:
    """Text submitted for enrichment, paired with a snippet identifier.

    Attributes:
        snippet_id: Integer identifier of the snippet.
        text: The text to enrich.

    """

    snippet_id: int
    text: str
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@dataclass
class EnrichmentResponse:
    """Enrichment output paired with a snippet identifier.

    Attributes:
        snippet_id: Integer identifier of the snippet.
        text: The text of the response.

    """

    snippet_id: int
    text: str
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@dataclass
class EnrichmentIndexRequest:
    """Batch of enrichment requests.

    Attributes:
        requests: The ``EnrichmentRequest`` items in the batch.

    """

    requests: list[EnrichmentRequest]
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@dataclass
class EnrichmentSearchRequest:
    """Search request for enrichments.

    Attributes:
        query: The query string.
        top_k: Result limit; defaults to 10.

    """

    query: str
    top_k: int = 10
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@dataclass
|
|
208
|
+
class IndexView:
|
|
209
|
+
"""Domain model for index information."""
|
|
210
|
+
|
|
211
|
+
id: int
|
|
212
|
+
created_at: datetime
|
|
213
|
+
num_snippets: int
|
|
214
|
+
updated_at: datetime | None = None
|
|
215
|
+
source: str | None = None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Infrastructure layer."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""BM25 infrastructure module."""
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Factory for creating BM25 repositories."""
|
|
2
|
+
|
|
3
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
4
|
+
|
|
5
|
+
from kodit.config import AppContext
|
|
6
|
+
from kodit.domain.services.bm25_service import BM25Repository
|
|
7
|
+
from kodit.infrastructure.bm25.local_bm25_repository import LocalBM25Repository
|
|
8
|
+
from kodit.infrastructure.bm25.vectorchord_bm25_repository import (
|
|
9
|
+
VectorChordBM25Repository,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def bm25_repository_factory(
    app_context: AppContext, session: AsyncSession
) -> BM25Repository:
    """Pick the BM25 repository implementation from configuration.

    Args:
        app_context: Application configuration context; its
            ``default_search.provider`` value selects the implementation.
        session: SQLAlchemy async session, used only by the VectorChord
            repository.

    Returns:
        A ``VectorChordBM25Repository`` when the provider is
        ``"vectorchord"``; otherwise a ``LocalBM25Repository`` rooted at the
        application's data directory.

    """
    use_vectorchord = app_context.default_search.provider == "vectorchord"
    if not use_vectorchord:
        return LocalBM25Repository(data_dir=app_context.get_data_dir())
    return VectorChordBM25Repository(session=session)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Local BM25 repository implementation."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
@@ -10,25 +10,31 @@ import aiofiles
|
|
|
10
10
|
import Stemmer
|
|
11
11
|
import structlog
|
|
12
12
|
|
|
13
|
-
from kodit.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
from kodit.domain.services.bm25_service import BM25Repository
|
|
14
|
+
from kodit.domain.value_objects import (
|
|
15
|
+
BM25DeleteRequest,
|
|
16
|
+
BM25IndexRequest,
|
|
17
|
+
BM25SearchRequest,
|
|
18
|
+
BM25SearchResult,
|
|
17
19
|
)
|
|
18
20
|
|
|
19
21
|
if TYPE_CHECKING:
|
|
20
22
|
import bm25s
|
|
21
23
|
from bm25s.tokenization import Tokenized
|
|
22
24
|
|
|
23
|
-
|
|
24
25
|
SNIPPET_IDS_FILE = "snippet_ids.jsonl"
|
|
25
26
|
|
|
26
27
|
|
|
27
|
-
class
|
|
28
|
-
"""
|
|
28
|
+
class LocalBM25Repository(BM25Repository):
|
|
29
|
+
"""Local BM25 repository implementation."""
|
|
29
30
|
|
|
30
31
|
def __init__(self, data_dir: Path) -> None:
|
|
31
|
-
"""Initialize the BM25
|
|
32
|
+
"""Initialize the local BM25 repository.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
data_dir: Directory to store BM25 index files
|
|
36
|
+
|
|
37
|
+
"""
|
|
32
38
|
self.log = structlog.get_logger(__name__)
|
|
33
39
|
self.index_path = data_dir / "bm25s_index"
|
|
34
40
|
self.snippet_ids: list[int] = []
|
|
@@ -51,6 +57,7 @@ class BM25Service(KeywordSearchProvider):
|
|
|
51
57
|
return self.__retriever
|
|
52
58
|
|
|
53
59
|
def _tokenize(self, corpus: list[str]) -> list[list[str]] | Tokenized:
|
|
60
|
+
"""Tokenize text corpus."""
|
|
54
61
|
from bm25s import tokenize
|
|
55
62
|
|
|
56
63
|
return tokenize(
|
|
@@ -61,23 +68,25 @@ class BM25Service(KeywordSearchProvider):
|
|
|
61
68
|
show_progress=True,
|
|
62
69
|
)
|
|
63
70
|
|
|
64
|
-
async def
|
|
65
|
-
"""Index
|
|
71
|
+
async def index_documents(self, request: BM25IndexRequest) -> None:
    """Index the request's documents and persist the index and snippet ids.

    An empty document list is logged as a warning and skipped.

    Args:
        request: Carries the documents to index.

    """
    self.log.debug("Indexing corpus")
    documents = request.documents
    if not documents:
        self.log.warning("Corpus is empty, skipping bm25 index")
        return

    texts = [doc.text for doc in documents]
    vocab = self._tokenize(texts)
    self._retriever().index(vocab, show_progress=False)
    self._retriever().save(self.index_path)

    # NOTE(review): the retriever is indexed with this request's documents
    # only, while snippet_ids accumulates across calls - confirm the two stay
    # aligned when this method is called more than once.
    new_ids = [doc.snippet_id for doc in documents]
    self.snippet_ids = self.snippet_ids + new_ids

    ids_file = self.index_path / SNIPPET_IDS_FILE
    async with aiofiles.open(ids_file, "w") as f:
        await f.write(json.dumps(self.snippet_ids))
|
|
77
86
|
|
|
78
|
-
async def
|
|
79
|
-
"""
|
|
80
|
-
if top_k == 0:
|
|
87
|
+
async def search(self, request: BM25SearchRequest) -> list[BM25SearchResult]:
|
|
88
|
+
"""Search documents using BM25."""
|
|
89
|
+
if request.top_k == 0:
|
|
81
90
|
self.log.warning("Top k is 0, returning empty list")
|
|
82
91
|
return []
|
|
83
92
|
|
|
@@ -91,14 +100,14 @@ class BM25Service(KeywordSearchProvider):
|
|
|
91
100
|
return []
|
|
92
101
|
|
|
93
102
|
# Adjust top_k to not exceed corpus size
|
|
94
|
-
top_k = min(top_k, num_docs)
|
|
103
|
+
top_k = min(request.top_k, num_docs)
|
|
95
104
|
self.log.debug(
|
|
96
105
|
"Retrieving from index",
|
|
97
|
-
query=query,
|
|
106
|
+
query=request.query,
|
|
98
107
|
top_k=top_k,
|
|
99
108
|
)
|
|
100
109
|
|
|
101
|
-
query_tokens = self._tokenize([query])
|
|
110
|
+
query_tokens = self._tokenize([request.query])
|
|
102
111
|
|
|
103
112
|
self.log.debug("Query tokens", query_tokens=query_tokens)
|
|
104
113
|
|
|
@@ -109,11 +118,13 @@ class BM25Service(KeywordSearchProvider):
|
|
|
109
118
|
)
|
|
110
119
|
self.log.debug("Raw results", results=results, scores=scores)
|
|
111
120
|
return [
|
|
112
|
-
|
|
121
|
+
BM25SearchResult(snippet_id=int(result), score=float(score))
|
|
113
122
|
for result, score in zip(results[0], scores[0], strict=False)
|
|
114
123
|
if score > 0.0
|
|
115
124
|
]
|
|
116
125
|
|
|
117
|
-
async def
|
|
126
|
+
async def delete_documents(self, request: BM25DeleteRequest) -> None:  # noqa: ARG002
    """Log that deletion is unsupported; the local index is left untouched.

    Args:
        request: Accepted for interface compatibility and ignored.

    """
    # The suppression sits on the signature line so that only this unused
    # parameter is exempted; a "# ruff: noqa: ..." comment applies file-wide.
    self.log.warning("Deletion not supported for local BM25 index")
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""VectorChord repository
|
|
1
|
+
"""VectorChord BM25 repository implementation."""
|
|
2
2
|
|
|
3
3
|
from typing import Any
|
|
4
4
|
|
|
@@ -6,10 +6,12 @@ import structlog
|
|
|
6
6
|
from sqlalchemy import Result, TextClause, bindparam, text
|
|
7
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
8
|
|
|
9
|
-
from kodit.
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
9
|
+
from kodit.domain.services.bm25_service import BM25Repository
|
|
10
|
+
from kodit.domain.value_objects import (
|
|
11
|
+
BM25DeleteRequest,
|
|
12
|
+
BM25IndexRequest,
|
|
13
|
+
BM25SearchRequest,
|
|
14
|
+
BM25SearchResult,
|
|
13
15
|
)
|
|
14
16
|
|
|
15
17
|
TABLE_NAME = "vectorchord_bm25_documents"
|
|
@@ -84,14 +86,16 @@ WHERE snippet_id IN :snippet_ids
|
|
|
84
86
|
""" # noqa: S608
|
|
85
87
|
|
|
86
88
|
|
|
87
|
-
class
|
|
88
|
-
"""BM25
|
|
89
|
+
class VectorChordBM25Repository(BM25Repository):
|
|
90
|
+
"""VectorChord BM25 repository implementation."""
|
|
89
91
|
|
|
90
|
-
def __init__(
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
92
|
+
def __init__(self, session: AsyncSession) -> None:
|
|
93
|
+
"""Initialize the VectorChord BM25 repository.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
session: The SQLAlchemy async session to use for database operations
|
|
97
|
+
|
|
98
|
+
"""
|
|
95
99
|
self.__session = session
|
|
96
100
|
self._initialized = False
|
|
97
101
|
self.log = structlog.get_logger(__name__)
|
|
@@ -142,55 +146,56 @@ class VectorChordBM25(KeywordSearchProvider):
|
|
|
142
146
|
"""Commit the session."""
|
|
143
147
|
await self.__session.commit()
|
|
144
148
|
|
|
145
|
-
async def
|
|
146
|
-
"""Index
|
|
149
|
+
async def index_documents(self, request: BM25IndexRequest) -> None:
    """Insert the request's usable documents, tokenize them and commit.

    Documents whose ``snippet_id`` is ``None`` or whose ``text`` is ``None``
    or empty are dropped. If nothing remains, a warning is logged and no
    statement is executed.

    Args:
        request: Carries the documents to index.

    """
    rows = [
        {"snippet_id": doc.snippet_id, "passage": doc.text}
        for doc in request.documents
        if doc.snippet_id is not None and doc.text is not None and doc.text != ""
    ]
    if not rows:
        self.log.warning("Corpus is empty, skipping bm25 index")
        return

    # Insert the rows first, then run the update that tokenizes them.
    await self._execute(text(INSERT_QUERY), rows)
    await self._execute(text(UPDATE_QUERY))
    await self._commit()
|
|
167
174
|
|
|
168
|
-
async def
|
|
169
|
-
"""
|
|
170
|
-
|
|
171
|
-
text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
|
|
172
|
-
{"snippet_ids": snippet_ids},
|
|
173
|
-
)
|
|
174
|
-
await self._commit()
|
|
175
|
-
|
|
176
|
-
async def retrieve(
|
|
177
|
-
self,
|
|
178
|
-
query: str,
|
|
179
|
-
top_k: int = 10,
|
|
180
|
-
) -> list[BM25Result]:
|
|
181
|
-
"""Search documents using BM25 similarity."""
|
|
182
|
-
if not query or query == "":
|
|
175
|
+
async def search(self, request: BM25SearchRequest) -> list[BM25SearchResult]:
    """Run the BM25 search query and map each row to a result.

    Args:
        request: Supplies the query text and the result limit (``top_k``).

    Returns:
        One ``BM25SearchResult`` per returned row; an empty list when the
        query is empty.

    Raises:
        RuntimeError: If executing the query or mapping its rows fails; the
            original exception is chained as the cause.

    """
    # An empty string is already falsy, so one check covers both cases.
    if not request.query:
        return []

    statement = text(SEARCH_QUERY).bindparams(
        query_text=request.query, limit=request.top_k
    )
    try:
        outcome = await self._execute(statement)
        mappings = outcome.mappings().all()
        return [
            BM25SearchResult(snippet_id=row["snippet_id"], score=row["bm25_score"])
            for row in mappings
        ]
    except Exception as e:
        msg = f"Error during BM25 search: {e}"
        raise RuntimeError(msg) from e
|
|
194
|
+
|
|
195
|
+
async def delete_documents(self, request: BM25DeleteRequest) -> None:
    """Delete the rows for the given snippet ids, then commit.

    Args:
        request: Carries the snippet ids to delete.

    """
    # "expanding=True" lets the list be bound to the IN clause of the query.
    statement = text(DELETE_QUERY).bindparams(
        bindparam("snippet_ids", expanding=True)
    )
    params = {"snippet_ids": request.snippet_ids}
    await self._execute(statement, params)
    await self._commit()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Cloning infrastructure."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Folder cloning infrastructure."""
|