kodit 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (118)
  1. kodit/_version.py +2 -2
  2. kodit/application/__init__.py +1 -0
  3. kodit/application/commands/__init__.py +1 -0
  4. kodit/application/commands/snippet_commands.py +22 -0
  5. kodit/application/services/__init__.py +1 -0
  6. kodit/application/services/indexing_application_service.py +387 -0
  7. kodit/application/services/snippet_application_service.py +149 -0
  8. kodit/cli.py +118 -82
  9. kodit/database.py +0 -22
  10. kodit/domain/__init__.py +1 -0
  11. kodit/{source/source_models.py → domain/entities.py} +88 -19
  12. kodit/domain/enums.py +9 -0
  13. kodit/domain/errors.py +5 -0
  14. kodit/domain/interfaces.py +27 -0
  15. kodit/domain/repositories.py +95 -0
  16. kodit/domain/services/__init__.py +1 -0
  17. kodit/domain/services/bm25_service.py +124 -0
  18. kodit/domain/services/embedding_service.py +155 -0
  19. kodit/domain/services/enrichment_service.py +48 -0
  20. kodit/domain/services/ignore_service.py +45 -0
  21. kodit/domain/services/indexing_service.py +203 -0
  22. kodit/domain/services/snippet_extraction_service.py +89 -0
  23. kodit/domain/services/source_service.py +85 -0
  24. kodit/domain/value_objects.py +215 -0
  25. kodit/infrastructure/__init__.py +1 -0
  26. kodit/infrastructure/bm25/__init__.py +1 -0
  27. kodit/infrastructure/bm25/bm25_factory.py +28 -0
  28. kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
  29. kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
  30. kodit/infrastructure/cloning/__init__.py +1 -0
  31. kodit/infrastructure/cloning/folder/__init__.py +1 -0
  32. kodit/infrastructure/cloning/folder/factory.py +128 -0
  33. kodit/infrastructure/cloning/folder/working_copy.py +38 -0
  34. kodit/infrastructure/cloning/git/__init__.py +1 -0
  35. kodit/infrastructure/cloning/git/factory.py +147 -0
  36. kodit/infrastructure/cloning/git/working_copy.py +32 -0
  37. kodit/infrastructure/cloning/metadata.py +127 -0
  38. kodit/infrastructure/embedding/__init__.py +1 -0
  39. kodit/infrastructure/embedding/embedding_factory.py +87 -0
  40. kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
  41. kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
  42. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
  43. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
  44. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
  45. kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
  46. kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
  47. kodit/infrastructure/enrichment/__init__.py +1 -0
  48. kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
  49. kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
  50. kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
  51. kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
  52. kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
  53. kodit/infrastructure/git/__init__.py +1 -0
  54. kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
  55. kodit/infrastructure/ignore/__init__.py +1 -0
  56. kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
  57. kodit/infrastructure/indexing/__init__.py +1 -0
  58. kodit/infrastructure/indexing/fusion_service.py +55 -0
  59. kodit/infrastructure/indexing/index_repository.py +291 -0
  60. kodit/infrastructure/indexing/indexing_factory.py +113 -0
  61. kodit/infrastructure/snippet_extraction/__init__.py +1 -0
  62. kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
  63. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
  64. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
  65. kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
  66. kodit/infrastructure/sqlalchemy/__init__.py +1 -0
  67. kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -26
  68. kodit/infrastructure/sqlalchemy/file_repository.py +78 -0
  69. kodit/infrastructure/sqlalchemy/repository.py +133 -0
  70. kodit/infrastructure/sqlalchemy/snippet_repository.py +79 -0
  71. kodit/infrastructure/ui/__init__.py +1 -0
  72. kodit/infrastructure/ui/progress.py +127 -0
  73. kodit/{util → infrastructure/ui}/spinner.py +19 -4
  74. kodit/mcp.py +51 -28
  75. kodit/migrations/env.py +1 -4
  76. kodit/reporting.py +78 -0
  77. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/METADATA +1 -1
  78. kodit-0.2.6.dist-info/RECORD +100 -0
  79. kodit/bm25/__init__.py +0 -1
  80. kodit/bm25/keyword_search_factory.py +0 -17
  81. kodit/bm25/keyword_search_service.py +0 -34
  82. kodit/embedding/__init__.py +0 -1
  83. kodit/embedding/embedding_factory.py +0 -69
  84. kodit/embedding/embedding_models.py +0 -28
  85. kodit/embedding/embedding_provider/__init__.py +0 -1
  86. kodit/embedding/embedding_provider/embedding_provider.py +0 -92
  87. kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
  88. kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
  89. kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
  90. kodit/embedding/local_vector_search_service.py +0 -87
  91. kodit/embedding/vector_search_service.py +0 -55
  92. kodit/enrichment/__init__.py +0 -1
  93. kodit/enrichment/enrichment_provider/__init__.py +0 -1
  94. kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
  95. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
  96. kodit/enrichment/enrichment_service.py +0 -45
  97. kodit/indexing/__init__.py +0 -1
  98. kodit/indexing/fusion.py +0 -67
  99. kodit/indexing/indexing_models.py +0 -43
  100. kodit/indexing/indexing_repository.py +0 -216
  101. kodit/indexing/indexing_service.py +0 -344
  102. kodit/snippets/__init__.py +0 -1
  103. kodit/snippets/languages/__init__.py +0 -53
  104. kodit/snippets/snippets.py +0 -50
  105. kodit/source/__init__.py +0 -1
  106. kodit/source/source_factories.py +0 -356
  107. kodit/source/source_repository.py +0 -169
  108. kodit/source/source_service.py +0 -150
  109. kodit/util/__init__.py +0 -1
  110. kodit-0.2.4.dist-info/RECORD +0 -71
  111. kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
  112. kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
  113. kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
  114. kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
  115. kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
  116. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/WHEEL +0 -0
  117. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/entry_points.txt +0 -0
  118. {kodit-0.2.4.dist-info → kodit-0.2.6.dist-info}/licenses/LICENSE +0 -0
kodit/indexing/fusion.py DELETED
@@ -1,67 +0,0 @@
1
- """Fusion functions for combining search results."""
2
-
3
- from collections import defaultdict
4
- from dataclasses import dataclass
5
-
6
-
7
- @dataclass
8
- class FusionResult:
9
- """Result of a fusion operation."""
10
-
11
- id: int
12
- score: float
13
- original_scores: list[float]
14
-
15
-
16
- @dataclass
17
- class FusionRequest:
18
- """Result of a RRF operation."""
19
-
20
- id: int
21
- score: float
22
-
23
-
24
- def reciprocal_rank_fusion(
25
- rankings: list[list[FusionRequest]], k: float = 60
26
- ) -> list[FusionResult]:
27
- """RRF prioritises results that are present in all results.
28
-
29
- Args:
30
- rankings: List of rankers, each containing a list of document ids. Top of the
31
- list is considered to be the best result.
32
- k: Parameter for RRF.
33
-
34
- Returns:
35
- Dictionary of ids and their scores.
36
-
37
- """
38
- scores = {}
39
- for ranker in rankings:
40
- for rank in ranker:
41
- scores[rank.id] = float(0)
42
-
43
- for ranker in rankings:
44
- for i, rank in enumerate(ranker):
45
- scores[rank.id] += 1.0 / (k + i)
46
-
47
- # Create a list of tuples of ids and their scores
48
- results = [(rank, scores[rank]) for rank in scores]
49
-
50
- # Sort results by score
51
- results.sort(key=lambda x: x[1], reverse=True)
52
-
53
- # Create a map of original scores to ids
54
- original_scores_to_ids = defaultdict(list)
55
- for ranker in rankings:
56
- for rank in ranker:
57
- original_scores_to_ids[rank.id].append(rank.score)
58
-
59
- # Rebuild a list of final results with their original scores
60
- return [
61
- FusionResult(
62
- id=result[0],
63
- score=result[1],
64
- original_scores=original_scores_to_ids[result[0]],
65
- )
66
- for result in results
67
- ]
kodit/indexing/indexing_models.py DELETED
@@ -1,43 +0,0 @@
1
- """Index models for managing code indexes.
2
-
3
- This module defines the SQLAlchemy models used for storing and managing code indexes,
4
- including files and snippets. It provides the data structures for tracking indexed
5
- files and their content.
6
- """
7
-
8
- from sqlalchemy import ForeignKey, UnicodeText
9
- from sqlalchemy.orm import Mapped, mapped_column
10
-
11
- from kodit.database import Base, CommonMixin
12
-
13
-
14
- class Index(Base, CommonMixin):
15
- """Index model."""
16
-
17
- __tablename__ = "indexes"
18
-
19
- source_id: Mapped[int] = mapped_column(
20
- ForeignKey("sources.id"), unique=True, index=True
21
- )
22
-
23
- def __init__(self, source_id: int) -> None:
24
- """Initialize the index."""
25
- super().__init__()
26
- self.source_id = source_id
27
-
28
-
29
- class Snippet(Base, CommonMixin):
30
- """Snippet model."""
31
-
32
- __tablename__ = "snippets"
33
-
34
- file_id: Mapped[int] = mapped_column(ForeignKey("files.id"), index=True)
35
- index_id: Mapped[int] = mapped_column(ForeignKey("indexes.id"), index=True)
36
- content: Mapped[str] = mapped_column(UnicodeText, default="")
37
-
38
- def __init__(self, file_id: int, index_id: int, content: str) -> None:
39
- """Initialize the snippet."""
40
- super().__init__()
41
- self.file_id = file_id
42
- self.index_id = index_id
43
- self.content = content
kodit/indexing/indexing_repository.py DELETED
@@ -1,216 +0,0 @@
1
- """Repository for managing code indexes and their associated files and snippets.
2
-
3
- This module provides the IndexRepository class which handles all database operations
4
- related to code indexes, including creating indexes, managing files and snippets,
5
- and retrieving index information with their associated metadata.
6
- """
7
-
8
- from datetime import UTC, datetime
9
- from typing import TypeVar
10
-
11
- from sqlalchemy import delete, func, select
12
- from sqlalchemy.ext.asyncio import AsyncSession
13
-
14
- from kodit.embedding.embedding_models import Embedding
15
- from kodit.indexing.indexing_models import Index, Snippet
16
- from kodit.source.source_models import File, Source
17
-
18
- T = TypeVar("T")
19
-
20
-
21
- class IndexRepository:
22
- """Repository for managing code indexes and their associated data.
23
-
24
- This class provides methods for creating and managing code indexes, including
25
- their associated files and snippets. It handles all database operations related
26
- to indexing code sources.
27
- """
28
-
29
- def __init__(self, session: AsyncSession) -> None:
30
- """Initialize the index repository.
31
-
32
- Args:
33
- session: The SQLAlchemy async session to use for database operations.
34
-
35
- """
36
- self.session = session
37
-
38
- async def create(self, source_id: int) -> Index:
39
- """Create a new index for a source.
40
-
41
- Args:
42
- source_id: The ID of the source to create an index for.
43
-
44
- Returns:
45
- The newly created Index instance.
46
-
47
- """
48
- index = Index(source_id=source_id)
49
- self.session.add(index)
50
- await self.session.commit()
51
- return index
52
-
53
- async def get_by_id(self, index_id: int) -> Index | None:
54
- """Get an index by its ID.
55
-
56
- Args:
57
- index_id: The ID of the index to retrieve.
58
-
59
- Returns:
60
- The Index instance if found, None otherwise.
61
-
62
- """
63
- query = select(Index).where(Index.id == index_id)
64
- result = await self.session.execute(query)
65
- return result.scalar_one_or_none()
66
-
67
- async def get_by_source_id(self, source_id: int) -> Index | None:
68
- """Get an index by its source ID.
69
-
70
- Args:
71
- source_id: The ID of the source to retrieve an index for.
72
-
73
- """
74
- query = select(Index).where(Index.source_id == source_id)
75
- result = await self.session.execute(query)
76
- return result.scalar_one_or_none()
77
-
78
- async def files_for_index(self, index_id: int) -> list[File]:
79
- """Get all files for an index.
80
-
81
- Args:
82
- index_id: The ID of the index to get files for.
83
-
84
- Returns:
85
- A list of File instances.
86
-
87
- """
88
- query = (
89
- select(File)
90
- .join(Source, File.source_id == Source.id)
91
- .join(Index, Index.source_id == Source.id)
92
- .where(Index.id == index_id)
93
- )
94
- result = await self.session.execute(query)
95
- return list(result.scalars())
96
-
97
- async def list_indexes(self) -> list[tuple[Index, Source]]:
98
- """List all indexes.
99
-
100
- Returns:
101
- A list of tuples containing index information, source details,
102
- and counts of files and snippets.
103
-
104
- """
105
- query = select(Index, Source).join(
106
- Source, Index.source_id == Source.id, full=True
107
- )
108
- result = await self.session.execute(query)
109
- return list(result.tuples())
110
-
111
- async def num_snippets_for_index(self, index_id: int) -> int:
112
- """Get the number of snippets for an index."""
113
- query = select(func.count()).where(Snippet.index_id == index_id)
114
- result = await self.session.execute(query)
115
- return result.scalar_one()
116
-
117
- async def update_index_timestamp(self, index: Index) -> None:
118
- """Update the updated_at timestamp of an index.
119
-
120
- Args:
121
- index: The Index instance to update.
122
-
123
- """
124
- index.updated_at = datetime.now(UTC)
125
- await self.session.commit()
126
-
127
- async def add_snippet(self, snippet: Snippet) -> None:
128
- """Add a new snippet to the database if it doesn't exist, otherwise update it.
129
-
130
- Args:
131
- snippet: The Snippet instance to add.
132
-
133
- """
134
- self.session.add(snippet)
135
- await self.session.commit()
136
-
137
- async def delete_all_snippets(self, index_id: int) -> None:
138
- """Delete all snippets for an index.
139
-
140
- Args:
141
- index_id: The ID of the index to delete snippets for.
142
-
143
- """
144
- # First get all snippets for this index
145
- snippets = await self.get_snippets_for_index(index_id)
146
-
147
- # Delete all embeddings for these snippets, if there are any
148
- for snippet in snippets:
149
- query = delete(Embedding).where(Embedding.snippet_id == snippet.id)
150
- await self.session.execute(query)
151
-
152
- # Now delete the snippets
153
- query = delete(Snippet).where(Snippet.index_id == index_id)
154
- await self.session.execute(query)
155
- await self.session.commit()
156
-
157
- async def get_snippets_for_index(self, index_id: int) -> list[Snippet]:
158
- """Get all snippets for an index.
159
-
160
- Args:
161
- index_id: The ID of the index to get snippets for.
162
-
163
- """
164
- query = select(Snippet).where(Snippet.index_id == index_id)
165
- result = await self.session.execute(query)
166
- return list(result.scalars())
167
-
168
- async def get_all_snippets(self, index_id: int) -> list[Snippet]:
169
- """Get all snippets.
170
-
171
- Returns:
172
- A list of all snippets.
173
-
174
- """
175
- query = select(Snippet).where(Snippet.index_id == index_id).order_by(Snippet.id)
176
- result = await self.session.execute(query)
177
- return list(result.scalars())
178
-
179
- async def add_embedding(self, embedding: Embedding) -> None:
180
- """Add a new embedding to the database.
181
-
182
- Args:
183
- embedding: The Embedding instance to add.
184
-
185
- """
186
- self.session.add(embedding)
187
- await self.session.commit()
188
-
189
- async def list_snippets_by_ids(self, ids: list[int]) -> list[tuple[File, Snippet]]:
190
- """List snippets by IDs.
191
-
192
- Returns:
193
- A list of snippets in the same order as the input IDs.
194
-
195
- """
196
- query = (
197
- select(Snippet, File)
198
- .where(Snippet.id.in_(ids))
199
- .join(File, Snippet.file_id == File.id)
200
- )
201
- rows = await self.session.execute(query)
202
-
203
- # Create a dictionary for O(1) lookup of results by ID
204
- id_to_result = {snippet.id: (file, snippet) for snippet, file in rows.all()}
205
-
206
- # Check that all IDs are present
207
- if len(id_to_result) != len(ids):
208
- # Create a list of missing IDs
209
- missing_ids = [
210
- snippet_id for snippet_id in ids if snippet_id not in id_to_result
211
- ]
212
- msg = f"Some IDs are not present: {missing_ids}"
213
- raise ValueError(msg)
214
-
215
- # Rebuild the list in the same order that it was passed in
216
- return [id_to_result[i] for i in ids]
kodit/indexing/indexing_service.py DELETED
@@ -1,344 +0,0 @@
1
- """Index service for managing code indexes.
2
-
3
- This module provides the IndexService class which handles the business logic for
4
- creating, listing, and running code indexes. It orchestrates the interaction between the
5
- file system, database operations (via IndexRepository), and provides a clean API for
6
- index management.
7
- """
8
-
9
- from datetime import datetime
10
- from pathlib import Path
11
-
12
- import pydantic
13
- import structlog
14
- from tqdm.asyncio import tqdm
15
-
16
- from kodit.bm25.keyword_search_service import (
17
- BM25Document,
18
- BM25Result,
19
- KeywordSearchProvider,
20
- )
21
- from kodit.embedding.vector_search_service import (
22
- VectorSearchRequest,
23
- VectorSearchService,
24
- )
25
- from kodit.enrichment.enrichment_provider.enrichment_provider import EnrichmentRequest
26
- from kodit.enrichment.enrichment_service import EnrichmentService
27
- from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
28
- from kodit.indexing.indexing_models import Snippet
29
- from kodit.indexing.indexing_repository import IndexRepository
30
- from kodit.log import log_event
31
- from kodit.snippets.snippets import SnippetService
32
- from kodit.source.source_service import SourceService
33
- from kodit.util.spinner import Spinner
34
-
35
- # List of MIME types that are blacklisted from being indexed
36
- MIME_BLACKLIST = ["unknown/unknown"]
37
-
38
-
39
- class IndexView(pydantic.BaseModel):
40
- """Data transfer object for index information.
41
-
42
- This model represents the public interface for index data, providing a clean
43
- view of index information without exposing internal implementation details.
44
- """
45
-
46
- id: int
47
- created_at: datetime
48
- updated_at: datetime | None = None
49
- source: str | None = None
50
- num_snippets: int
51
-
52
-
53
- class SearchRequest(pydantic.BaseModel):
54
- """Request for a search."""
55
-
56
- text_query: str | None = None
57
- code_query: str | None = None
58
- keywords: list[str] | None = None
59
- top_k: int = 10
60
-
61
-
62
- class SearchResult(pydantic.BaseModel):
63
- """Data transfer object for search results.
64
-
65
- This model represents a single search result, containing both the file path
66
- and the matching snippet content.
67
- """
68
-
69
- id: int
70
- uri: str
71
- content: str
72
- original_scores: list[float]
73
-
74
-
75
- class IndexService:
76
- """Service for managing code indexes.
77
-
78
- This service handles the business logic for creating, listing, and running code
79
- indexes. It coordinates between file system operations, database operations (via
80
- IndexRepository), and provides a clean API for index management.
81
- """
82
-
83
- def __init__( # noqa: PLR0913
84
- self,
85
- repository: IndexRepository,
86
- source_service: SourceService,
87
- keyword_search_provider: KeywordSearchProvider,
88
- code_search_service: VectorSearchService,
89
- text_search_service: VectorSearchService,
90
- enrichment_service: EnrichmentService,
91
- ) -> None:
92
- """Initialize the index service.
93
-
94
- Args:
95
- repository: The repository instance to use for database operations.
96
- source_service: The source service instance to use for source validation.
97
-
98
- """
99
- self.repository = repository
100
- self.source_service = source_service
101
- self.snippet_service = SnippetService()
102
- self.log = structlog.get_logger(__name__)
103
- self.keyword_search_provider = keyword_search_provider
104
- self.code_search_service = code_search_service
105
- self.text_search_service = text_search_service
106
- self.enrichment_service = enrichment_service
107
-
108
- async def create(self, source_id: int) -> IndexView:
109
- """Create a new index for a source.
110
-
111
- This method creates a new index for the specified source, after validating
112
- that the source exists and doesn't already have an index.
113
-
114
- Args:
115
- source_id: The ID of the source to create an index for.
116
-
117
- Returns:
118
- An Index object representing the newly created index.
119
-
120
- Raises:
121
- ValueError: If the source doesn't exist or already has an index.
122
-
123
- """
124
- log_event("kodit.index.create")
125
-
126
- # Check if the source exists
127
- source = await self.source_service.get(source_id)
128
-
129
- # Check if the index already exists
130
- index = await self.repository.get_by_source_id(source.id)
131
- if not index:
132
- index = await self.repository.create(source.id)
133
- return IndexView(
134
- id=index.id,
135
- created_at=index.created_at,
136
- num_snippets=await self.repository.num_snippets_for_index(index.id),
137
- source=source.uri,
138
- )
139
-
140
- async def list_indexes(self) -> list[IndexView]:
141
- """List all available indexes with their details.
142
-
143
- Returns:
144
- A list of Index objects containing information about each index,
145
- including file and snippet counts.
146
-
147
- """
148
- indexes = await self.repository.list_indexes()
149
-
150
- # Transform database results into DTOs
151
- indexes = [
152
- IndexView(
153
- id=index.id,
154
- created_at=index.created_at,
155
- updated_at=index.updated_at,
156
- num_snippets=await self.repository.num_snippets_for_index(index.id)
157
- or 0,
158
- source=source.uri,
159
- )
160
- for index, source in indexes
161
- ]
162
-
163
- # Help Kodit by measuring how much people are using indexes
164
- log_event(
165
- "kodit.index.list",
166
- {
167
- "num_indexes": len(indexes),
168
- "num_snippets": sum([index.num_snippets for index in indexes]),
169
- },
170
- )
171
-
172
- return indexes
173
-
174
- async def run(self, index_id: int) -> None:
175
- """Run the indexing process for a specific index."""
176
- log_event("kodit.index.run")
177
-
178
- # Get and validate index
179
- index = await self.repository.get_by_id(index_id)
180
- if not index:
181
- msg = f"Index not found: {index_id}"
182
- raise ValueError(msg)
183
-
184
- # Delete old snippets so we don't duplicate. In the future should probably check
185
- # which files have changed and only change those.
186
- await self.repository.delete_all_snippets(index.id)
187
-
188
- # Create snippets for supported file types
189
- self.log.info("Creating snippets for files", index_id=index.id)
190
- await self._create_snippets(index.id)
191
-
192
- snippets = await self.repository.get_all_snippets(index.id)
193
-
194
- self.log.info("Creating keyword index")
195
- with Spinner():
196
- await self.keyword_search_provider.index(
197
- [
198
- BM25Document(snippet_id=snippet.id, text=snippet.content)
199
- for snippet in snippets
200
- ]
201
- )
202
-
203
- self.log.info("Creating semantic code index")
204
- with tqdm(total=len(snippets), leave=False) as pbar:
205
- async for result in self.code_search_service.index(
206
- [
207
- VectorSearchRequest(snippet.id, snippet.content)
208
- for snippet in snippets
209
- ]
210
- ):
211
- pbar.update(len(result))
212
-
213
- self.log.info("Enriching snippets", num_snippets=len(snippets))
214
- enriched_contents = []
215
- with tqdm(total=len(snippets), leave=False) as pbar:
216
- async for result in self.enrichment_service.enrich(
217
- [
218
- EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
219
- for snippet in snippets
220
- ]
221
- ):
222
- snippet = next(s for s in snippets if s.id == result.snippet_id)
223
- if snippet:
224
- snippet.content = (
225
- result.text + "\n\n```\n" + snippet.content + "\n```"
226
- )
227
- await self.repository.add_snippet(snippet)
228
- enriched_contents.append(result)
229
- pbar.update(1)
230
-
231
- self.log.info("Creating semantic text index")
232
- with tqdm(total=len(snippets), leave=False) as pbar:
233
- async for result in self.text_search_service.index(
234
- [
235
- VectorSearchRequest(snippet.id, snippet.content)
236
- for snippet in snippets
237
- ]
238
- ):
239
- pbar.update(len(result))
240
-
241
- # Update index timestamp
242
- await self.repository.update_index_timestamp(index)
243
-
244
- async def search(self, request: SearchRequest) -> list[SearchResult]:
245
- """Search for relevant data."""
246
- log_event("kodit.index.search")
247
-
248
- fusion_list: list[list[FusionRequest]] = []
249
- if request.keywords:
250
- # Gather results for each keyword
251
- result_ids: list[BM25Result] = []
252
- for keyword in request.keywords:
253
- results = await self.keyword_search_provider.retrieve(
254
- keyword, request.top_k
255
- )
256
- result_ids.extend(results)
257
-
258
- fusion_list.append(
259
- [FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
260
- )
261
-
262
- # Compute embedding for semantic query
263
- if request.code_query:
264
- query_embedding = await self.code_search_service.retrieve(
265
- request.code_query, top_k=request.top_k
266
- )
267
- fusion_list.append(
268
- [FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
269
- )
270
-
271
- if request.text_query:
272
- query_embedding = await self.text_search_service.retrieve(
273
- request.text_query, top_k=request.top_k
274
- )
275
- fusion_list.append(
276
- [FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
277
- )
278
-
279
- if len(fusion_list) == 0:
280
- return []
281
-
282
- # Combine all results together with RFF if required
283
- final_results = reciprocal_rank_fusion(
284
- rankings=fusion_list,
285
- k=60,
286
- )
287
-
288
- # Only keep top_k results
289
- final_results = final_results[: request.top_k]
290
-
291
- # Get snippets from database (up to top_k)
292
- search_results = await self.repository.list_snippets_by_ids(
293
- [x.id for x in final_results]
294
- )
295
-
296
- return [
297
- SearchResult(
298
- id=snippet.id,
299
- uri=file.uri,
300
- content=snippet.content,
301
- original_scores=fr.original_scores,
302
- )
303
- for (file, snippet), fr in zip(search_results, final_results, strict=True)
304
- ]
305
-
306
- async def _create_snippets(
307
- self,
308
- index_id: int,
309
- ) -> None:
310
- """Create snippets for supported files.
311
-
312
- Args:
313
- index: The index to create snippets for.
314
- file_list: List of files to create snippets from.
315
- existing_snippets_set: Set of file IDs that already have snippets.
316
-
317
- """
318
- files = await self.repository.files_for_index(index_id)
319
- if not files:
320
- self.log.warning("No files to create snippets for")
321
- return
322
-
323
- for file in tqdm(files, total=len(files), leave=False):
324
- # Skip unsupported file types
325
- if file.mime_type in MIME_BLACKLIST:
326
- self.log.debug("Skipping mime type", mime_type=file.mime_type)
327
- continue
328
-
329
- # Create snippet from file content
330
- try:
331
- snippets = self.snippet_service.snippets_for_file(
332
- Path(file.cloned_path)
333
- )
334
- except ValueError as e:
335
- self.log.debug("Skipping file", file=file.cloned_path, error=e)
336
- continue
337
-
338
- for snippet in snippets:
339
- s = Snippet(
340
- index_id=index_id,
341
- file_id=file.id,
342
- content=snippet.text,
343
- )
344
- await self.repository.add_snippet(s)
kodit/snippets/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Extract method snippets from source code."""