kodit 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic; see the registry advisory for more details.

Files changed (117)
  1. kodit/_version.py +2 -2
  2. kodit/application/__init__.py +1 -0
  3. kodit/application/commands/__init__.py +1 -0
  4. kodit/application/commands/snippet_commands.py +22 -0
  5. kodit/application/services/__init__.py +1 -0
  6. kodit/application/services/indexing_application_service.py +363 -0
  7. kodit/application/services/snippet_application_service.py +143 -0
  8. kodit/cli.py +105 -82
  9. kodit/database.py +0 -22
  10. kodit/domain/__init__.py +1 -0
  11. kodit/{source/source_models.py → domain/entities.py} +88 -19
  12. kodit/domain/enums.py +9 -0
  13. kodit/domain/interfaces.py +27 -0
  14. kodit/domain/repositories.py +95 -0
  15. kodit/domain/services/__init__.py +1 -0
  16. kodit/domain/services/bm25_service.py +124 -0
  17. kodit/domain/services/embedding_service.py +155 -0
  18. kodit/domain/services/enrichment_service.py +48 -0
  19. kodit/domain/services/ignore_service.py +45 -0
  20. kodit/domain/services/indexing_service.py +203 -0
  21. kodit/domain/services/snippet_extraction_service.py +89 -0
  22. kodit/domain/services/source_service.py +83 -0
  23. kodit/domain/value_objects.py +215 -0
  24. kodit/infrastructure/__init__.py +1 -0
  25. kodit/infrastructure/bm25/__init__.py +1 -0
  26. kodit/infrastructure/bm25/bm25_factory.py +28 -0
  27. kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
  28. kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
  29. kodit/infrastructure/cloning/__init__.py +1 -0
  30. kodit/infrastructure/cloning/folder/__init__.py +1 -0
  31. kodit/infrastructure/cloning/folder/factory.py +119 -0
  32. kodit/infrastructure/cloning/folder/working_copy.py +38 -0
  33. kodit/infrastructure/cloning/git/__init__.py +1 -0
  34. kodit/infrastructure/cloning/git/factory.py +133 -0
  35. kodit/infrastructure/cloning/git/working_copy.py +32 -0
  36. kodit/infrastructure/cloning/metadata.py +127 -0
  37. kodit/infrastructure/embedding/__init__.py +1 -0
  38. kodit/infrastructure/embedding/embedding_factory.py +87 -0
  39. kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
  40. kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
  41. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
  42. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
  43. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
  44. kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
  45. kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +65 -46
  46. kodit/infrastructure/enrichment/__init__.py +1 -0
  47. kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
  48. kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
  49. kodit/{enrichment/enrichment_provider → infrastructure/enrichment}/local_enrichment_provider.py +38 -26
  50. kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
  51. kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
  52. kodit/infrastructure/git/__init__.py +1 -0
  53. kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
  54. kodit/infrastructure/ignore/__init__.py +1 -0
  55. kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
  56. kodit/infrastructure/indexing/__init__.py +1 -0
  57. kodit/infrastructure/indexing/fusion_service.py +55 -0
  58. kodit/infrastructure/indexing/index_repository.py +296 -0
  59. kodit/infrastructure/indexing/indexing_factory.py +111 -0
  60. kodit/infrastructure/snippet_extraction/__init__.py +1 -0
  61. kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
  62. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
  63. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
  64. kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
  65. kodit/infrastructure/sqlalchemy/__init__.py +1 -0
  66. kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
  67. kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
  68. kodit/infrastructure/sqlalchemy/repository.py +121 -0
  69. kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
  70. kodit/infrastructure/ui/__init__.py +1 -0
  71. kodit/infrastructure/ui/progress.py +127 -0
  72. kodit/{util → infrastructure/ui}/spinner.py +19 -4
  73. kodit/mcp.py +50 -28
  74. kodit/migrations/env.py +1 -4
  75. kodit/reporting.py +78 -0
  76. {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
  77. kodit-0.2.5.dist-info/RECORD +99 -0
  78. kodit/bm25/__init__.py +0 -1
  79. kodit/bm25/keyword_search_factory.py +0 -17
  80. kodit/bm25/keyword_search_service.py +0 -34
  81. kodit/embedding/__init__.py +0 -1
  82. kodit/embedding/embedding_factory.py +0 -69
  83. kodit/embedding/embedding_models.py +0 -28
  84. kodit/embedding/embedding_provider/__init__.py +0 -1
  85. kodit/embedding/embedding_provider/embedding_provider.py +0 -92
  86. kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -86
  87. kodit/embedding/embedding_provider/local_embedding_provider.py +0 -96
  88. kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -73
  89. kodit/embedding/local_vector_search_service.py +0 -87
  90. kodit/embedding/vector_search_service.py +0 -55
  91. kodit/enrichment/__init__.py +0 -1
  92. kodit/enrichment/enrichment_provider/__init__.py +0 -1
  93. kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -36
  94. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -79
  95. kodit/enrichment/enrichment_service.py +0 -45
  96. kodit/indexing/__init__.py +0 -1
  97. kodit/indexing/fusion.py +0 -67
  98. kodit/indexing/indexing_models.py +0 -43
  99. kodit/indexing/indexing_repository.py +0 -216
  100. kodit/indexing/indexing_service.py +0 -344
  101. kodit/snippets/__init__.py +0 -1
  102. kodit/snippets/languages/__init__.py +0 -53
  103. kodit/snippets/snippets.py +0 -50
  104. kodit/source/__init__.py +0 -1
  105. kodit/source/source_factories.py +0 -356
  106. kodit/source/source_repository.py +0 -169
  107. kodit/source/source_service.py +0 -150
  108. kodit/util/__init__.py +0 -1
  109. kodit-0.2.4.dist-info/RECORD +0 -71
  110. /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
  111. /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
  112. /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
  113. /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
  114. /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
  115. {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
  116. {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
  117. {kodit-0.2.4.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,119 @@
1
+ """Factory for creating folder-based working copies."""
2
+
3
+ from pathlib import Path
4
+
5
+ import structlog
6
+
7
+ from kodit.domain.entities import AuthorFileMapping, Source, SourceType
8
+ from kodit.domain.interfaces import NullProgressCallback, ProgressCallback
9
+ from kodit.domain.repositories import SourceRepository
10
+ from kodit.domain.value_objects import ProgressEvent
11
+ from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider
12
+ from kodit.infrastructure.cloning.metadata import (
13
+ FolderFileMetadataExtractor,
14
+ NoOpAuthorExtractor,
15
+ )
16
+
17
+
18
class FolderSourceFactory:
    """Factory that registers plain directories as kodit sources.

    Builds a working copy of the target folder, records the source and its
    files in the repository, and reports progress through an optional
    callback. Folder sources carry no version-control history, so a no-op
    author extractor is used.
    """

    def __init__(
        self,
        repository: SourceRepository,
        working_copy: FolderWorkingCopyProvider,
    ) -> None:
        """Initialize the source factory."""
        self.log = structlog.get_logger(__name__)
        self.repository = repository
        self.working_copy = working_copy
        self.metadata_extractor = FolderFileMetadataExtractor()
        self.author_extractor = NoOpAuthorExtractor()

    async def create(
        self, uri: str, progress_callback: ProgressCallback | None = None
    ) -> Source:
        """Create a folder source from a path."""
        # Fall back to a callback that ignores every event.
        if progress_callback is None:
            progress_callback = NullProgressCallback()

        directory = Path(uri).expanduser().resolve()

        # Reuse an existing source registered for the same directory.
        existing = await self.repository.get_by_uri(directory.as_uri())
        if existing:
            self.log.info("Source already exists, reusing...", source_id=existing.id)
            return existing

        if not directory.exists():
            msg = f"Folder does not exist: {directory}"
            raise ValueError(msg)

        # Copy the folder into the managed clone area.
        clone_path = await self.working_copy.prepare(directory.as_uri())

        source = await self.repository.create_source(
            Source(
                uri=directory.as_uri(),
                cloned_path=str(clone_path),
                source_type=SourceType.FOLDER,
            )
        )

        # Collect every regular file under the working copy.
        files = [entry for entry in clone_path.rglob("*") if entry.is_file()]

        await self._process_files(source, files, progress_callback)

        return source

    async def _process_files(
        self, source: Source, files: list[Path], progress_callback: ProgressCallback
    ) -> None:
        """Process files for a source."""
        total_files = len(files)

        # Announce the start of the operation (0 of N done).
        await progress_callback.on_progress(
            ProgressEvent(
                operation="process_files",
                current=0,
                total=total_files,
                message="Processing files...",
            )
        )

        for index, file_path in enumerate(files, 1):
            # Entries may disappear between listing and processing.
            if not file_path.is_file():
                continue

            # Persist per-file metadata.
            record = await self.metadata_extractor.extract(file_path, source)
            await self.repository.create_file(record)

            # Link any known authors to the file (no-op for folder sources).
            for author in await self.author_extractor.extract(file_path, source):
                await self.repository.upsert_author_file_mapping(
                    AuthorFileMapping(author_id=author.id, file_id=record.id)
                )

            await progress_callback.on_progress(
                ProgressEvent(
                    operation="process_files",
                    current=index,
                    total=total_files,
                    message=f"Processing {file_path.name}...",
                )
            )

        # Notify completion
        await progress_callback.on_complete("process_files")
@@ -0,0 +1,38 @@
1
+ """Working copy provider for folder-based sources."""
2
+
3
+ import shutil
4
+ from pathlib import Path
5
+
6
+
7
class FolderWorkingCopyProvider:
    """Working copy provider for folder-based sources.

    Copies the contents of a local directory into a managed clone
    directory so that indexing never touches the user's original files.
    """

    def __init__(self, clone_dir: Path) -> None:
        """Initialize the provider.

        Args:
            clone_dir: Root directory under which working copies are created.

        """
        self.clone_dir = clone_dir

    async def prepare(self, uri: str) -> Path:
        """Prepare a folder working copy.

        Accepts either a plain filesystem path or a ``file://`` URI and
        returns the path of the populated working copy.

        Raises:
            FileNotFoundError: If the resolved directory does not exist.

        """
        if uri.startswith("file://"):
            from urllib.parse import unquote, urlparse

            # BUGFIX: percent-decode the URI path (e.g. %20 -> space)
            # before treating it as a filesystem path; Path.as_uri()
            # produces percent-encoded URIs.
            parsed = urlparse(uri)
            directory = Path(unquote(parsed.path)).expanduser().resolve()
        else:
            directory = Path(uri).expanduser().resolve()

        # Derive a stable, unique directory name from the source path.
        clone_path = self.clone_dir / directory.as_posix().replace("/", "_")
        clone_path.mkdir(parents=True, exist_ok=True)

        # Copy all files recursively, preserving directory structure and
        # skipping hidden files/directories (names starting with ".").
        shutil.copytree(
            directory,
            clone_path,
            ignore=shutil.ignore_patterns(".*"),
            dirs_exist_ok=True,
        )

        return clone_path
@@ -0,0 +1 @@
1
+ """Git cloning infrastructure."""
@@ -0,0 +1,133 @@
1
+ """Factory for creating git-based working copies."""
2
+
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ import git
7
+ import structlog
8
+
9
+ from kodit.domain.entities import AuthorFileMapping, Source, SourceType
10
+ from kodit.domain.interfaces import NullProgressCallback, ProgressCallback
11
+ from kodit.domain.repositories import SourceRepository
12
+ from kodit.domain.services.ignore_service import IgnoreService
13
+ from kodit.domain.value_objects import ProgressEvent
14
+ from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
15
+ from kodit.infrastructure.cloning.metadata import (
16
+ GitAuthorExtractor,
17
+ GitFileMetadataExtractor,
18
+ )
19
+ from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
20
+
21
+
22
class GitSourceFactory:
    """Factory for creating git-based working copies.

    Clones the repository to normalise its remote URI, records the source
    and its files, and attributes files to authors via git blame.
    """

    def __init__(
        self,
        repository: SourceRepository,
        working_copy: GitWorkingCopyProvider,
    ) -> None:
        """Initialize the source factory."""
        self.log = structlog.get_logger(__name__)
        self.repository = repository
        self.working_copy = working_copy
        self.metadata_extractor = GitFileMetadataExtractor()
        self.author_extractor = GitAuthorExtractor(repository)

    async def create(
        self, uri: str, progress_callback: ProgressCallback | None = None
    ) -> Source:
        """Create a git source from a URI."""
        # Default to a callback that discards all progress events.
        if progress_callback is None:
            progress_callback = NullProgressCallback()

        # Normalise the URI by cloning and reading back the remote URL.
        # NOTE(review): this performs a full clone just to normalise the
        # URI; a cheaper approach (e.g. resolving without cloning) may be
        # possible — confirm before changing.
        self.log.debug("Normalising git uri", uri=uri)
        with tempfile.TemporaryDirectory() as temp_dir:
            git.Repo.clone_from(uri, temp_dir)
            uri = git.Repo(temp_dir).remote().url

        # Reuse an existing source registered under the same URI.
        self.log.debug("Checking if source already exists", uri=uri)
        existing = await self.repository.get_by_uri(uri)

        if existing:
            self.log.info("Source already exists, reusing...", source_id=existing.id)
            return existing

        # Clone (or reuse) the working copy for this repository.
        clone_path = await self.working_copy.prepare(uri)

        self.log.debug("Creating source", uri=uri, clone_path=str(clone_path))
        source = await self.repository.create_source(
            Source(
                uri=uri,
                cloned_path=str(clone_path),
                source_type=SourceType.GIT,
            )
        )

        # Respect ignore patterns (e.g. .gitignore) when selecting files.
        ignore_service = IgnoreService(GitIgnorePatternProvider(clone_path))
        files = [
            entry
            for entry in clone_path.rglob("*")
            if entry.is_file() and not ignore_service.should_ignore(entry)
        ]

        self.log.info("Inspecting files", source_id=source.id, num_files=len(files))
        await self._process_files(source, files, progress_callback)

        return source

    async def _process_files(
        self, source: Source, files: list[Path], progress_callback: ProgressCallback
    ) -> None:
        """Process files for a source."""
        total_files = len(files)

        # Announce the start of the operation (0 of N done).
        await progress_callback.on_progress(
            ProgressEvent(
                operation="process_files",
                current=0,
                total=total_files,
                message="Processing files...",
            )
        )

        for index, file_path in enumerate(files, 1):
            # Entries may disappear between listing and processing.
            if not file_path.is_file():
                continue

            # Persist per-file metadata.
            record = await self.metadata_extractor.extract(file_path, source)
            await self.repository.create_file(record)

            # Attribute the file to the authors reported by git blame.
            for author in await self.author_extractor.extract(file_path, source):
                await self.repository.upsert_author_file_mapping(
                    AuthorFileMapping(author_id=author.id, file_id=record.id)
                )

            await progress_callback.on_progress(
                ProgressEvent(
                    operation="process_files",
                    current=index,
                    total=total_files,
                    message=f"Processing {file_path.name}...",
                )
            )

        # Notify completion
        await progress_callback.on_complete("process_files")
@@ -0,0 +1,32 @@
1
+ """Working copy provider for git-based sources."""
2
+
3
+ from pathlib import Path
4
+
5
+ import git
6
+ import structlog
7
+
8
+
9
class GitWorkingCopyProvider:
    """Working copy provider for git-based sources.

    Clones a repository into a deterministic location under ``clone_dir``
    and reuses an existing clone when one is already present.
    """

    def __init__(self, clone_dir: Path) -> None:
        """Initialize the provider."""
        self.clone_dir = clone_dir
        self.log = structlog.get_logger(__name__)

    async def prepare(self, uri: str) -> Path:
        """Prepare a Git working copy.

        Raises:
            ValueError: If cloning fails for any reason other than the
                target directory already holding a previous clone.

        """
        # Derive a unique, filesystem-safe directory name from the URI.
        sanitized = uri.replace("/", "_").replace(":", "_")
        clone_path = self.clone_dir / sanitized
        clone_path.mkdir(parents=True, exist_ok=True)

        try:
            self.log.info("Cloning repository", uri=uri, clone_path=str(clone_path))
            git.Repo.clone_from(uri, clone_path)
        except git.GitCommandError as e:
            # A non-empty target means a previous clone exists; reuse it.
            # Any other git failure is surfaced to the caller.
            if "already exists and is not an empty directory" not in str(e):
                msg = f"Failed to clone repository: {e}"
                raise ValueError(msg) from e
            self.log.info("Repository already exists, reusing...", uri=uri)

        return clone_path
@@ -0,0 +1,127 @@
1
+ """Metadata extraction for cloned sources."""
2
+
3
+ import mimetypes
4
+ from datetime import UTC, datetime
5
+ from hashlib import sha256
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import aiofiles
10
+ import git
11
+ import structlog
12
+
13
+ from kodit.domain.entities import Author, File, Source
14
+
15
+
16
+ class BaseFileMetadataExtractor:
17
+ """Base class for file metadata extraction with common functionality."""
18
+
19
+ async def extract(self, path: Path, source: Source) -> File:
20
+ """Extract metadata from a file."""
21
+ # Get timestamps - to be implemented by subclasses
22
+ created_at, updated_at = await self._get_timestamps(path, source)
23
+
24
+ # Read file content and calculate metadata
25
+ async with aiofiles.open(path, "rb") as f:
26
+ content = await f.read()
27
+ mime_type = mimetypes.guess_type(path)
28
+ sha = sha256(content).hexdigest()
29
+
30
+ return File(
31
+ created_at=created_at,
32
+ updated_at=updated_at,
33
+ source_id=source.id,
34
+ cloned_path=str(path),
35
+ mime_type=mime_type[0]
36
+ if mime_type and mime_type[0]
37
+ else "application/octet-stream",
38
+ uri=path.as_uri(),
39
+ sha256=sha,
40
+ size_bytes=len(content),
41
+ )
42
+
43
+ async def _get_timestamps(
44
+ self, path: Path, source: Source
45
+ ) -> tuple[datetime, datetime]:
46
+ """Get creation and modification timestamps. To be implemented by subclasses."""
47
+ raise NotImplementedError
48
+
49
+
50
+ class GitFileMetadataExtractor(BaseFileMetadataExtractor):
51
+ """Git-specific implementation for extracting file metadata."""
52
+
53
+ async def _get_timestamps(
54
+ self, path: Path, source: Source
55
+ ) -> tuple[datetime, datetime]:
56
+ """Get timestamps from Git history."""
57
+ git_repo = git.Repo(source.cloned_path)
58
+ commits = list(git_repo.iter_commits(paths=str(path), all=True))
59
+
60
+ if commits:
61
+ last_modified_at = commits[0].committed_datetime
62
+ first_modified_at = commits[-1].committed_datetime
63
+ return first_modified_at, last_modified_at
64
+ # Fallback to current time if no commits found
65
+ now = datetime.now(UTC)
66
+ return now, now
67
+
68
+
69
+ class FolderFileMetadataExtractor(BaseFileMetadataExtractor):
70
+ """Folder-specific implementation for extracting file metadata."""
71
+
72
+ async def _get_timestamps(
73
+ self,
74
+ path: Path,
75
+ source: Source, # noqa: ARG002
76
+ ) -> tuple[datetime, datetime]:
77
+ """Get timestamps from file system."""
78
+ stat = path.stat()
79
+ file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
80
+ file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
81
+ return file_created_at, file_modified_at
82
+
83
+
84
class GitAuthorExtractor:
    """Author extractor for Git repositories.

    Resolves the authors of a file via git blame and upserts each one
    through the repository.
    """

    def __init__(self, repository: Any) -> None:
        """Initialize the extractor."""
        self.repository = repository
        self.log = structlog.get_logger(__name__)

    async def extract(self, path: Path, source: Source) -> list[Author]:
        """Extract authors from a Git file."""
        authors: list[Author] = []
        git_repo = git.Repo(source.cloned_path)

        try:
            # Blame the file against HEAD to find contributing commits.
            blames = git_repo.blame("HEAD", str(path))

            # Collect the commit authors reported by blame.
            actors = [
                commit.author
                for blame in blames or []
                for commit in blame
                if isinstance(commit, git.Commit)
            ]

            # Upsert each author that has a usable e-mail address.
            for actor in actors:
                if actor.email:
                    persisted = await self.repository.upsert_author(
                        Author.from_actor(actor)
                    )
                    authors.append(persisted)
        except git.GitCommandError:
            # Untracked files cannot be blamed; report no authors.
            pass

        return authors
120
+
121
+
122
class NoOpAuthorExtractor:
    """No-op author extractor for sources that don't have author information.

    Used by the folder source factory, where no version-control history
    exists from which authorship could be derived.
    """

    async def extract(self, path: Path, source: Source) -> list[Author]:  # noqa: ARG002
        """Return empty list of authors."""
        # Nothing to extract: non-VCS sources carry no authorship metadata.
        return []
@@ -0,0 +1 @@
1
+ """Embedding infrastructure module."""
@@ -0,0 +1,87 @@
1
+ """Factory for creating embedding services with DDD architecture."""
2
+
3
+ from sqlalchemy.ext.asyncio import AsyncSession
4
+
5
+ from kodit.config import AppContext, Endpoint
6
+ from kodit.domain.entities import EmbeddingType
7
+ from kodit.domain.services.embedding_service import EmbeddingDomainService
8
+ from kodit.infrastructure.embedding.embedding_providers.local_embedding_provider import ( # noqa: E501
9
+ CODE,
10
+ LocalEmbeddingProvider,
11
+ )
12
+ from kodit.infrastructure.embedding.embedding_providers.openai_embedding_provider import ( # noqa: E501
13
+ OpenAIEmbeddingProvider,
14
+ )
15
+ from kodit.infrastructure.embedding.local_vector_search_repository import (
16
+ LocalVectorSearchRepository,
17
+ )
18
+ from kodit.infrastructure.embedding.vectorchord_vector_search_repository import (
19
+ TaskName,
20
+ VectorChordVectorSearchRepository,
21
+ )
22
+ from kodit.infrastructure.sqlalchemy.embedding_repository import (
23
+ SqlAlchemyEmbeddingRepository,
24
+ )
25
+ from kodit.log import log_event
26
+
27
+
28
+ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
29
+ """Get the endpoint configuration for the embedding service."""
30
+ return app_context.embedding_endpoint or app_context.default_endpoint or None
31
+
32
+
33
def embedding_domain_service_factory(
    task_name: TaskName, app_context: AppContext, session: AsyncSession
) -> EmbeddingDomainService:
    """Create an embedding domain service.

    Wires together the embedding provider (OpenAI when an endpoint is
    configured, otherwise a local model) and the vector search repository
    (VectorChord or SQLite, per configuration).

    Raises:
        ValueError: If the task name or search provider is unrecognised.

    """
    # Create embedding repository
    embedding_repository = SqlAlchemyEmbeddingRepository(session=session)

    # Choose the embedding provider from the endpoint configuration.
    endpoint = _get_endpoint_configuration(app_context)
    if endpoint and endpoint.type == "openai":
        log_event("kodit.embedding", {"provider": "openai"})
        from openai import AsyncOpenAI

        embedding_provider = OpenAIEmbeddingProvider(
            openai_client=AsyncOpenAI(
                api_key=endpoint.api_key or "default",
                base_url=endpoint.base_url or "https://api.openai.com/v1",
                timeout=10,
                max_retries=2,
            ),
            model_name=endpoint.model or "text-embedding-3-small",
        )
    else:
        log_event("kodit.embedding", {"provider": "local"})
        embedding_provider = LocalEmbeddingProvider(CODE)

    # Create vector search repository based on configuration
    if app_context.default_search.provider == "vectorchord":
        log_event("kodit.database", {"provider": "vectorchord"})
        vector_search_repository = VectorChordVectorSearchRepository(
            task_name, session, embedding_provider
        )
    elif app_context.default_search.provider == "sqlite":
        log_event("kodit.database", {"provider": "sqlite"})
        if task_name == "code":
            embedding_type = EmbeddingType.CODE
        elif task_name == "text":
            embedding_type = EmbeddingType.TEXT
        else:
            # Assign to a variable first so the raise matches the
            # msg-variable error style used elsewhere in this function.
            msg = f"Invalid task name: {task_name}"
            raise ValueError(msg)

        vector_search_repository = LocalVectorSearchRepository(
            embedding_repository=embedding_repository,
            embedding_provider=embedding_provider,
            embedding_type=embedding_type,
        )
    else:
        msg = f"Invalid semantic search provider: {app_context.default_search.provider}"
        raise ValueError(msg)

    # Create and return domain service
    return EmbeddingDomainService(
        embedding_provider=embedding_provider,
        vector_search_repository=vector_search_repository,
    )
@@ -0,0 +1 @@
1
+ """Embedding providers module."""
@@ -0,0 +1,93 @@
1
+ """Utilities for batching embedding requests based on token counts and batch size.
2
+
3
+ This module centralises the logic for splitting a list of ``EmbeddingRequest``
4
+ into smaller sub-batches that respect token limits (and optionally a maximum
5
+ number of items per batch). Both the OpenAI and Local embedding providers use
6
+ this functionality.
7
+ """
8
+
9
+ from tiktoken import Encoding
10
+
11
+ from kodit.domain.value_objects import EmbeddingRequest
12
+
13
+ __all__ = [
14
+ "split_sub_batches",
15
+ ]
16
+
17
+
18
DEFAULT_MAX_TOKENS = 8192  # A conservative upper-bound for most embedding models


def split_sub_batches(
    encoding: Encoding,
    data: list[EmbeddingRequest],
    *,
    max_tokens: int = DEFAULT_MAX_TOKENS,
    batch_size: int | None = None,
) -> list[list[EmbeddingRequest]]:
    """Split *data* into sub-batches constrained by tokens and size.

    Each returned batch holds at most ``max_tokens`` tokens in total and,
    when ``batch_size`` is given, at most that many requests. Requests
    whose text alone exceeds ``max_tokens`` are truncated to the limit (a
    copy is made; the caller's objects are never mutated).

    Parameters
    ----------
    encoding
        A *tiktoken* ``Encoding`` instance capable of counting tokens.
    data
        List of :class:`kodit.domain.value_objects.EmbeddingRequest` objects.
    max_tokens
        Maximum number of tokens allowed in a single batch. Defaults to
        ``DEFAULT_MAX_TOKENS``.
    batch_size
        Optional maximum number of items per batch; ``None`` disables the
        size constraint (the token limit still applies).

    Returns
    -------
    list[list[EmbeddingRequest]]
        A list of batches where each batch is a list of ``EmbeddingRequest``s.

    """
    result: list[list[EmbeddingRequest]] = []
    pending: list[EmbeddingRequest] = []
    pending_tokens = 0

    for original in data:
        tokens = encoding.encode(original.text, disallowed_special=())

        # Oversized single requests are truncated rather than sent as-is,
        # which would make the embedding model reject the whole batch.
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens]
            # Build a fresh request so the caller's object is untouched.
            request = EmbeddingRequest(
                snippet_id=original.snippet_id,
                text=encoding.decode(tokens),
            )
        else:
            request = original

        count = len(tokens)
        too_many_tokens = pending_tokens + count > max_tokens
        too_many_items = batch_size is not None and len(pending) >= batch_size

        if pending and (too_many_tokens or too_many_items):
            # Seal the current batch and start a new one with this item.
            result.append(pending)
            pending = [request]
            pending_tokens = count
        else:
            pending.append(request)
            pending_tokens += count

    if pending:
        result.append(pending)

    return result