kodit-0.2.3-py3-none-any.whl → kodit-0.2.5-py3-none-any.whl
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +363 -0
- kodit/application/services/snippet_application_service.py +143 -0
- kodit/cli.py +105 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +83 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +119 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +133 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +296 -0
- kodit/infrastructure/indexing/indexing_factory.py +111 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
- kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/repository.py +121 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +50 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
- kodit-0.2.5.dist-info/RECORD +99 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -63
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
- kodit/embedding/local_vector_search_service.py +0 -54
- kodit/embedding/vector_search_service.py +0 -38
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
- kodit/enrichment/enrichment_service.py +0 -33
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -338
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.3.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
kodit/infrastructure/cloning/folder/factory.py
@@ -0,0 +1,119 @@
+"""Factory for creating folder-based working copies."""
+
+from pathlib import Path
+
+import structlog
+
+from kodit.domain.entities import AuthorFileMapping, Source, SourceType
+from kodit.domain.interfaces import NullProgressCallback, ProgressCallback
+from kodit.domain.repositories import SourceRepository
+from kodit.domain.value_objects import ProgressEvent
+from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider
+from kodit.infrastructure.cloning.metadata import (
+    FolderFileMetadataExtractor,
+    NoOpAuthorExtractor,
+)
+
+
+class FolderSourceFactory:
+    """Factory for creating folder sources."""
+
+    def __init__(
+        self,
+        repository: SourceRepository,
+        working_copy: FolderWorkingCopyProvider,
+    ) -> None:
+        """Initialize the source factory."""
+        self.log = structlog.get_logger(__name__)
+        self.repository = repository
+        self.working_copy = working_copy
+        self.metadata_extractor = FolderFileMetadataExtractor()
+        self.author_extractor = NoOpAuthorExtractor()
+
+    async def create(
+        self, uri: str, progress_callback: ProgressCallback | None = None
+    ) -> Source:
+        """Create a folder source from a path."""
+        # Use null callback if none provided
+        if progress_callback is None:
+            progress_callback = NullProgressCallback()
+
+        directory = Path(uri).expanduser().resolve()
+
+        # Check if source already exists
+        source = await self.repository.get_by_uri(directory.as_uri())
+        if source:
+            self.log.info("Source already exists, reusing...", source_id=source.id)
+            return source
+
+        # Validate directory exists
+        if not directory.exists():
+            msg = f"Folder does not exist: {directory}"
+            raise ValueError(msg)
+
+        # Prepare working copy
+        clone_path = await self.working_copy.prepare(directory.as_uri())
+
+        # Create source record
+        source = await self.repository.create_source(
+            Source(
+                uri=directory.as_uri(),
+                cloned_path=str(clone_path),
+                source_type=SourceType.FOLDER,
+            )
+        )
+
+        # Get all files to process
+        files = [f for f in clone_path.rglob("*") if f.is_file()]
+
+        # Process files
+        await self._process_files(source, files, progress_callback)
+
+        return source
+
+    async def _process_files(
+        self, source: Source, files: list[Path], progress_callback: ProgressCallback
+    ) -> None:
+        """Process files for a source."""
+        total_files = len(files)
+
+        # Notify start of operation
+        await progress_callback.on_progress(
+            ProgressEvent(
+                operation="process_files",
+                current=0,
+                total=total_files,
+                message="Processing files...",
+            )
+        )
+
+        for i, path in enumerate(files, 1):
+            if not path.is_file():
+                continue
+
+            # Extract file metadata
+            file_record = await self.metadata_extractor.extract(path, source)
+            await self.repository.create_file(file_record)
+
+            # Extract authors
+            authors = await self.author_extractor.extract(path, source)
+            for author in authors:
+                await self.repository.upsert_author_file_mapping(
+                    AuthorFileMapping(
+                        author_id=author.id,
+                        file_id=file_record.id,
+                    )
+                )
+
+            # Update progress
+            await progress_callback.on_progress(
+                ProgressEvent(
+                    operation="process_files",
+                    current=i,
+                    total=total_files,
+                    message=f"Processing {path.name}...",
+                )
+            )
+
+        # Notify completion
+        await progress_callback.on_complete("process_files")
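For orientation, a minimal usage sketch of the new factory. The wiring is hypothetical: `make_source_repository()` stands in for however a concrete `SourceRepository` is obtained (the SQLAlchemy implementation added in this release is not shown here).

    import asyncio
    from pathlib import Path

    from kodit.infrastructure.cloning.folder.factory import FolderSourceFactory
    from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider

    async def main() -> None:
        # make_source_repository() is a hypothetical helper, not kodit API.
        repository = make_source_repository()
        working_copy = FolderWorkingCopyProvider(clone_dir=Path("~/.kodit/clones").expanduser())
        factory = FolderSourceFactory(repository=repository, working_copy=working_copy)
        # Re-running with the same path reuses the existing source record.
        source = await factory.create("~/projects/my-app")

    asyncio.run(main())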
kodit/infrastructure/cloning/folder/working_copy.py
@@ -0,0 +1,38 @@
+"""Working copy provider for folder-based sources."""
+
+import shutil
+from pathlib import Path
+
+
+class FolderWorkingCopyProvider:
+    """Working copy provider for folder-based sources."""
+
+    def __init__(self, clone_dir: Path) -> None:
+        """Initialize the provider."""
+        self.clone_dir = clone_dir
+
+    async def prepare(self, uri: str) -> Path:
+        """Prepare a folder working copy."""
+        # Handle file:// URIs
+        if uri.startswith("file://"):
+            from urllib.parse import urlparse
+
+            parsed = urlparse(uri)
+            directory = Path(parsed.path).expanduser().resolve()
+        else:
+            directory = Path(uri).expanduser().resolve()
+
+        # Clone into a local directory
+        clone_path = self.clone_dir / directory.as_posix().replace("/", "_")
+        clone_path.mkdir(parents=True, exist_ok=True)
+
+        # Copy all files recursively, preserving directory structure, ignoring
+        # hidden files
+        shutil.copytree(
+            directory,
+            clone_path,
+            ignore=shutil.ignore_patterns(".*"),
+            dirs_exist_ok=True,
+        )
+
+        return clone_path
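The working-copy location is the source path flattened into a single directory name under `clone_dir`, and because `copytree` runs with `dirs_exist_ok=True`, repeated `prepare` calls refresh the same copy in place. The naming expression, shown standalone:

    from pathlib import Path

    directory = Path("/home/user/projects/my-app")
    name = directory.as_posix().replace("/", "_")
    assert name == "_home_user_projects_my-app"
    # The copy therefore lands at <clone_dir>/_home_user_projects_my-app.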
kodit/infrastructure/cloning/git/__init__.py
@@ -0,0 +1 @@
+"""Git cloning infrastructure."""
kodit/infrastructure/cloning/git/factory.py
@@ -0,0 +1,133 @@
+"""Factory for creating git-based working copies."""
+
+import tempfile
+from pathlib import Path
+
+import git
+import structlog
+
+from kodit.domain.entities import AuthorFileMapping, Source, SourceType
+from kodit.domain.interfaces import NullProgressCallback, ProgressCallback
+from kodit.domain.repositories import SourceRepository
+from kodit.domain.services.ignore_service import IgnoreService
+from kodit.domain.value_objects import ProgressEvent
+from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
+from kodit.infrastructure.cloning.metadata import (
+    GitAuthorExtractor,
+    GitFileMetadataExtractor,
+)
+from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
+
+
+class GitSourceFactory:
+    """Factory for creating git-based working copies."""
+
+    def __init__(
+        self,
+        repository: SourceRepository,
+        working_copy: GitWorkingCopyProvider,
+    ) -> None:
+        """Initialize the source factory."""
+        self.log = structlog.get_logger(__name__)
+        self.repository = repository
+        self.working_copy = working_copy
+        self.metadata_extractor = GitFileMetadataExtractor()
+        self.author_extractor = GitAuthorExtractor(repository)
+
+    async def create(
+        self, uri: str, progress_callback: ProgressCallback | None = None
+    ) -> Source:
+        """Create a git source from a URI."""
+        # Use null callback if none provided
+        if progress_callback is None:
+            progress_callback = NullProgressCallback()
+
+        # Normalize the URI
+        self.log.debug("Normalising git uri", uri=uri)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            git.Repo.clone_from(uri, temp_dir)
+            remote = git.Repo(temp_dir).remote()
+            uri = remote.url
+
+        # Check if source already exists
+        self.log.debug("Checking if source already exists", uri=uri)
+        source = await self.repository.get_by_uri(uri)
+
+        if source:
+            self.log.info("Source already exists, reusing...", source_id=source.id)
+            return source
+
+        # Prepare working copy
+        clone_path = await self.working_copy.prepare(uri)
+
+        # Create source record
+        self.log.debug("Creating source", uri=uri, clone_path=str(clone_path))
+        source = await self.repository.create_source(
+            Source(
+                uri=uri,
+                cloned_path=str(clone_path),
+                source_type=SourceType.GIT,
+            )
+        )
+
+        # Get files to process using ignore patterns
+        ignore_provider = GitIgnorePatternProvider(clone_path)
+        ignore_service = IgnoreService(ignore_provider)
+        files = [
+            f
+            for f in clone_path.rglob("*")
+            if f.is_file() and not ignore_service.should_ignore(f)
+        ]
+
+        # Process files
+        self.log.info("Inspecting files", source_id=source.id, num_files=len(files))
+        await self._process_files(source, files, progress_callback)
+
+        return source
+
+    async def _process_files(
+        self, source: Source, files: list[Path], progress_callback: ProgressCallback
+    ) -> None:
+        """Process files for a source."""
+        total_files = len(files)
+
+        # Notify start of operation
+        await progress_callback.on_progress(
+            ProgressEvent(
+                operation="process_files",
+                current=0,
+                total=total_files,
+                message="Processing files...",
+            )
+        )
+
+        for i, path in enumerate(files, 1):
+            if not path.is_file():
+                continue
+
+            # Extract file metadata
+            file_record = await self.metadata_extractor.extract(path, source)
+            await self.repository.create_file(file_record)
+
+            # Extract authors
+            authors = await self.author_extractor.extract(path, source)
+            for author in authors:
+                await self.repository.upsert_author_file_mapping(
+                    AuthorFileMapping(
+                        author_id=author.id,
+                        file_id=file_record.id,
+                    )
+                )
+
+            # Update progress
+            await progress_callback.on_progress(
+                ProgressEvent(
+                    operation="process_files",
+                    current=i,
+                    total=total_files,
+                    message=f"Processing {path.name}...",
+                )
+            )
+
+        # Notify completion
+        await progress_callback.on_complete("process_files")
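Both factories report through the same `ProgressCallback` protocol, so the reporting layer can be swapped without touching cloning logic. A sketch of a custom callback, assuming the interface exposes exactly the two async methods invoked above (`on_progress` and `on_complete`):

    from kodit.domain.interfaces import ProgressCallback
    from kodit.domain.value_objects import ProgressEvent

    class PrintProgressCallback(ProgressCallback):
        """Illustrative callback that prints one line per event."""

        async def on_progress(self, event: ProgressEvent) -> None:
            print(f"{event.operation}: {event.current}/{event.total} {event.message}")

        async def on_complete(self, operation: str) -> None:
            print(f"{operation}: done")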
kodit/infrastructure/cloning/git/working_copy.py
@@ -0,0 +1,32 @@
+"""Working copy provider for git-based sources."""
+
+from pathlib import Path
+
+import git
+import structlog
+
+
+class GitWorkingCopyProvider:
+    """Working copy provider for git-based sources."""
+
+    def __init__(self, clone_dir: Path) -> None:
+        """Initialize the provider."""
+        self.clone_dir = clone_dir
+        self.log = structlog.get_logger(__name__)
+
+    async def prepare(self, uri: str) -> Path:
+        """Prepare a Git working copy."""
+        # Create a unique directory name for the clone
+        clone_path = self.clone_dir / uri.replace("/", "_").replace(":", "_")
+        clone_path.mkdir(parents=True, exist_ok=True)
+
+        try:
+            self.log.info("Cloning repository", uri=uri, clone_path=str(clone_path))
+            git.Repo.clone_from(uri, clone_path)
+        except git.GitCommandError as e:
+            if "already exists and is not an empty directory" not in str(e):
+                msg = f"Failed to clone repository: {e}"
+                raise ValueError(msg) from e
+            self.log.info("Repository already exists, reusing...", uri=uri)
+
+        return clone_path
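The clone directory is keyed on the flattened URI, so the same remote always resolves to the same path; a second `prepare` call trips git's "already exists and is not an empty directory" error and falls through to reuse. The mangling itself:

    uri = "https://github.com/example/repo"
    name = uri.replace("/", "_").replace(":", "_")
    assert name == "https___github.com_example_repo"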
kodit/infrastructure/cloning/metadata.py
@@ -0,0 +1,127 @@
+"""Metadata extraction for cloned sources."""
+
+import mimetypes
+from datetime import UTC, datetime
+from hashlib import sha256
+from pathlib import Path
+from typing import Any
+
+import aiofiles
+import git
+import structlog
+
+from kodit.domain.entities import Author, File, Source
+
+
+class BaseFileMetadataExtractor:
+    """Base class for file metadata extraction with common functionality."""
+
+    async def extract(self, path: Path, source: Source) -> File:
+        """Extract metadata from a file."""
+        # Get timestamps - to be implemented by subclasses
+        created_at, updated_at = await self._get_timestamps(path, source)
+
+        # Read file content and calculate metadata
+        async with aiofiles.open(path, "rb") as f:
+            content = await f.read()
+            mime_type = mimetypes.guess_type(path)
+            sha = sha256(content).hexdigest()
+
+        return File(
+            created_at=created_at,
+            updated_at=updated_at,
+            source_id=source.id,
+            cloned_path=str(path),
+            mime_type=mime_type[0]
+            if mime_type and mime_type[0]
+            else "application/octet-stream",
+            uri=path.as_uri(),
+            sha256=sha,
+            size_bytes=len(content),
+        )
+
+    async def _get_timestamps(
+        self, path: Path, source: Source
+    ) -> tuple[datetime, datetime]:
+        """Get creation and modification timestamps. To be implemented by subclasses."""
+        raise NotImplementedError
+
+
+class GitFileMetadataExtractor(BaseFileMetadataExtractor):
+    """Git-specific implementation for extracting file metadata."""
+
+    async def _get_timestamps(
+        self, path: Path, source: Source
+    ) -> tuple[datetime, datetime]:
+        """Get timestamps from Git history."""
+        git_repo = git.Repo(source.cloned_path)
+        commits = list(git_repo.iter_commits(paths=str(path), all=True))
+
+        if commits:
+            last_modified_at = commits[0].committed_datetime
+            first_modified_at = commits[-1].committed_datetime
+            return first_modified_at, last_modified_at
+        # Fallback to current time if no commits found
+        now = datetime.now(UTC)
+        return now, now
+
+
+class FolderFileMetadataExtractor(BaseFileMetadataExtractor):
+    """Folder-specific implementation for extracting file metadata."""
+
+    async def _get_timestamps(
+        self,
+        path: Path,
+        source: Source,  # noqa: ARG002
+    ) -> tuple[datetime, datetime]:
+        """Get timestamps from file system."""
+        stat = path.stat()
+        file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
+        file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
+        return file_created_at, file_modified_at
+
+
+class GitAuthorExtractor:
+    """Author extractor for Git repositories."""
+
+    def __init__(self, repository: Any) -> None:
+        """Initialize the extractor."""
+        self.repository = repository
+        self.log = structlog.get_logger(__name__)
+
+    async def extract(self, path: Path, source: Source) -> list[Author]:
+        """Extract authors from a Git file."""
+        authors: list[Author] = []
+        git_repo = git.Repo(source.cloned_path)
+
+        try:
+            # Get the file's blame
+            blames = git_repo.blame("HEAD", str(path))
+
+            # Extract the blame's authors
+            actors = [
+                commit.author
+                for blame in blames or []
+                for commit in blame
+                if isinstance(commit, git.Commit)
+            ]
+
+            # Get or create the authors in the database
+            for actor in actors:
+                if actor.email:
+                    author = Author.from_actor(actor)
+                    author = await self.repository.upsert_author(author)
+                    authors.append(author)
+        except git.GitCommandError:
+            # Handle cases where file might not be tracked
+            pass
+
+        return authors
+
+
+class NoOpAuthorExtractor:
+    """No-op author extractor for sources that don't have author information."""
+
+    async def extract(self, path: Path, source: Source) -> list[Author]:  # noqa: ARG002
+        """Return empty list of authors."""
+        return []
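`BaseFileMetadataExtractor.extract` is a template method: hashing, sizing, and MIME detection are shared, and subclasses supply only timestamps. A hypothetical third extractor therefore needs just `_get_timestamps`, for example:

    from datetime import UTC, datetime
    from pathlib import Path

    from kodit.domain.entities import Source
    from kodit.infrastructure.cloning.metadata import BaseFileMetadataExtractor

    class FixedTimestampExtractor(BaseFileMetadataExtractor):
        """Hypothetical extractor that stamps every file with the import time."""

        def __init__(self) -> None:
            self._now = datetime.now(UTC)

        async def _get_timestamps(
            self, path: Path, source: Source
        ) -> tuple[datetime, datetime]:
            # Ignores path and source; every file gets the same timestamps.
            return self._now, self._now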
kodit/infrastructure/embedding/__init__.py
@@ -0,0 +1 @@
+"""Embedding infrastructure module."""
kodit/infrastructure/embedding/embedding_factory.py
@@ -0,0 +1,87 @@
+"""Factory for creating embedding services with DDD architecture."""
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from kodit.config import AppContext, Endpoint
+from kodit.domain.entities import EmbeddingType
+from kodit.domain.services.embedding_service import EmbeddingDomainService
+from kodit.infrastructure.embedding.embedding_providers.local_embedding_provider import (  # noqa: E501
+    CODE,
+    LocalEmbeddingProvider,
+)
+from kodit.infrastructure.embedding.embedding_providers.openai_embedding_provider import (  # noqa: E501
+    OpenAIEmbeddingProvider,
+)
+from kodit.infrastructure.embedding.local_vector_search_repository import (
+    LocalVectorSearchRepository,
+)
+from kodit.infrastructure.embedding.vectorchord_vector_search_repository import (
+    TaskName,
+    VectorChordVectorSearchRepository,
+)
+from kodit.infrastructure.sqlalchemy.embedding_repository import (
+    SqlAlchemyEmbeddingRepository,
+)
+from kodit.log import log_event
+
+
+def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
+    """Get the endpoint configuration for the embedding service."""
+    return app_context.embedding_endpoint or app_context.default_endpoint or None
+
+
+def embedding_domain_service_factory(
+    task_name: TaskName, app_context: AppContext, session: AsyncSession
+) -> EmbeddingDomainService:
+    """Create an embedding domain service."""
+    # Create embedding repository
+    embedding_repository = SqlAlchemyEmbeddingRepository(session=session)
+
+    # Create embedding provider
+    endpoint = _get_endpoint_configuration(app_context)
+    if endpoint and endpoint.type == "openai":
+        log_event("kodit.embedding", {"provider": "openai"})
+        from openai import AsyncOpenAI
+
+        embedding_provider = OpenAIEmbeddingProvider(
+            openai_client=AsyncOpenAI(
+                api_key=endpoint.api_key or "default",
+                base_url=endpoint.base_url or "https://api.openai.com/v1",
+                timeout=10,
+                max_retries=2,
+            ),
+            model_name=endpoint.model or "text-embedding-3-small",
+        )
+    else:
+        log_event("kodit.embedding", {"provider": "local"})
+        embedding_provider = LocalEmbeddingProvider(CODE)
+
+    # Create vector search repository based on configuration
+    if app_context.default_search.provider == "vectorchord":
+        log_event("kodit.database", {"provider": "vectorchord"})
+        vector_search_repository = VectorChordVectorSearchRepository(
+            task_name, session, embedding_provider
+        )
+    elif app_context.default_search.provider == "sqlite":
+        log_event("kodit.database", {"provider": "sqlite"})
+        if task_name == "code":
+            embedding_type = EmbeddingType.CODE
+        elif task_name == "text":
+            embedding_type = EmbeddingType.TEXT
+        else:
+            raise ValueError(f"Invalid task name: {task_name}")
+
+        vector_search_repository = LocalVectorSearchRepository(
+            embedding_repository=embedding_repository,
+            embedding_provider=embedding_provider,
+            embedding_type=embedding_type,
+        )
+    else:
+        msg = f"Invalid semantic search provider: {app_context.default_search.provider}"
+        raise ValueError(msg)
+
+    # Create and return domain service
+    return EmbeddingDomainService(
+        embedding_provider=embedding_provider,
+        vector_search_repository=vector_search_repository,
+    )
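A call-site sketch, assuming an `AppContext` and an open `AsyncSession` are already in hand (both come from elsewhere in kodit and are not part of this diff):

    # app_context and session are provided by the caller; task_name is
    # "code" or "text", matching the branches above.
    service = embedding_domain_service_factory(
        task_name="code",
        app_context=app_context,
        session=session,
    )

With no OpenAI endpoint configured, the factory falls back to the local provider; with the "sqlite" search provider it maps the task name onto `EmbeddingType.CODE` or `EmbeddingType.TEXT`.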
kodit/infrastructure/embedding/embedding_providers/__init__.py
@@ -0,0 +1 @@
+"""Embedding providers module."""
kodit/infrastructure/embedding/embedding_providers/batching.py
@@ -0,0 +1,93 @@
+"""Utilities for batching embedding requests based on token counts and batch size.
+
+This module centralises the logic for splitting a list of ``EmbeddingRequest``
+into smaller sub-batches that respect token limits (and optionally a maximum
+number of items per batch). Both the OpenAI and Local embedding providers use
+this functionality.
+"""
+
+from tiktoken import Encoding
+
+from kodit.domain.value_objects import EmbeddingRequest
+
+__all__ = [
+    "split_sub_batches",
+]
+
+
+DEFAULT_MAX_TOKENS = 8192  # A conservative upper-bound for most embedding models
+
+
+def split_sub_batches(
+    encoding: Encoding,
+    data: list[EmbeddingRequest],
+    *,
+    max_tokens: int = DEFAULT_MAX_TOKENS,
+    batch_size: int | None = None,
+) -> list[list[EmbeddingRequest]]:
+    """Split *data* into sub-batches constrained by tokens and size.
+
+    Parameters
+    ----------
+    encoding
+        A *tiktoken* ``Encoding`` instance capable of counting tokens.
+    data
+        List of :class:`kodit.domain.value_objects.EmbeddingRequest` objects.
+    max_tokens
+        Maximum number of tokens allowed in a single batch. Defaults to
+        ``DEFAULT_MAX_TOKENS``.
+    batch_size
+        Optional maximum number of items per batch. If *None*, no explicit
+        size constraint is applied (token limit still applies).
+
+    Returns
+    -------
+    list[list[EmbeddingRequest]]
+        A list of batches where each batch is a list of ``EmbeddingRequest``s.
+
+    """
+    batches: list[list[EmbeddingRequest]] = []
+    current_batch: list[EmbeddingRequest] = []
+    current_tokens = 0
+
+    for original_item in data:
+        # ------------------------------------------------------------------
+        # Ensure **individual** requests never exceed the token limit.
+        # If they do, we *truncate* them rather than sending an oversized
+        # request to the embedding model (which would raise a 400 error).
+        # ------------------------------------------------------------------
+
+        token_ids = encoding.encode(original_item.text, disallowed_special=())
+        if len(token_ids) > max_tokens:
+            # Keep only the first *max_tokens* tokens and decode back to text.
+            token_ids = token_ids[:max_tokens]
+            truncated_text = encoding.decode(token_ids)
+
+            # Create a *new* EmbeddingRequest to avoid mutating the caller's
+            # objects (side-effects can be surprising).
+            item = EmbeddingRequest(
+                snippet_id=original_item.snippet_id,
+                text=truncated_text,
+            )
+        else:
+            item = original_item
+
+        item_tokens = len(token_ids)
+
+        # Determine whether adding the item would violate limits for the
+        # *current* batch. Note: size constraint is optional.
+        token_overflow = current_tokens + item_tokens > max_tokens
+        size_overflow = batch_size is not None and len(current_batch) >= batch_size
+
+        if (token_overflow or size_overflow) and current_batch:
+            batches.append(current_batch)
+            current_batch = [item]
+            current_tokens = item_tokens
+        else:
+            current_batch.append(item)
+            current_tokens += item_tokens
+
+    if current_batch:
+        batches.append(current_batch)
+
+    return batches
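A short sketch of the helper in use; `tiktoken.get_encoding("cl100k_base")` is a standard encoding choice, and `EmbeddingRequest(snippet_id=..., text=...)` matches the constructor used above (integer IDs assumed for illustration):

    import tiktoken

    from kodit.domain.value_objects import EmbeddingRequest
    from kodit.infrastructure.embedding.embedding_providers.batching import (
        split_sub_batches,
    )

    encoding = tiktoken.get_encoding("cl100k_base")
    requests = [
        EmbeddingRequest(snippet_id=i, text="def handler():\n    return 42")
        for i in range(100)
    ]

    # Token limit plus an explicit cap of 10 items per batch.
    batches = split_sub_batches(encoding, requests, batch_size=10)
    assert all(len(batch) <= 10 for batch in batches)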