kodit 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/code_indexing_factory.py +56 -29
- kodit/application/services/code_indexing_application_service.py +152 -118
- kodit/cli.py +14 -41
- kodit/domain/entities.py +268 -197
- kodit/domain/protocols.py +61 -0
- kodit/domain/services/embedding_service.py +1 -1
- kodit/domain/services/index_query_service.py +66 -0
- kodit/domain/services/index_service.py +282 -0
- kodit/domain/value_objects.py +143 -65
- kodit/infrastructure/cloning/git/working_copy.py +17 -8
- kodit/infrastructure/cloning/metadata.py +37 -67
- kodit/infrastructure/embedding/embedding_factory.py +1 -1
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
- kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
- kodit/infrastructure/git/git_utils.py +1 -63
- kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
- kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/__init__.py +1 -0
- kodit/infrastructure/mappers/index_mapper.py +344 -0
- kodit/infrastructure/slicing/__init__.py +1 -0
- kodit/infrastructure/slicing/language_detection_service.py +18 -0
- kodit/infrastructure/slicing/slicer.py +894 -0
- kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/entities.py +203 -0
- kodit/infrastructure/sqlalchemy/index_repository.py +579 -0
- kodit/mcp.py +0 -7
- kodit/migrations/env.py +1 -1
- kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +36 -0
- kodit/migrations/versions/4552eb3f23ce_add_summary.py +4 -4
- kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +24 -16
- kodit/migrations/versions/85155663351e_initial.py +64 -48
- kodit/migrations/versions/c3f5137d30f5_index_all_the_things.py +20 -14
- kodit/utils/__init__.py +1 -0
- kodit/utils/path_utils.py +54 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/METADATA +9 -4
- kodit-0.3.4.dist-info/RECORD +89 -0
- kodit/domain/enums.py +0 -9
- kodit/domain/repositories.py +0 -128
- kodit/domain/services/ignore_service.py +0 -45
- kodit/domain/services/indexing_service.py +0 -204
- kodit/domain/services/snippet_extraction_service.py +0 -89
- kodit/domain/services/snippet_service.py +0 -215
- kodit/domain/services/source_service.py +0 -85
- kodit/infrastructure/cloning/folder/__init__.py +0 -1
- kodit/infrastructure/cloning/folder/factory.py +0 -128
- kodit/infrastructure/cloning/folder/working_copy.py +0 -38
- kodit/infrastructure/cloning/git/factory.py +0 -153
- kodit/infrastructure/indexing/index_repository.py +0 -286
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
- kodit/infrastructure/snippet_extraction/__init__.py +0 -1
- kodit/infrastructure/snippet_extraction/language_detection_service.py +0 -39
- kodit/infrastructure/snippet_extraction/languages/csharp.scm +0 -12
- kodit/infrastructure/snippet_extraction/languages/go.scm +0 -26
- kodit/infrastructure/snippet_extraction/languages/java.scm +0 -12
- kodit/infrastructure/snippet_extraction/languages/javascript.scm +0 -24
- kodit/infrastructure/snippet_extraction/languages/python.scm +0 -22
- kodit/infrastructure/snippet_extraction/languages/typescript.scm +0 -25
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +0 -67
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -45
- kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +0 -182
- kodit/infrastructure/sqlalchemy/file_repository.py +0 -78
- kodit/infrastructure/sqlalchemy/repository.py +0 -133
- kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -259
- kodit-0.3.2.dist-info/RECORD +0 -103
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/WHEEL +0 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/entry_points.txt +0 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,59 +4,55 @@ import mimetypes
|
|
|
4
4
|
from datetime import UTC, datetime
|
|
5
5
|
from hashlib import sha256
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any
|
|
8
7
|
|
|
9
8
|
import aiofiles
|
|
10
9
|
import git
|
|
11
|
-
import
|
|
10
|
+
from pydantic import AnyUrl
|
|
12
11
|
|
|
13
|
-
from kodit.domain.entities import Author, File
|
|
12
|
+
from kodit.domain.entities import Author, File
|
|
13
|
+
from kodit.domain.value_objects import FileProcessingStatus, SourceType
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
class
|
|
17
|
-
"""
|
|
16
|
+
class FileMetadataExtractor:
|
|
17
|
+
"""File metadata extractor."""
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
def __init__(self, source_type: SourceType) -> None:
|
|
20
|
+
"""Initialize the extractor."""
|
|
21
|
+
self.source_type = source_type
|
|
22
|
+
|
|
23
|
+
async def extract(self, file_path: Path) -> File:
|
|
20
24
|
"""Extract metadata from a file."""
|
|
21
|
-
|
|
22
|
-
|
|
25
|
+
if self.source_type == SourceType.GIT:
|
|
26
|
+
created_at, updated_at = await self._get_git_timestamps(file_path)
|
|
27
|
+
else:
|
|
28
|
+
created_at, updated_at = await self._get_file_system_timestamps(file_path)
|
|
23
29
|
|
|
24
30
|
# Read file content and calculate metadata
|
|
25
|
-
async with aiofiles.open(
|
|
31
|
+
async with aiofiles.open(file_path, "rb") as f:
|
|
26
32
|
content = await f.read()
|
|
27
|
-
mime_type = mimetypes.guess_type(
|
|
33
|
+
mime_type = mimetypes.guess_type(file_path)
|
|
28
34
|
sha = sha256(content).hexdigest()
|
|
35
|
+
if self.source_type == SourceType.GIT:
|
|
36
|
+
authors = await self._extract_git_authors(file_path)
|
|
37
|
+
else:
|
|
38
|
+
authors = []
|
|
29
39
|
|
|
30
40
|
return File(
|
|
31
41
|
created_at=created_at,
|
|
32
42
|
updated_at=updated_at,
|
|
33
|
-
|
|
34
|
-
cloned_path=str(path),
|
|
43
|
+
uri=AnyUrl(file_path.resolve().absolute().as_uri()),
|
|
35
44
|
mime_type=mime_type[0]
|
|
36
45
|
if mime_type and mime_type[0]
|
|
37
46
|
else "application/octet-stream",
|
|
38
|
-
uri=path.as_uri(),
|
|
39
47
|
sha256=sha,
|
|
40
|
-
|
|
41
|
-
|
|
48
|
+
authors=authors,
|
|
49
|
+
file_processing_status=FileProcessingStatus.ADDED,
|
|
42
50
|
)
|
|
43
51
|
|
|
44
|
-
async def
|
|
45
|
-
self, path: Path, source: Source
|
|
46
|
-
) -> tuple[datetime, datetime]:
|
|
47
|
-
"""Get creation and modification timestamps. To be implemented by subclasses."""
|
|
48
|
-
raise NotImplementedError
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class GitFileMetadataExtractor(BaseFileMetadataExtractor):
|
|
52
|
-
"""Git-specific implementation for extracting file metadata."""
|
|
53
|
-
|
|
54
|
-
async def _get_timestamps(
|
|
55
|
-
self, path: Path, source: Source
|
|
56
|
-
) -> tuple[datetime, datetime]:
|
|
52
|
+
async def _get_git_timestamps(self, file_path: Path) -> tuple[datetime, datetime]:
|
|
57
53
|
"""Get timestamps from Git history."""
|
|
58
|
-
git_repo = git.Repo(
|
|
59
|
-
commits = list(git_repo.iter_commits(paths=str(
|
|
54
|
+
git_repo = git.Repo(file_path.parent, search_parent_directories=True)
|
|
55
|
+
commits = list(git_repo.iter_commits(paths=str(file_path), all=True))
|
|
60
56
|
|
|
61
57
|
if commits:
|
|
62
58
|
last_modified_at = commits[0].committed_datetime
|
|
@@ -66,38 +62,23 @@ class GitFileMetadataExtractor(BaseFileMetadataExtractor):
|
|
|
66
62
|
now = datetime.now(UTC)
|
|
67
63
|
return now, now
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
class FolderFileMetadataExtractor(BaseFileMetadataExtractor):
|
|
71
|
-
"""Folder-specific implementation for extracting file metadata."""
|
|
72
|
-
|
|
73
|
-
async def _get_timestamps(
|
|
65
|
+
async def _get_file_system_timestamps(
|
|
74
66
|
self,
|
|
75
|
-
|
|
76
|
-
source: Source, # noqa: ARG002
|
|
67
|
+
file_path: Path,
|
|
77
68
|
) -> tuple[datetime, datetime]:
|
|
78
69
|
"""Get timestamps from file system."""
|
|
79
|
-
stat =
|
|
70
|
+
stat = file_path.stat()
|
|
80
71
|
file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
|
|
81
72
|
file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
|
|
82
73
|
return file_created_at, file_modified_at
|
|
83
74
|
|
|
84
|
-
|
|
85
|
-
class GitAuthorExtractor:
|
|
86
|
-
"""Author extractor for Git repositories."""
|
|
87
|
-
|
|
88
|
-
def __init__(self, repository: Any) -> None:
|
|
89
|
-
"""Initialize the extractor."""
|
|
90
|
-
self.repository = repository
|
|
91
|
-
self.log = structlog.get_logger(__name__)
|
|
92
|
-
|
|
93
|
-
async def extract(self, path: Path, source: Source) -> list[Author]:
|
|
75
|
+
async def _extract_git_authors(self, file_path: Path) -> list[Author]:
|
|
94
76
|
"""Extract authors from a Git file."""
|
|
95
|
-
|
|
96
|
-
git_repo = git.Repo(source.cloned_path)
|
|
77
|
+
git_repo = git.Repo(file_path.parent, search_parent_directories=True)
|
|
97
78
|
|
|
98
79
|
try:
|
|
99
80
|
# Get the file's blame
|
|
100
|
-
blames = git_repo.blame("HEAD", str(
|
|
81
|
+
blames = git_repo.blame("HEAD", str(file_path))
|
|
101
82
|
|
|
102
83
|
# Extract the blame's authors
|
|
103
84
|
actors = [
|
|
@@ -108,21 +89,10 @@ class GitAuthorExtractor:
|
|
|
108
89
|
]
|
|
109
90
|
|
|
110
91
|
# Get or create the authors in the database
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
authors.append(author)
|
|
92
|
+
return [
|
|
93
|
+
Author(name=actor.name or "", email=actor.email or "")
|
|
94
|
+
for actor in actors
|
|
95
|
+
]
|
|
116
96
|
except git.GitCommandError:
|
|
117
97
|
# Handle cases where file might not be tracked
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
return authors
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
class NoOpAuthorExtractor:
|
|
124
|
-
"""No-op author extractor for sources that don't have author information."""
|
|
125
|
-
|
|
126
|
-
async def extract(self, path: Path, source: Source) -> list[Author]: # noqa: ARG002
|
|
127
|
-
"""Return empty list of authors."""
|
|
128
|
-
return []
|
|
98
|
+
return []
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
4
4
|
|
|
5
5
|
from kodit.config import AppContext, Endpoint
|
|
6
|
-
from kodit.domain.entities import EmbeddingType
|
|
7
6
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
8
7
|
from kodit.infrastructure.embedding.embedding_providers.local_embedding_provider import ( # noqa: E501
|
|
9
8
|
CODE,
|
|
@@ -22,6 +21,7 @@ from kodit.infrastructure.embedding.vectorchord_vector_search_repository import
|
|
|
22
21
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
23
22
|
SqlAlchemyEmbeddingRepository,
|
|
24
23
|
)
|
|
24
|
+
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
25
25
|
from kodit.log import log_event
|
|
26
26
|
|
|
27
27
|
|
|
@@ -4,7 +4,6 @@ from collections.abc import AsyncGenerator
|
|
|
4
4
|
|
|
5
5
|
import structlog
|
|
6
6
|
|
|
7
|
-
from kodit.domain.entities import Embedding, EmbeddingType
|
|
8
7
|
from kodit.domain.services.embedding_service import (
|
|
9
8
|
EmbeddingProvider,
|
|
10
9
|
VectorSearchRepository,
|
|
@@ -19,6 +18,7 @@ from kodit.domain.value_objects import (
|
|
|
19
18
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
20
19
|
SqlAlchemyEmbeddingRepository,
|
|
21
20
|
)
|
|
21
|
+
from kodit.infrastructure.sqlalchemy.entities import Embedding, EmbeddingType
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class LocalVectorSearchRepository(VectorSearchRepository):
|
|
@@ -7,7 +7,6 @@ import structlog
|
|
|
7
7
|
from sqlalchemy import Result, TextClause, text
|
|
8
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
9
|
|
|
10
|
-
from kodit.domain.entities import EmbeddingType
|
|
11
10
|
from kodit.domain.services.embedding_service import (
|
|
12
11
|
EmbeddingProvider,
|
|
13
12
|
VectorSearchRepository,
|
|
@@ -19,6 +18,7 @@ from kodit.domain.value_objects import (
|
|
|
19
18
|
SearchRequest,
|
|
20
19
|
SearchResult,
|
|
21
20
|
)
|
|
21
|
+
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
22
22
|
|
|
23
23
|
# SQL Queries
|
|
24
24
|
CREATE_VCHORD_EXTENSION = """
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Null enrichment provider for testing."""
|
|
2
2
|
|
|
3
|
+
import re
|
|
3
4
|
from collections.abc import AsyncGenerator
|
|
4
5
|
|
|
5
6
|
from kodit.domain.services.enrichment_service import EnrichmentProvider
|
|
@@ -12,14 +13,7 @@ class NullEnrichmentProvider(EnrichmentProvider):
|
|
|
12
13
|
async def enrich(
|
|
13
14
|
self, requests: list[EnrichmentRequest]
|
|
14
15
|
) -> AsyncGenerator[EnrichmentResponse, None]:
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
Args:
|
|
18
|
-
requests: List of enrichment requests.
|
|
19
|
-
|
|
20
|
-
Yields:
|
|
21
|
-
Empty enrichment responses.
|
|
22
|
-
|
|
23
|
-
"""
|
|
16
|
+
"""Only keep alphabetic characters."""
|
|
24
17
|
for request in requests:
|
|
25
|
-
|
|
18
|
+
response = re.sub(r"[^a-zA-Z]", " ", request.text)
|
|
19
|
+
yield EnrichmentResponse(snippet_id=request.snippet_id, text=response)
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
"""Git utilities for infrastructure operations."""
|
|
2
2
|
|
|
3
3
|
import tempfile
|
|
4
|
-
from urllib.parse import urlparse, urlunparse
|
|
5
4
|
|
|
6
5
|
import git
|
|
7
6
|
|
|
8
7
|
|
|
8
|
+
# FUTURE: move to clone dir
|
|
9
9
|
def is_valid_clone_target(target: str) -> bool:
|
|
10
10
|
"""Return True if the target is clonable.
|
|
11
11
|
|
|
@@ -23,65 +23,3 @@ def is_valid_clone_target(target: str) -> bool:
|
|
|
23
23
|
return False
|
|
24
24
|
else:
|
|
25
25
|
return True
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def sanitize_git_url(url: str) -> str:
|
|
29
|
-
"""Remove credentials from a git URL while preserving the rest of the URL structure.
|
|
30
|
-
|
|
31
|
-
This function handles various git URL formats:
|
|
32
|
-
- HTTPS URLs with username:password@host
|
|
33
|
-
- HTTPS URLs with username@host (no password)
|
|
34
|
-
- SSH URLs (left unchanged)
|
|
35
|
-
- File URLs (left unchanged)
|
|
36
|
-
|
|
37
|
-
Args:
|
|
38
|
-
url: The git URL that may contain credentials.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
The sanitized URL with credentials removed.
|
|
42
|
-
|
|
43
|
-
Examples:
|
|
44
|
-
>>> sanitize_git_url("https://phil:token@dev.azure.com/org/project/_git/repo")
|
|
45
|
-
"https://dev.azure.com/org/project/_git/repo"
|
|
46
|
-
>>> sanitize_git_url("https://username@github.com/user/repo.git")
|
|
47
|
-
"https://github.com/user/repo.git"
|
|
48
|
-
>>> sanitize_git_url("git@github.com:user/repo.git")
|
|
49
|
-
"git@github.com:user/repo.git"
|
|
50
|
-
|
|
51
|
-
"""
|
|
52
|
-
# Handle SSH URLs (they don't have credentials in the URL format)
|
|
53
|
-
if url.startswith(("git@", "ssh://")):
|
|
54
|
-
return url
|
|
55
|
-
|
|
56
|
-
# Handle file URLs
|
|
57
|
-
if url.startswith("file://"):
|
|
58
|
-
return url
|
|
59
|
-
|
|
60
|
-
try:
|
|
61
|
-
# Parse the URL
|
|
62
|
-
parsed = urlparse(url)
|
|
63
|
-
|
|
64
|
-
# If there are no credentials, return the URL as-is
|
|
65
|
-
if not parsed.username:
|
|
66
|
-
return url
|
|
67
|
-
|
|
68
|
-
# Reconstruct the URL without credentials
|
|
69
|
-
# Keep scheme, netloc (without username/password), path, params, query, fragment
|
|
70
|
-
sanitized_netloc = parsed.hostname
|
|
71
|
-
if parsed.port:
|
|
72
|
-
sanitized_netloc = f"{parsed.hostname}:{parsed.port}"
|
|
73
|
-
|
|
74
|
-
return urlunparse(
|
|
75
|
-
(
|
|
76
|
-
parsed.scheme,
|
|
77
|
-
sanitized_netloc,
|
|
78
|
-
parsed.path,
|
|
79
|
-
parsed.params,
|
|
80
|
-
parsed.query,
|
|
81
|
-
parsed.fragment,
|
|
82
|
-
)
|
|
83
|
-
)
|
|
84
|
-
|
|
85
|
-
except Exception: # noqa: BLE001
|
|
86
|
-
# If URL parsing fails, return the original URL
|
|
87
|
-
return url
|
|
@@ -5,11 +5,10 @@ from pathlib import Path
|
|
|
5
5
|
import git
|
|
6
6
|
import pathspec
|
|
7
7
|
|
|
8
|
-
from kodit.domain.services.ignore_service import IgnorePatternProvider
|
|
9
8
|
from kodit.infrastructure.git.git_utils import is_valid_clone_target
|
|
10
9
|
|
|
11
10
|
|
|
12
|
-
class GitIgnorePatternProvider
|
|
11
|
+
class GitIgnorePatternProvider:
|
|
13
12
|
"""Ignore pattern provider for git repositories."""
|
|
14
13
|
|
|
15
14
|
def __init__(self, base_dir: Path) -> None:
|
|
@@ -11,7 +11,6 @@ from kodit.application.factories.code_indexing_factory import (
|
|
|
11
11
|
create_code_indexing_application_service,
|
|
12
12
|
)
|
|
13
13
|
from kodit.config import AppContext
|
|
14
|
-
from kodit.domain.services.source_service import SourceService
|
|
15
14
|
|
|
16
15
|
|
|
17
16
|
class AutoIndexingService:
|
|
@@ -44,29 +43,20 @@ class AutoIndexingService:
|
|
|
44
43
|
async def _index_sources(self, sources: list[str]) -> None:
|
|
45
44
|
"""Index all configured sources in the background."""
|
|
46
45
|
async with self.session_factory() as session:
|
|
47
|
-
source_service = SourceService(
|
|
48
|
-
clone_dir=self.app_context.get_clone_dir(),
|
|
49
|
-
session_factory=lambda: session,
|
|
50
|
-
)
|
|
51
|
-
|
|
52
46
|
service = create_code_indexing_application_service(
|
|
53
47
|
app_context=self.app_context,
|
|
54
48
|
session=session,
|
|
55
|
-
source_service=source_service,
|
|
56
49
|
)
|
|
57
50
|
|
|
58
51
|
for source in sources:
|
|
59
52
|
try:
|
|
60
53
|
self.log.info("Auto-indexing source", source=source)
|
|
61
54
|
|
|
62
|
-
# Create source
|
|
63
|
-
s = await source_service.create(source)
|
|
64
|
-
|
|
65
55
|
# Create index
|
|
66
|
-
index = await service.
|
|
56
|
+
index = await service.create_index_from_uri(source)
|
|
67
57
|
|
|
68
58
|
# Run indexing (without progress callback for background mode)
|
|
69
|
-
await service.run_index(index
|
|
59
|
+
await service.run_index(index, progress_callback=None)
|
|
70
60
|
|
|
71
61
|
self.log.info("Successfully auto-indexed source", source=source)
|
|
72
62
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Mapping layer for converting between domain and infrastructure models."""
|