kodit-0.3.1-py3-none-any.whl → kodit-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (57)
  1. kodit/_version.py +2 -2
  2. kodit/application/factories/code_indexing_factory.py +77 -28
  3. kodit/application/services/code_indexing_application_service.py +148 -119
  4. kodit/cli.py +49 -52
  5. kodit/domain/entities.py +268 -189
  6. kodit/domain/protocols.py +61 -0
  7. kodit/domain/services/embedding_service.py +1 -1
  8. kodit/domain/services/index_query_service.py +66 -0
  9. kodit/domain/services/index_service.py +323 -0
  10. kodit/domain/value_objects.py +225 -92
  11. kodit/infrastructure/cloning/git/working_copy.py +17 -8
  12. kodit/infrastructure/cloning/metadata.py +37 -67
  13. kodit/infrastructure/embedding/embedding_factory.py +1 -1
  14. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  15. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
  16. kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
  17. kodit/infrastructure/git/git_utils.py +1 -63
  18. kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
  19. kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
  20. kodit/infrastructure/indexing/fusion_service.py +1 -1
  21. kodit/infrastructure/mappers/__init__.py +1 -0
  22. kodit/infrastructure/mappers/index_mapper.py +344 -0
  23. kodit/infrastructure/snippet_extraction/factories.py +13 -0
  24. kodit/infrastructure/snippet_extraction/language_detection_service.py +1 -1
  25. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -1
  26. kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +1 -1
  27. kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
  28. kodit/infrastructure/sqlalchemy/entities.py +203 -0
  29. kodit/infrastructure/sqlalchemy/file_repository.py +1 -1
  30. kodit/infrastructure/sqlalchemy/index_repository.py +550 -0
  31. kodit/log.py +4 -1
  32. kodit/mcp.py +1 -13
  33. kodit/migrations/env.py +1 -1
  34. kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +34 -0
  35. kodit/migrations/versions/4552eb3f23ce_add_summary.py +34 -0
  36. kodit/utils/__init__.py +1 -0
  37. kodit/utils/path_utils.py +54 -0
  38. {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/METADATA +1 -1
  39. {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/RECORD +42 -45
  40. kodit/domain/enums.py +0 -9
  41. kodit/domain/repositories.py +0 -128
  42. kodit/domain/services/ignore_service.py +0 -45
  43. kodit/domain/services/indexing_service.py +0 -204
  44. kodit/domain/services/snippet_extraction_service.py +0 -89
  45. kodit/domain/services/snippet_service.py +0 -211
  46. kodit/domain/services/source_service.py +0 -85
  47. kodit/infrastructure/cloning/folder/__init__.py +0 -1
  48. kodit/infrastructure/cloning/folder/factory.py +0 -128
  49. kodit/infrastructure/cloning/folder/working_copy.py +0 -38
  50. kodit/infrastructure/cloning/git/factory.py +0 -153
  51. kodit/infrastructure/indexing/index_repository.py +0 -273
  52. kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
  53. kodit/infrastructure/sqlalchemy/repository.py +0 -133
  54. kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -251
  55. {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/WHEEL +0 -0
  56. {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/entry_points.txt +0 -0
  57. {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/licenses/LICENSE +0 -0
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  import git
7
7
  import structlog
8
8
 
9
- from kodit.infrastructure.git.git_utils import sanitize_git_url
9
+ from kodit.domain.entities import WorkingCopy
10
10
 
11
11
 
12
12
  class GitWorkingCopyProvider:
@@ -17,15 +17,17 @@ class GitWorkingCopyProvider:
17
17
  self.clone_dir = clone_dir
18
18
  self.log = structlog.get_logger(__name__)
19
19
 
20
+ def get_clone_path(self, uri: str) -> Path:
21
+ """Get the clone path for a Git working copy."""
22
+ sanitized_uri = WorkingCopy.sanitize_git_url(uri)
23
+ dir_hash = hashlib.sha256(str(sanitized_uri).encode("utf-8")).hexdigest()[:16]
24
+ dir_name = f"repo-{dir_hash}"
25
+ return self.clone_dir / dir_name
26
+
20
27
  async def prepare(self, uri: str) -> Path:
21
28
  """Prepare a Git working copy."""
22
- # Sanitize the URI for directory name to prevent credential leaks
23
- sanitized_uri = sanitize_git_url(uri)
24
-
25
- # Use a repeatable, short sha256 hash of the sanitized URI for the directory
26
- dir_hash = hashlib.sha256(sanitized_uri.encode("utf-8")).hexdigest()[:16]
27
- dir_name = f"repo-{dir_hash}"
28
- clone_path = self.clone_dir / dir_name
29
+ sanitized_uri = WorkingCopy.sanitize_git_url(uri)
30
+ clone_path = self.get_clone_path(uri)
29
31
  clone_path.mkdir(parents=True, exist_ok=True)
30
32
 
31
33
  try:
@@ -41,3 +43,10 @@ class GitWorkingCopyProvider:
41
43
  self.log.info("Repository already exists, reusing...", uri=sanitized_uri)
42
44
 
43
45
  return clone_path
46
+
47
+ async def sync(self, uri: str) -> Path:
48
+ """Refresh a Git working copy."""
49
+ clone_path = self.get_clone_path(uri)
50
+ repo = git.Repo(clone_path)
51
+ repo.remotes.origin.pull()
52
+ return clone_path
@@ -4,59 +4,55 @@ import mimetypes
4
4
  from datetime import UTC, datetime
5
5
  from hashlib import sha256
6
6
  from pathlib import Path
7
- from typing import Any
8
7
 
9
8
  import aiofiles
10
9
  import git
11
- import structlog
10
+ from pydantic import AnyUrl
12
11
 
13
- from kodit.domain.entities import Author, File, Source
12
+ from kodit.domain.entities import Author, File
13
+ from kodit.domain.value_objects import FileProcessingStatus, SourceType
14
14
 
15
15
 
16
- class BaseFileMetadataExtractor:
17
- """Base class for file metadata extraction with common functionality."""
16
+ class FileMetadataExtractor:
17
+ """File metadata extractor."""
18
18
 
19
- async def extract(self, path: Path, source: Source) -> File:
19
+ def __init__(self, source_type: SourceType) -> None:
20
+ """Initialize the extractor."""
21
+ self.source_type = source_type
22
+
23
+ async def extract(self, file_path: Path) -> File:
20
24
  """Extract metadata from a file."""
21
- # Get timestamps - to be implemented by subclasses
22
- created_at, updated_at = await self._get_timestamps(path, source)
25
+ if self.source_type == SourceType.GIT:
26
+ created_at, updated_at = await self._get_git_timestamps(file_path)
27
+ else:
28
+ created_at, updated_at = await self._get_file_system_timestamps(file_path)
23
29
 
24
30
  # Read file content and calculate metadata
25
- async with aiofiles.open(path, "rb") as f:
31
+ async with aiofiles.open(file_path, "rb") as f:
26
32
  content = await f.read()
27
- mime_type = mimetypes.guess_type(path)
33
+ mime_type = mimetypes.guess_type(file_path)
28
34
  sha = sha256(content).hexdigest()
35
+ if self.source_type == SourceType.GIT:
36
+ authors = await self._extract_git_authors(file_path)
37
+ else:
38
+ authors = []
29
39
 
30
40
  return File(
31
41
  created_at=created_at,
32
42
  updated_at=updated_at,
33
- source_id=source.id,
34
- cloned_path=str(path),
43
+ uri=AnyUrl(file_path.resolve().absolute().as_uri()),
35
44
  mime_type=mime_type[0]
36
45
  if mime_type and mime_type[0]
37
46
  else "application/octet-stream",
38
- uri=path.as_uri(),
39
47
  sha256=sha,
40
- size_bytes=len(content),
41
- extension=path.suffix.removeprefix(".").lower(),
48
+ authors=authors,
49
+ file_processing_status=FileProcessingStatus.ADDED,
42
50
  )
43
51
 
44
- async def _get_timestamps(
45
- self, path: Path, source: Source
46
- ) -> tuple[datetime, datetime]:
47
- """Get creation and modification timestamps. To be implemented by subclasses."""
48
- raise NotImplementedError
49
-
50
-
51
- class GitFileMetadataExtractor(BaseFileMetadataExtractor):
52
- """Git-specific implementation for extracting file metadata."""
53
-
54
- async def _get_timestamps(
55
- self, path: Path, source: Source
56
- ) -> tuple[datetime, datetime]:
52
+ async def _get_git_timestamps(self, file_path: Path) -> tuple[datetime, datetime]:
57
53
  """Get timestamps from Git history."""
58
- git_repo = git.Repo(source.cloned_path)
59
- commits = list(git_repo.iter_commits(paths=str(path), all=True))
54
+ git_repo = git.Repo(file_path.parent, search_parent_directories=True)
55
+ commits = list(git_repo.iter_commits(paths=str(file_path), all=True))
60
56
 
61
57
  if commits:
62
58
  last_modified_at = commits[0].committed_datetime
@@ -66,38 +62,23 @@ class GitFileMetadataExtractor(BaseFileMetadataExtractor):
66
62
  now = datetime.now(UTC)
67
63
  return now, now
68
64
 
69
-
70
- class FolderFileMetadataExtractor(BaseFileMetadataExtractor):
71
- """Folder-specific implementation for extracting file metadata."""
72
-
73
- async def _get_timestamps(
65
+ async def _get_file_system_timestamps(
74
66
  self,
75
- path: Path,
76
- source: Source, # noqa: ARG002
67
+ file_path: Path,
77
68
  ) -> tuple[datetime, datetime]:
78
69
  """Get timestamps from file system."""
79
- stat = path.stat()
70
+ stat = file_path.stat()
80
71
  file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
81
72
  file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
82
73
  return file_created_at, file_modified_at
83
74
 
84
-
85
- class GitAuthorExtractor:
86
- """Author extractor for Git repositories."""
87
-
88
- def __init__(self, repository: Any) -> None:
89
- """Initialize the extractor."""
90
- self.repository = repository
91
- self.log = structlog.get_logger(__name__)
92
-
93
- async def extract(self, path: Path, source: Source) -> list[Author]:
75
+ async def _extract_git_authors(self, file_path: Path) -> list[Author]:
94
76
  """Extract authors from a Git file."""
95
- authors: list[Author] = []
96
- git_repo = git.Repo(source.cloned_path)
77
+ git_repo = git.Repo(file_path.parent, search_parent_directories=True)
97
78
 
98
79
  try:
99
80
  # Get the file's blame
100
- blames = git_repo.blame("HEAD", str(path))
81
+ blames = git_repo.blame("HEAD", str(file_path))
101
82
 
102
83
  # Extract the blame's authors
103
84
  actors = [
@@ -108,21 +89,10 @@ class GitAuthorExtractor:
108
89
  ]
109
90
 
110
91
  # Get or create the authors in the database
111
- for actor in actors:
112
- if actor.email:
113
- author = Author.from_actor(actor)
114
- author = await self.repository.upsert_author(author)
115
- authors.append(author)
92
+ return [
93
+ Author(name=actor.name or "", email=actor.email or "")
94
+ for actor in actors
95
+ ]
116
96
  except git.GitCommandError:
117
97
  # Handle cases where file might not be tracked
118
- pass
119
-
120
- return authors
121
-
122
-
123
- class NoOpAuthorExtractor:
124
- """No-op author extractor for sources that don't have author information."""
125
-
126
- async def extract(self, path: Path, source: Source) -> list[Author]: # noqa: ARG002
127
- """Return empty list of authors."""
128
- return []
98
+ return []
@@ -3,7 +3,6 @@
3
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
4
 
5
5
  from kodit.config import AppContext, Endpoint
6
- from kodit.domain.entities import EmbeddingType
7
6
  from kodit.domain.services.embedding_service import EmbeddingDomainService
8
7
  from kodit.infrastructure.embedding.embedding_providers.local_embedding_provider import ( # noqa: E501
9
8
  CODE,
@@ -22,6 +21,7 @@ from kodit.infrastructure.embedding.vectorchord_vector_search_repository import
22
21
  from kodit.infrastructure.sqlalchemy.embedding_repository import (
23
22
  SqlAlchemyEmbeddingRepository,
24
23
  )
24
+ from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
25
25
  from kodit.log import log_event
26
26
 
27
27
 
@@ -4,7 +4,6 @@ from collections.abc import AsyncGenerator
4
4
 
5
5
  import structlog
6
6
 
7
- from kodit.domain.entities import Embedding, EmbeddingType
8
7
  from kodit.domain.services.embedding_service import (
9
8
  EmbeddingProvider,
10
9
  VectorSearchRepository,
@@ -19,6 +18,7 @@ from kodit.domain.value_objects import (
19
18
  from kodit.infrastructure.sqlalchemy.embedding_repository import (
20
19
  SqlAlchemyEmbeddingRepository,
21
20
  )
21
+ from kodit.infrastructure.sqlalchemy.entities import Embedding, EmbeddingType
22
22
 
23
23
 
24
24
  class LocalVectorSearchRepository(VectorSearchRepository):
@@ -7,7 +7,6 @@ import structlog
7
7
  from sqlalchemy import Result, TextClause, text
8
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
9
 
10
- from kodit.domain.entities import EmbeddingType
11
10
  from kodit.domain.services.embedding_service import (
12
11
  EmbeddingProvider,
13
12
  VectorSearchRepository,
@@ -19,6 +18,7 @@ from kodit.domain.value_objects import (
19
18
  SearchRequest,
20
19
  SearchResult,
21
20
  )
21
+ from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
22
22
 
23
23
  # SQL Queries
24
24
  CREATE_VCHORD_EXTENSION = """
@@ -1,5 +1,6 @@
1
1
  """Null enrichment provider for testing."""
2
2
 
3
+ import re
3
4
  from collections.abc import AsyncGenerator
4
5
 
5
6
  from kodit.domain.services.enrichment_service import EnrichmentProvider
@@ -12,14 +13,7 @@ class NullEnrichmentProvider(EnrichmentProvider):
12
13
  async def enrich(
13
14
  self, requests: list[EnrichmentRequest]
14
15
  ) -> AsyncGenerator[EnrichmentResponse, None]:
15
- """Return empty responses for all requests.
16
-
17
- Args:
18
- requests: List of enrichment requests.
19
-
20
- Yields:
21
- Empty enrichment responses.
22
-
23
- """
16
+ """Only keep alphabetic characters."""
24
17
  for request in requests:
25
- yield EnrichmentResponse(snippet_id=request.snippet_id, text="")
18
+ response = re.sub(r"[^a-zA-Z]", " ", request.text)
19
+ yield EnrichmentResponse(snippet_id=request.snippet_id, text=response)
@@ -1,11 +1,11 @@
1
1
  """Git utilities for infrastructure operations."""
2
2
 
3
3
  import tempfile
4
- from urllib.parse import urlparse, urlunparse
5
4
 
6
5
  import git
7
6
 
8
7
 
8
+ # FUTURE: move to clone dir
9
9
  def is_valid_clone_target(target: str) -> bool:
10
10
  """Return True if the target is clonable.
11
11
 
@@ -23,65 +23,3 @@ def is_valid_clone_target(target: str) -> bool:
23
23
  return False
24
24
  else:
25
25
  return True
26
-
27
-
28
- def sanitize_git_url(url: str) -> str:
29
- """Remove credentials from a git URL while preserving the rest of the URL structure.
30
-
31
- This function handles various git URL formats:
32
- - HTTPS URLs with username:password@host
33
- - HTTPS URLs with username@host (no password)
34
- - SSH URLs (left unchanged)
35
- - File URLs (left unchanged)
36
-
37
- Args:
38
- url: The git URL that may contain credentials.
39
-
40
- Returns:
41
- The sanitized URL with credentials removed.
42
-
43
- Examples:
44
- >>> sanitize_git_url("https://phil:token@dev.azure.com/org/project/_git/repo")
45
- "https://dev.azure.com/org/project/_git/repo"
46
- >>> sanitize_git_url("https://username@github.com/user/repo.git")
47
- "https://github.com/user/repo.git"
48
- >>> sanitize_git_url("git@github.com:user/repo.git")
49
- "git@github.com:user/repo.git"
50
-
51
- """
52
- # Handle SSH URLs (they don't have credentials in the URL format)
53
- if url.startswith(("git@", "ssh://")):
54
- return url
55
-
56
- # Handle file URLs
57
- if url.startswith("file://"):
58
- return url
59
-
60
- try:
61
- # Parse the URL
62
- parsed = urlparse(url)
63
-
64
- # If there are no credentials, return the URL as-is
65
- if not parsed.username:
66
- return url
67
-
68
- # Reconstruct the URL without credentials
69
- # Keep scheme, netloc (without username/password), path, params, query, fragment
70
- sanitized_netloc = parsed.hostname
71
- if parsed.port:
72
- sanitized_netloc = f"{parsed.hostname}:{parsed.port}"
73
-
74
- return urlunparse(
75
- (
76
- parsed.scheme,
77
- sanitized_netloc,
78
- parsed.path,
79
- parsed.params,
80
- parsed.query,
81
- parsed.fragment,
82
- )
83
- )
84
-
85
- except Exception: # noqa: BLE001
86
- # If URL parsing fails, return the original URL
87
- return url
@@ -5,11 +5,10 @@ from pathlib import Path
5
5
  import git
6
6
  import pathspec
7
7
 
8
- from kodit.domain.services.ignore_service import IgnorePatternProvider
9
8
  from kodit.infrastructure.git.git_utils import is_valid_clone_target
10
9
 
11
10
 
12
- class GitIgnorePatternProvider(IgnorePatternProvider):
11
+ class GitIgnorePatternProvider:
13
12
  """Ignore pattern provider for git repositories."""
14
13
 
15
14
  def __init__(self, base_dir: Path) -> None:
@@ -11,7 +11,6 @@ from kodit.application.factories.code_indexing_factory import (
11
11
  create_code_indexing_application_service,
12
12
  )
13
13
  from kodit.config import AppContext
14
- from kodit.domain.services.source_service import SourceService
15
14
 
16
15
 
17
16
  class AutoIndexingService:
@@ -44,29 +43,20 @@ class AutoIndexingService:
44
43
  async def _index_sources(self, sources: list[str]) -> None:
45
44
  """Index all configured sources in the background."""
46
45
  async with self.session_factory() as session:
47
- source_service = SourceService(
48
- clone_dir=self.app_context.get_clone_dir(),
49
- session_factory=lambda: session,
50
- )
51
-
52
46
  service = create_code_indexing_application_service(
53
47
  app_context=self.app_context,
54
48
  session=session,
55
- source_service=source_service,
56
49
  )
57
50
 
58
51
  for source in sources:
59
52
  try:
60
53
  self.log.info("Auto-indexing source", source=source)
61
54
 
62
- # Create source
63
- s = await source_service.create(source)
64
-
65
55
  # Create index
66
- index = await service.create_index(s.id)
56
+ index = await service.create_index_from_uri(source)
67
57
 
68
58
  # Run indexing (without progress callback for background mode)
69
- await service.run_index(index.id, progress_callback=None)
59
+ await service.run_index(index, progress_callback=None)
70
60
 
71
61
  self.log.info("Successfully auto-indexed source", source=source)
72
62
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  from collections import defaultdict
4
4
 
5
- from kodit.domain.services.indexing_service import FusionService
5
+ from kodit.domain.services.index_query_service import FusionService
6
6
  from kodit.domain.value_objects import FusionRequest, FusionResult
7
7
 
8
8
 
@@ -0,0 +1 @@
1
+ """Mapping layer for converting between domain and infrastructure models."""