kodit 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kodit has been flagged as possibly problematic by the registry.
- kodit/_version.py +2 -2
- kodit/application/factories/code_indexing_factory.py +77 -28
- kodit/application/services/code_indexing_application_service.py +148 -119
- kodit/cli.py +49 -52
- kodit/domain/entities.py +268 -189
- kodit/domain/protocols.py +61 -0
- kodit/domain/services/embedding_service.py +1 -1
- kodit/domain/services/index_query_service.py +66 -0
- kodit/domain/services/index_service.py +323 -0
- kodit/domain/value_objects.py +225 -92
- kodit/infrastructure/cloning/git/working_copy.py +17 -8
- kodit/infrastructure/cloning/metadata.py +37 -67
- kodit/infrastructure/embedding/embedding_factory.py +1 -1
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
- kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
- kodit/infrastructure/git/git_utils.py +1 -63
- kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
- kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/__init__.py +1 -0
- kodit/infrastructure/mappers/index_mapper.py +344 -0
- kodit/infrastructure/snippet_extraction/factories.py +13 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +1 -1
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -1
- kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +1 -1
- kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/entities.py +203 -0
- kodit/infrastructure/sqlalchemy/file_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/index_repository.py +550 -0
- kodit/log.py +4 -1
- kodit/mcp.py +1 -13
- kodit/migrations/env.py +1 -1
- kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +34 -0
- kodit/migrations/versions/4552eb3f23ce_add_summary.py +34 -0
- kodit/utils/__init__.py +1 -0
- kodit/utils/path_utils.py +54 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/METADATA +1 -1
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/RECORD +42 -45
- kodit/domain/enums.py +0 -9
- kodit/domain/repositories.py +0 -128
- kodit/domain/services/ignore_service.py +0 -45
- kodit/domain/services/indexing_service.py +0 -204
- kodit/domain/services/snippet_extraction_service.py +0 -89
- kodit/domain/services/snippet_service.py +0 -211
- kodit/domain/services/source_service.py +0 -85
- kodit/infrastructure/cloning/folder/__init__.py +0 -1
- kodit/infrastructure/cloning/folder/factory.py +0 -128
- kodit/infrastructure/cloning/folder/working_copy.py +0 -38
- kodit/infrastructure/cloning/git/factory.py +0 -153
- kodit/infrastructure/indexing/index_repository.py +0 -273
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
- kodit/infrastructure/sqlalchemy/repository.py +0 -133
- kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -251
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/WHEEL +0 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/entry_points.txt +0 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/licenses/LICENSE +0 -0
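The file list shows the shape of the refactor: kodit/domain/repositories.py and several concrete domain services are removed, while kodit/domain/protocols.py, kodit/domain/services/index_service.py, kodit/infrastructure/mappers/index_mapper.py, and kodit/infrastructure/sqlalchemy/index_repository.py are new. A minimal sketch of what a protocol-based persistence boundary of this kind usually looks like follows; the names (IndexRecord, IndexRepository, the method signatures) are illustrative assumptions, not taken from the wheel. The deleted modules themselves are reproduced in full below.

    # Illustrative only: a protocol-style repository boundary of the kind the
    # 0.3.3 layout suggests (kodit/domain/protocols.py). Names are hypothetical.
    from dataclasses import dataclass
    from typing import Protocol


    @dataclass
    class IndexRecord:
        """Hypothetical stand-in for a domain index entity."""

        id: int
        uri: str


    class IndexRepository(Protocol):
        """What a persistence port might look like; the real protocol may differ."""

        async def get(self, index_id: int) -> IndexRecord | None: ...

        async def save(self, index: IndexRecord) -> IndexRecord: ...

Structural typing via typing.Protocol is typically why such a split is made: an SQLAlchemy-backed implementation can satisfy the domain interface without the domain layer importing any infrastructure code.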
kodit/domain/services/snippet_extraction_service.py (deleted)
@@ -1,89 +0,0 @@
-"""Domain services for snippet extraction."""
-
-from abc import ABC, abstractmethod
-from collections.abc import Mapping
-from pathlib import Path
-
-from kodit.domain.enums import SnippetExtractionStrategy
-from kodit.domain.value_objects import SnippetExtractionRequest, SnippetExtractionResult
-
-
-class LanguageDetectionService(ABC):
-    """Abstract interface for language detection service."""
-
-    @abstractmethod
-    async def detect_language(self, file_path: Path) -> str:
-        """Detect the programming language of a file."""
-
-
-class SnippetExtractor(ABC):
-    """Abstract interface for snippet extraction."""
-
-    @abstractmethod
-    async def extract(self, file_path: Path, language: str) -> list[str]:
-        """Extract snippets from a file."""
-
-
-class SnippetExtractionService(ABC):
-    """Domain service for extracting snippets from source code."""
-
-    @abstractmethod
-    async def extract_snippets(
-        self, request: SnippetExtractionRequest
-    ) -> SnippetExtractionResult:
-        """Extract snippets from a file using the specified strategy."""
-
-
-class SnippetExtractionDomainService:
-    """Domain service implementation for snippet extraction business logic."""
-
-    def __init__(
-        self,
-        language_detector: LanguageDetectionService,
-        snippet_extractors: Mapping[SnippetExtractionStrategy, SnippetExtractor],
-    ) -> None:
-        """Initialize the snippet extraction domain service.
-
-        Args:
-            language_detector: Service for detecting programming languages
-            snippet_extractors: Dictionary mapping strategies to extractor
-                implementations
-
-        """
-        self.language_detector = language_detector
-        self.snippet_extractors = snippet_extractors
-
-    async def extract_snippets(
-        self, request: SnippetExtractionRequest
-    ) -> SnippetExtractionResult:
-        """Extract snippets from a file using the specified strategy.
-
-        Args:
-            request: The snippet extraction request
-
-        Returns:
-            SnippetExtractionResult containing the extracted snippets and
-                detected language
-
-        Raises:
-            ValueError: If the file doesn't exist or strategy is unsupported
-
-        """
-        # Domain logic: validate file exists
-        if not request.file_path.exists():
-            raise ValueError(f"File does not exist: {request.file_path}")
-
-        # Domain logic: detect language
-        language = await self.language_detector.detect_language(request.file_path)
-
-        # Domain logic: choose strategy and extractor
-        if request.strategy not in self.snippet_extractors:
-            raise ValueError(f"Unsupported extraction strategy: {request.strategy}")
-
-        extractor = self.snippet_extractors[request.strategy]
-        snippets = await extractor.extract(request.file_path, language)
-
-        # Domain logic: filter out empty snippets
-        filtered_snippets = [snippet for snippet in snippets if snippet.strip()]
-
-        return SnippetExtractionResult(snippets=filtered_snippets, language=language)
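For reference, the removed SnippetExtractionDomainService boiled down to: validate the file, detect its language, look up an extractor by strategy, and drop empty snippets. Below is a self-contained sketch of that dispatch pattern with a deliberately naive stub extractor; the stub and the local Strategy enum are illustrative, not kodit's tree-sitter based implementation.

    # Standalone sketch of the strategy-dispatch pattern used by the removed
    # service. NaiveExtractor is a stub, not kodit's real extractor.
    import asyncio
    from enum import Enum
    from pathlib import Path


    class Strategy(Enum):
        METHOD_BASED = "method_based"


    class NaiveExtractor:
        async def extract(self, file_path: Path, language: str) -> list[str]:
            # Stub: treat each blank-line-separated block as a "snippet".
            return file_path.read_text().split("\n\n")


    async def extract_snippets(file_path: Path, strategy: Strategy) -> list[str]:
        extractors = {Strategy.METHOD_BASED: NaiveExtractor()}
        if not file_path.exists():
            raise ValueError(f"File does not exist: {file_path}")
        if strategy not in extractors:
            raise ValueError(f"Unsupported extraction strategy: {strategy}")
        snippets = await extractors[strategy].extract(file_path, "python")
        return [s for s in snippets if s.strip()]  # drop empty snippets


    if __name__ == "__main__":
        print(asyncio.run(extract_snippets(Path(__file__), Strategy.METHOD_BASED))[0])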
kodit/domain/services/snippet_service.py (deleted)
@@ -1,211 +0,0 @@
-"""Domain service for snippet operations."""
-
-from pathlib import Path
-from typing import Any
-
-import structlog
-
-from kodit.domain.entities import Snippet
-from kodit.domain.enums import SnippetExtractionStrategy
-from kodit.domain.interfaces import ProgressCallback
-from kodit.domain.repositories import FileRepository, SnippetRepository
-from kodit.domain.services.snippet_extraction_service import (
-    SnippetExtractionDomainService,
-)
-from kodit.domain.value_objects import (
-    MultiSearchRequest,
-    MultiSearchResult,
-    SnippetExtractionRequest,
-    SnippetListItem,
-)
-from kodit.reporting import Reporter
-
-
-class SnippetDomainService:
-    """Domain service for snippet-related operations.
-
-    This service consolidates snippet operations that were previously
-    spread between application services. It handles:
-    - Snippet extraction from files
-    - Snippet persistence
-    - Snippet querying and filtering
-    """
-
-    def __init__(
-        self,
-        snippet_extraction_service: SnippetExtractionDomainService,
-        snippet_repository: SnippetRepository,
-        file_repository: FileRepository,
-    ) -> None:
-        """Initialize the snippet domain service.
-
-        Args:
-            snippet_extraction_service: Service for extracting snippets from files
-            snippet_repository: Repository for snippet persistence
-            file_repository: Repository for file operations
-
-        """
-        self.snippet_extraction_service = snippet_extraction_service
-        self.snippet_repository = snippet_repository
-        self.file_repository = file_repository
-        self.log = structlog.get_logger(__name__)
-
-    async def extract_and_create_snippets(
-        self,
-        index_id: int,
-        strategy: SnippetExtractionStrategy,
-        progress_callback: ProgressCallback | None = None,
-    ) -> list[Snippet]:
-        """Extract snippets from all files in an index and persist them.
-
-        This method combines the extraction and persistence logic that was
-        previously split between domain and application services.
-
-        Args:
-            index_id: The ID of the index to create snippets for
-            strategy: The extraction strategy to use
-            progress_callback: Optional callback for progress reporting
-
-        Returns:
-            List of created Snippet entities with IDs assigned
-
-        """
-        files = await self.file_repository.get_files_for_index(index_id)
-        created_snippets = []
-
-        reporter = Reporter(self.log, progress_callback)
-        await reporter.start(
-            "create_snippets", len(files), "Creating snippets from files..."
-        )
-
-        for i, file in enumerate(files, 1):
-            if not self._should_process_file(file):
-                continue
-
-            try:
-                # Extract snippets from file
-                request = SnippetExtractionRequest(Path(file.cloned_path), strategy)
-                result = await self.snippet_extraction_service.extract_snippets(request)
-
-                # Create and persist snippet entities
-                for snippet_content in result.snippets:
-                    snippet = Snippet(
-                        file_id=file.id,
-                        index_id=index_id,
-                        content=snippet_content,
-                    )
-                    saved_snippet = await self.snippet_repository.save(snippet)
-                    created_snippets.append(saved_snippet)
-
-            except (OSError, ValueError) as e:
-                self.log.debug(
-                    "Skipping file",
-                    file=file.cloned_path,
-                    error=str(e),
-                )
-                continue
-
-            await reporter.step(
-                "create_snippets",
-                current=i,
-                total=len(files),
-                message=f"Processing {file.cloned_path}...",
-            )
-
-        await reporter.done("create_snippets")
-        return created_snippets
-
-    async def get_snippets_for_index(self, index_id: int) -> list[Snippet]:
-        """Get all snippets for a specific index.
-
-        Args:
-            index_id: The ID of the index
-
-        Returns:
-            List of Snippet entities for the index
-
-        """
-        # This delegates to the repository but provides a domain-level interface
-        return list(await self.snippet_repository.get_by_index(index_id))
-
-    async def update_snippet_content(self, snippet_id: int, content: str) -> None:
-        """Update the content of an existing snippet.
-
-        Args:
-            snippet_id: The ID of the snippet to update
-            content: The new content for the snippet
-
-        """
-        # Get the snippet first to ensure it exists
-        snippet = await self.snippet_repository.get(snippet_id)
-        if not snippet:
-            msg = f"Snippet not found: {snippet_id}"
-            raise ValueError(msg)
-
-        # Update the content
-        snippet.content = content
-        await self.snippet_repository.save(snippet)
-
-    async def delete_snippets_for_index(self, index_id: int) -> None:
-        """Delete all snippets for a specific index.
-
-        Args:
-            index_id: The ID of the index
-
-        """
-        await self.snippet_repository.delete_by_index(index_id)
-
-    async def search_snippets(
-        self, request: MultiSearchRequest
-    ) -> list[SnippetListItem]:
-        """Search snippets with filters.
-
-        Args:
-            request: The search request containing filters
-
-        Returns:
-            List of matching snippet items
-
-        """
-        return list(await self.snippet_repository.search(request))
-
-    async def list_snippets(
-        self, file_path: str | None = None, source_uri: str | None = None
-    ) -> list[MultiSearchResult]:
-        """List snippets with optional filtering.
-
-        Args:
-            file_path: Optional file path to filter by
-            source_uri: Optional source URI to filter by
-
-        Returns:
-            List of search results matching the criteria
-
-        """
-        snippet_items = await self.snippet_repository.list_snippets(
-            file_path, source_uri
-        )
-        # Convert SnippetListItem to MultiSearchResult for unified display format
-        return [
-            MultiSearchResult(
-                id=item.id,
-                uri=item.source_uri,
-                content=item.content,
-                original_scores=[],
-            )
-            for item in snippet_items
-        ]
-
-    def _should_process_file(self, file: Any) -> bool:
-        """Check if a file should be processed for snippet extraction.
-
-        Args:
-            file: The file to check
-
-        Returns:
-            True if the file should be processed
-
-        """
-        # Skip unsupported file types
-        mime_blacklist = ["unknown/unknown"]
-        return file.mime_type not in mime_blacklist
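The loop in the removed extract_and_create_snippets is notable mainly for its error policy: files with an "unknown/unknown" MIME type are skipped up front, and per-file OSError/ValueError failures are logged and skipped rather than aborting the whole run. A small standalone sketch of that tolerant loop follows; FileInfo and process() are hypothetical stand-ins for kodit's file entities and extraction call.

    # Sketch of the skip-and-continue policy from the removed service.
    # FileInfo and process() are hypothetical stand-ins.
    from dataclasses import dataclass
    from pathlib import Path


    @dataclass
    class FileInfo:
        cloned_path: str
        mime_type: str


    MIME_BLACKLIST = {"unknown/unknown"}


    async def process(path: Path) -> list[str]:
        return [path.read_text()]  # placeholder for real snippet extraction


    async def extract_all(files: list[FileInfo]) -> list[str]:
        snippets: list[str] = []
        for file in files:
            if file.mime_type in MIME_BLACKLIST:
                continue  # unsupported file type, skip up front
            try:
                snippets.extend(await process(Path(file.cloned_path)))
            except (OSError, ValueError) as exc:
                print(f"Skipping {file.cloned_path}: {exc}")  # log and move on
                continue
        return snippets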
kodit/domain/services/source_service.py (deleted)
@@ -1,85 +0,0 @@
-"""Source service rewritten to work directly with AsyncSession."""
-
-from collections.abc import Callable
-from pathlib import Path
-
-import structlog
-from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
-
-from kodit.domain.entities import Source
-from kodit.domain.interfaces import ProgressCallback
-from kodit.domain.repositories import SourceRepository
-from kodit.infrastructure.cloning.folder.factory import FolderSourceFactory
-from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider
-from kodit.infrastructure.cloning.git.factory import (
-    GitSourceFactory,
-    GitWorkingCopyProvider,
-)
-from kodit.infrastructure.git.git_utils import is_valid_clone_target
-from kodit.infrastructure.sqlalchemy.repository import SqlAlchemySourceRepository
-
-
-class SourceService:
-    """Source service."""
-
-    def __init__(
-        self,
-        clone_dir: Path,
-        session_factory: async_sessionmaker[AsyncSession] | Callable[[], AsyncSession],
-    ) -> None:
-        """Initialize the source service."""
-        self.clone_dir = clone_dir
-        self._session_factory = session_factory
-        self.log = structlog.get_logger(__name__)
-
-    async def get(self, source_id: int) -> Source:
-        """Get a source."""
-        async with self._session_factory() as session:
-            repo = SqlAlchemySourceRepository(session)
-
-            source = await repo.get(source_id)
-            if source is None:
-                raise ValueError(f"Source not found: {source_id}")
-
-            return source
-
-    async def create(
-        self, uri_or_path_like: str, progress_callback: ProgressCallback | None = None
-    ) -> Source:
-        """Create a source."""
-        async with self._session_factory() as session:
-            repo = SqlAlchemySourceRepository(session)
-            git_factory, folder_factory = self._build_factories(repo, session)
-
-            if is_valid_clone_target(uri_or_path_like):
-                source = await git_factory.create(uri_or_path_like, progress_callback)
-            elif Path(uri_or_path_like).is_dir():
-                source = await folder_factory.create(
-                    uri_or_path_like, progress_callback
-                )
-            else:
-                raise ValueError(f"Unsupported source: {uri_or_path_like}")
-
-            # Factories handle their own commits now
-            return source
-
-    def _build_factories(
-        self, repository: SourceRepository, session: AsyncSession
-    ) -> tuple[GitSourceFactory, FolderSourceFactory]:
-        # Git-specific collaborators
-        git_wc = GitWorkingCopyProvider(self.clone_dir)
-        git_factory = GitSourceFactory(
-            repository=repository,
-            working_copy=git_wc,
-            session=session,
-        )
-
-        # Folder-specific collaborators
-        fold_wc = FolderWorkingCopyProvider(self.clone_dir)
-        folder_factory = FolderSourceFactory(
-            repository=repository,
-            working_copy=fold_wc,
-            session=session,
-        )
-
-        return git_factory, folder_factory
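The removed SourceService.create chose a factory by probing the input: anything is_valid_clone_target accepts went to the git factory, an existing directory went to the folder factory, and everything else was rejected. A sketch of that dispatch follows; looks_like_git_target is a simplistic stand-in, since kodit's real check lives in kodit/infrastructure/git/git_utils.py and is not shown in this diff.

    # Dispatch sketch; looks_like_git_target is a simplistic stand-in for
    # kodit's is_valid_clone_target.
    from pathlib import Path


    def looks_like_git_target(uri: str) -> bool:
        return uri.startswith(("git@", "http://", "https://")) or uri.endswith(".git")


    def classify_source(uri_or_path_like: str) -> str:
        if looks_like_git_target(uri_or_path_like):
            return "git"
        if Path(uri_or_path_like).is_dir():
            return "folder"
        raise ValueError(f"Unsupported source: {uri_or_path_like}")


    print(classify_source("."))                                     # folder
    print(classify_source("https://github.com/example/repo.git"))   # git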
kodit/infrastructure/cloning/folder/__init__.py (deleted)
@@ -1 +0,0 @@
-"""Folder cloning infrastructure."""
kodit/infrastructure/cloning/folder/factory.py (deleted)
@@ -1,128 +0,0 @@
-"""Factory for creating folder-based working copies."""
-
-from pathlib import Path
-
-import structlog
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from kodit.domain.entities import AuthorFileMapping, Source, SourceType
-from kodit.domain.interfaces import NullProgressCallback, ProgressCallback
-from kodit.domain.repositories import SourceRepository
-from kodit.domain.value_objects import ProgressEvent
-from kodit.infrastructure.cloning.folder.working_copy import FolderWorkingCopyProvider
-from kodit.infrastructure.cloning.metadata import (
-    FolderFileMetadataExtractor,
-    NoOpAuthorExtractor,
-)
-
-
-class FolderSourceFactory:
-    """Factory for creating folder sources."""
-
-    def __init__(
-        self,
-        repository: SourceRepository,
-        working_copy: FolderWorkingCopyProvider,
-        session: AsyncSession,
-    ) -> None:
-        """Initialize the source factory."""
-        self.log = structlog.get_logger(__name__)
-        self.repository = repository
-        self.working_copy = working_copy
-        self.metadata_extractor = FolderFileMetadataExtractor()
-        self.author_extractor = NoOpAuthorExtractor()
-        self.session = session
-
-    async def create(
-        self, uri: str, progress_callback: ProgressCallback | None = None
-    ) -> Source:
-        """Create a folder source from a path."""
-        # Use null callback if none provided
-        if progress_callback is None:
-            progress_callback = NullProgressCallback()
-
-        directory = Path(uri).expanduser().resolve()
-
-        # Check if source already exists
-        source = await self.repository.get_by_uri(directory.as_uri())
-        if source:
-            self.log.info("Source already exists, reusing...", source_id=source.id)
-            return source
-
-        # Validate directory exists
-        if not directory.exists():
-            msg = f"Folder does not exist: {directory}"
-            raise ValueError(msg)
-
-        # Prepare working copy
-        clone_path = await self.working_copy.prepare(directory.as_uri())
-
-        # Create source record
-        source = await self.repository.save(
-            Source(
-                uri=directory.as_uri(),
-                cloned_path=str(clone_path),
-                source_type=SourceType.FOLDER,
-            )
-        )
-
-        # Commit source creation so we get an ID for foreign key relationships
-        await self.session.commit()
-
-        # Get all files to process
-        files = [f for f in clone_path.rglob("*") if f.is_file()]
-
-        # Process files
-        await self._process_files(source, files, progress_callback)
-
-        # Commit file processing
-        await self.session.commit()
-
-        return source
-
-    async def _process_files(
-        self, source: Source, files: list[Path], progress_callback: ProgressCallback
-    ) -> None:
-        """Process files for a source."""
-        total_files = len(files)
-
-        # Notify start of operation
-        await progress_callback.on_progress(
-            ProgressEvent(
-                operation="process_files",
-                current=0,
-                total=total_files,
-                message="Processing files...",
-            )
-        )
-
-        for i, path in enumerate(files, 1):
-            if not path.is_file():
-                continue
-
-            # Extract file metadata
-            file_record = await self.metadata_extractor.extract(path, source)
-            await self.repository.create_file(file_record)
-
-            # Extract authors
-            authors = await self.author_extractor.extract(path, source)
-            for author in authors:
-                await self.repository.upsert_author_file_mapping(
-                    AuthorFileMapping(
-                        author_id=author.id,
-                        file_id=file_record.id,
-                    )
-                )
-
-            # Update progress
-            await progress_callback.on_progress(
-                ProgressEvent(
-                    operation="process_files",
-                    current=i,
-                    total=total_files,
-                    message=f"Processing {path.name}...",
-                )
-            )
-
-        # Notify completion
-        await progress_callback.on_complete("process_files")
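The factory's progress reporting amounted to three kinds of events: a zero-progress start, a per-file step, and a completion signal. A minimal callback that satisfies that contract might look like the sketch below; ProgressEvent and the callback are redefined locally here rather than imported from kodit, so the real types may differ.

    # Local re-creation of the progress-event pattern used above; the real
    # ProgressCallback/ProgressEvent live in kodit.domain and may differ.
    import asyncio
    from dataclasses import dataclass


    @dataclass
    class ProgressEvent:
        operation: str
        current: int
        total: int
        message: str


    class PrintingProgressCallback:
        async def on_progress(self, event: ProgressEvent) -> None:
            print(f"[{event.operation}] {event.current}/{event.total} {event.message}")

        async def on_complete(self, operation: str) -> None:
            print(f"[{operation}] done")


    async def demo() -> None:
        cb = PrintingProgressCallback()
        items = ["a.py", "b.py", "c.py"]
        await cb.on_progress(ProgressEvent("process_files", 0, len(items), "Processing files..."))
        for i, name in enumerate(items, 1):
            await cb.on_progress(ProgressEvent("process_files", i, len(items), f"Processing {name}..."))
        await cb.on_complete("process_files")


    asyncio.run(demo())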
kodit/infrastructure/cloning/folder/working_copy.py (deleted)
@@ -1,38 +0,0 @@
-"""Working copy provider for folder-based sources."""
-
-import shutil
-from pathlib import Path
-
-
-class FolderWorkingCopyProvider:
-    """Working copy provider for folder-based sources."""
-
-    def __init__(self, clone_dir: Path) -> None:
-        """Initialize the provider."""
-        self.clone_dir = clone_dir
-
-    async def prepare(self, uri: str) -> Path:
-        """Prepare a folder working copy."""
-        # Handle file:// URIs
-        if uri.startswith("file://"):
-            from urllib.parse import urlparse
-
-            parsed = urlparse(uri)
-            directory = Path(parsed.path).expanduser().resolve()
-        else:
-            directory = Path(uri).expanduser().resolve()
-
-        # Clone into a local directory
-        clone_path = self.clone_dir / directory.as_posix().replace("/", "_")
-        clone_path.mkdir(parents=True, exist_ok=True)
-
-        # Copy all files recursively, preserving directory structure, ignoring
-        # hidden files
-        shutil.copytree(
-            directory,
-            clone_path,
-            ignore=shutil.ignore_patterns(".*"),
-            dirs_exist_ok=True,
-        )
-
-        return clone_path
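A worked example of the path handling above: for a clone root of /tmp/kodit/clones and a source folder /home/user/project, directory.as_posix().replace("/", "_") yields _home_user_project, so the working copy lands at /tmp/kodit/clones/_home_user_project, with hidden files excluded by shutil.ignore_patterns(".*"). The snippet below only computes that target path (the clone root and folder URI are example values); it does not copy anything.

    # Path derivation only (no copying), mirroring the prepare() logic above.
    from pathlib import Path
    from urllib.parse import urlparse

    clone_dir = Path("/tmp/kodit/clones")   # example clone root
    uri = "file:///home/user/project"       # example source folder URI

    directory = Path(urlparse(uri).path) if uri.startswith("file://") else Path(uri)
    clone_path = clone_dir / directory.as_posix().replace("/", "_")
    print(clone_path)  # /tmp/kodit/clones/_home_user_project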