kodit 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic.
- kodit/_version.py +2 -2
- kodit/app.py +51 -23
- kodit/application/factories/reporting_factory.py +6 -2
- kodit/application/factories/server_factory.py +353 -0
- kodit/application/services/code_search_application_service.py +144 -0
- kodit/application/services/commit_indexing_application_service.py +700 -0
- kodit/application/services/indexing_worker_service.py +13 -44
- kodit/application/services/queue_service.py +24 -3
- kodit/application/services/reporting.py +0 -2
- kodit/application/services/sync_scheduler.py +15 -31
- kodit/cli.py +2 -753
- kodit/cli_utils.py +2 -9
- kodit/config.py +4 -97
- kodit/database.py +38 -1
- kodit/domain/enrichments/__init__.py +1 -0
- kodit/domain/enrichments/architecture/__init__.py +1 -0
- kodit/domain/enrichments/architecture/architecture.py +20 -0
- kodit/domain/enrichments/architecture/physical/__init__.py +1 -0
- kodit/domain/enrichments/architecture/physical/discovery_notes.py +14 -0
- kodit/domain/enrichments/architecture/physical/formatter.py +11 -0
- kodit/domain/enrichments/architecture/physical/physical.py +17 -0
- kodit/domain/enrichments/development/__init__.py +1 -0
- kodit/domain/enrichments/development/development.py +18 -0
- kodit/domain/enrichments/development/snippet/__init__.py +1 -0
- kodit/domain/enrichments/development/snippet/snippet.py +21 -0
- kodit/domain/enrichments/enricher.py +17 -0
- kodit/domain/enrichments/enrichment.py +39 -0
- kodit/domain/enrichments/request.py +12 -0
- kodit/domain/enrichments/response.py +11 -0
- kodit/domain/enrichments/usage/__init__.py +1 -0
- kodit/domain/enrichments/usage/api_docs.py +19 -0
- kodit/domain/enrichments/usage/usage.py +18 -0
- kodit/domain/{entities.py → entities/__init__.py} +50 -195
- kodit/domain/entities/git.py +190 -0
- kodit/domain/factories/__init__.py +1 -0
- kodit/domain/factories/git_repo_factory.py +76 -0
- kodit/domain/protocols.py +264 -64
- kodit/domain/services/bm25_service.py +5 -1
- kodit/domain/services/embedding_service.py +3 -0
- kodit/domain/services/enrichment_service.py +9 -30
- kodit/domain/services/git_repository_service.py +429 -0
- kodit/domain/services/git_service.py +300 -0
- kodit/domain/services/physical_architecture_service.py +182 -0
- kodit/domain/services/task_status_query_service.py +2 -2
- kodit/domain/value_objects.py +87 -135
- kodit/infrastructure/api/client/__init__.py +0 -2
- kodit/infrastructure/api/v1/__init__.py +0 -4
- kodit/infrastructure/api/v1/dependencies.py +92 -46
- kodit/infrastructure/api/v1/routers/__init__.py +0 -6
- kodit/infrastructure/api/v1/routers/commits.py +352 -0
- kodit/infrastructure/api/v1/routers/queue.py +2 -2
- kodit/infrastructure/api/v1/routers/repositories.py +282 -0
- kodit/infrastructure/api/v1/routers/search.py +31 -14
- kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
- kodit/infrastructure/api/v1/schemas/commit.py +96 -0
- kodit/infrastructure/api/v1/schemas/context.py +2 -0
- kodit/infrastructure/api/v1/schemas/enrichment.py +29 -0
- kodit/infrastructure/api/v1/schemas/repository.py +128 -0
- kodit/infrastructure/api/v1/schemas/search.py +12 -9
- kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
- kodit/infrastructure/api/v1/schemas/tag.py +31 -0
- kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
- kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
- kodit/infrastructure/cloning/git/git_python_adaptor.py +534 -0
- kodit/infrastructure/cloning/git/working_copy.py +1 -1
- kodit/infrastructure/embedding/embedding_factory.py +3 -2
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
- kodit/infrastructure/enricher/__init__.py +1 -0
- kodit/infrastructure/enricher/enricher_factory.py +53 -0
- kodit/infrastructure/{enrichment/litellm_enrichment_provider.py → enricher/litellm_enricher.py} +36 -56
- kodit/infrastructure/{enrichment/local_enrichment_provider.py → enricher/local_enricher.py} +19 -24
- kodit/infrastructure/enricher/null_enricher.py +36 -0
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/enrichment_mapper.py +83 -0
- kodit/infrastructure/mappers/git_mapper.py +193 -0
- kodit/infrastructure/mappers/snippet_mapper.py +104 -0
- kodit/infrastructure/mappers/task_mapper.py +5 -44
- kodit/infrastructure/physical_architecture/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/detectors/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/detectors/docker_compose_detector.py +336 -0
- kodit/infrastructure/physical_architecture/formatters/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/formatters/narrative_formatter.py +149 -0
- kodit/infrastructure/reporting/log_progress.py +8 -5
- kodit/infrastructure/reporting/telemetry_progress.py +21 -0
- kodit/infrastructure/slicing/api_doc_extractor.py +836 -0
- kodit/infrastructure/slicing/ast_analyzer.py +1128 -0
- kodit/infrastructure/slicing/slicer.py +87 -421
- kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
- kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +118 -0
- kodit/infrastructure/sqlalchemy/entities.py +402 -158
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +274 -0
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +346 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +262 -0
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +268 -0
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +479 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
- kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
- kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
- kodit/mcp.py +12 -30
- kodit/migrations/env.py +1 -0
- kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
- kodit/migrations/versions/19f8c7faf8b9_add_generic_enrichment_type.py +260 -0
- kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
- kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
- kodit/py.typed +0 -0
- kodit/utils/dump_config.py +361 -0
- kodit/utils/dump_openapi.py +6 -4
- kodit/utils/path_utils.py +29 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/METADATA +3 -3
- kodit-0.5.1.dist-info/RECORD +168 -0
- kodit/application/factories/code_indexing_factory.py +0 -195
- kodit/application/services/auto_indexing_service.py +0 -99
- kodit/application/services/code_indexing_application_service.py +0 -410
- kodit/domain/services/index_query_service.py +0 -70
- kodit/domain/services/index_service.py +0 -269
- kodit/infrastructure/api/client/index_client.py +0 -57
- kodit/infrastructure/api/v1/routers/indexes.py +0 -164
- kodit/infrastructure/api/v1/schemas/index.py +0 -101
- kodit/infrastructure/bm25/bm25_factory.py +0 -28
- kodit/infrastructure/cloning/__init__.py +0 -1
- kodit/infrastructure/cloning/metadata.py +0 -98
- kodit/infrastructure/enrichment/__init__.py +0 -1
- kodit/infrastructure/enrichment/enrichment_factory.py +0 -52
- kodit/infrastructure/enrichment/null_enrichment_provider.py +0 -19
- kodit/infrastructure/mappers/index_mapper.py +0 -345
- kodit/infrastructure/reporting/tdqm_progress.py +0 -38
- kodit/infrastructure/slicing/language_detection_service.py +0 -18
- kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
- kodit-0.4.3.dist-info/RECORD +0 -125
- /kodit/infrastructure/{enrichment → enricher}/utils.py +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/WHEEL +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/entry_points.txt +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/licenses/LICENSE +0 -0
--- a/kodit/infrastructure/cloning/metadata.py
+++ /dev/null
@@ -1,98 +0,0 @@
-"""Metadata extraction for cloned sources."""
-
-import mimetypes
-from datetime import UTC, datetime
-from hashlib import sha256
-from pathlib import Path
-
-import aiofiles
-import git
-from pydantic import AnyUrl
-
-from kodit.domain.entities import Author, File
-from kodit.domain.value_objects import FileProcessingStatus, SourceType
-
-
-class FileMetadataExtractor:
-    """File metadata extractor."""
-
-    def __init__(self, source_type: SourceType) -> None:
-        """Initialize the extractor."""
-        self.source_type = source_type
-
-    async def extract(self, file_path: Path) -> File:
-        """Extract metadata from a file."""
-        if self.source_type == SourceType.GIT:
-            created_at, updated_at = await self._get_git_timestamps(file_path)
-        else:
-            created_at, updated_at = await self._get_file_system_timestamps(file_path)
-
-        # Read file content and calculate metadata
-        async with aiofiles.open(file_path, "rb") as f:
-            content = await f.read()
-        mime_type = mimetypes.guess_type(file_path)
-        sha = sha256(content).hexdigest()
-        if self.source_type == SourceType.GIT:
-            authors = await self._extract_git_authors(file_path)
-        else:
-            authors = []
-
-        return File(
-            created_at=created_at,
-            updated_at=updated_at,
-            uri=AnyUrl(file_path.resolve().absolute().as_uri()),
-            mime_type=mime_type[0]
-            if mime_type and mime_type[0]
-            else "application/octet-stream",
-            sha256=sha,
-            authors=authors,
-            file_processing_status=FileProcessingStatus.ADDED,
-        )
-
-    async def _get_git_timestamps(self, file_path: Path) -> tuple[datetime, datetime]:
-        """Get timestamps from Git history."""
-        git_repo = git.Repo(file_path.parent, search_parent_directories=True)
-        commits = list(git_repo.iter_commits(paths=str(file_path), all=True))
-
-        if commits:
-            last_modified_at = commits[0].committed_datetime
-            first_modified_at = commits[-1].committed_datetime
-            return first_modified_at, last_modified_at
-        # Fallback to current time if no commits found
-        now = datetime.now(UTC)
-        return now, now
-
-    async def _get_file_system_timestamps(
-        self,
-        file_path: Path,
-    ) -> tuple[datetime, datetime]:
-        """Get timestamps from file system."""
-        stat = file_path.stat()
-        file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
-        file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
-        return file_created_at, file_modified_at
-
-    async def _extract_git_authors(self, file_path: Path) -> list[Author]:
-        """Extract authors from a Git file."""
-        git_repo = git.Repo(file_path.parent, search_parent_directories=True)
-
-        try:
-            # Get the file's blame
-            blames = git_repo.blame("HEAD", str(file_path))
-
-            # Extract the blame's authors
-            actors = [
-                commit.author
-                for blame in blames or []
-                for commit in blame
-                if isinstance(commit, git.Commit)
-            ]
-
-            # Get or create the authors in the database
-            return [
-                Author(name=actor.name or "", email=actor.email or "")
-                for actor in actors
-            ]
-        except git.GitCommandError:
-            # Handle cases where file might not be tracked
-            return []
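For reference, the removed FileMetadataExtractor was driven roughly as in the sketch below. This is a hypothetical usage example reconstructed only from the signatures in the hunk above (the calling code is not part of this diff); it assumes the target file lives inside a Git checkout, since the GIT source type uses GitPython for timestamps and blame.

# Hypothetical usage sketch of the removed extractor (kodit 0.4.3), not package code.
import asyncio
from pathlib import Path

from kodit.domain.value_objects import SourceType
from kodit.infrastructure.cloning.metadata import FileMetadataExtractor


async def main() -> None:
    extractor = FileMetadataExtractor(source_type=SourceType.GIT)
    # extract() returns a domain File carrying timestamps, sha256, mime type and authors.
    file = await extractor.extract(Path("README.md"))
    print(file.sha256, file.mime_type, [a.name for a in file.authors])


asyncio.run(main())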
--- a/kodit/infrastructure/enrichment/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Infrastructure enrichment module."""
--- a/kodit/infrastructure/enrichment/enrichment_factory.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""Enrichment factory for creating enrichment domain services."""
-
-from kodit.config import AppContext, Endpoint
-from kodit.domain.services.enrichment_service import (
-    EnrichmentDomainService,
-    EnrichmentProvider,
-)
-from kodit.infrastructure.enrichment.litellm_enrichment_provider import (
-    LiteLLMEnrichmentProvider,
-)
-from kodit.infrastructure.enrichment.local_enrichment_provider import (
-    LocalEnrichmentProvider,
-)
-from kodit.log import log_event
-
-
-def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
-    """Get the endpoint configuration for the enrichment service.
-
-    Args:
-        app_context: The application context.
-
-    Returns:
-        The endpoint configuration or None.
-
-    """
-    return app_context.enrichment_endpoint or None
-
-
-def enrichment_domain_service_factory(
-    app_context: AppContext,
-) -> EnrichmentDomainService:
-    """Create an enrichment domain service.
-
-    Args:
-        app_context: The application context.
-
-    Returns:
-        An enrichment domain service instance.
-
-    """
-    endpoint = _get_endpoint_configuration(app_context)
-
-    enrichment_provider: EnrichmentProvider | None = None
-    if endpoint:
-        log_event("kodit.enrichment", {"provider": "litellm"})
-        enrichment_provider = LiteLLMEnrichmentProvider(endpoint=endpoint)
-    else:
-        log_event("kodit.enrichment", {"provider": "local"})
-        enrichment_provider = LocalEnrichmentProvider()
-
-    return EnrichmentDomainService(enrichment_provider=enrichment_provider)
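The factory above chose LiteLLM when an enrichment endpoint was configured and fell back to the local provider otherwise. A minimal sketch of how it was invoked follows; the no-argument AppContext() construction is an assumption for illustration, not something shown in this diff.

# Hypothetical sketch: building the enrichment service from configuration (kodit 0.4.3).
from kodit.config import AppContext
from kodit.infrastructure.enrichment.enrichment_factory import (
    enrichment_domain_service_factory,
)

app_context = AppContext()  # assumed default construction; endpoint settings drive selection
service = enrichment_domain_service_factory(app_context)
# app_context.enrichment_endpoint set   -> LiteLLMEnrichmentProvider
# app_context.enrichment_endpoint unset -> LocalEnrichmentProvider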
--- a/kodit/infrastructure/enrichment/null_enrichment_provider.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""Null enrichment provider for testing."""
-
-import re
-from collections.abc import AsyncGenerator
-
-from kodit.domain.services.enrichment_service import EnrichmentProvider
-from kodit.domain.value_objects import EnrichmentRequest, EnrichmentResponse
-
-
-class NullEnrichmentProvider(EnrichmentProvider):
-    """Null enrichment provider that returns empty responses."""
-
-    async def enrich(
-        self, requests: list[EnrichmentRequest]
-    ) -> AsyncGenerator[EnrichmentResponse, None]:
-        """Only keep alphabetic characters."""
-        for request in requests:
-            response = re.sub(r"[^a-zA-Z]", " ", request.text)
-            yield EnrichmentResponse(snippet_id=request.snippet_id, text=response)
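Since enrich() is an async generator, callers consumed the null provider with async for. A test-style sketch, assuming EnrichmentRequest accepts the snippet_id and text fields referenced in the code above:

# Hypothetical sketch: consuming the removed null provider (kodit 0.4.3).
import asyncio

from kodit.domain.value_objects import EnrichmentRequest
from kodit.infrastructure.enrichment.null_enrichment_provider import (
    NullEnrichmentProvider,
)


async def main() -> None:
    provider = NullEnrichmentProvider()
    requests = [EnrichmentRequest(snippet_id=1, text="def foo(): return 42")]  # field types assumed
    async for response in provider.enrich(requests):
        # Non-alphabetic characters are replaced with spaces.
        print(response.snippet_id, repr(response.text))


asyncio.run(main())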
--- a/kodit/infrastructure/mappers/index_mapper.py
+++ /dev/null
@@ -1,345 +0,0 @@
-"""Mapping between domain Index aggregate and SQLAlchemy entities."""
-
-from datetime import UTC, datetime
-from pathlib import Path
-
-from pydantic import AnyUrl
-from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
-
-import kodit.domain.entities as domain_entities
-from kodit.domain.value_objects import (
-    FileProcessingStatus,
-    SourceType,
-)
-from kodit.infrastructure.sqlalchemy import entities as db_entities
-
-
-# TODO(Phil): Make this a pure mapper without any DB access  # noqa: TD003, FIX002
-class IndexMapper:
-    """Mapper for converting between domain Index aggregate and database entities."""
-
-    def __init__(self, session: AsyncSession) -> None:
-        """Initialize mapper with database session."""
-        self._session = session
-
-    async def to_domain_index(
-        self, db_index: db_entities.Index
-    ) -> domain_entities.Index:
-        """Convert SQLAlchemy Index to domain Index aggregate.
-
-        Loads the full aggregate including Source, WorkingCopy, Files, and Snippets.
-        """
-        # Load the source
-        db_source = await self._session.get(db_entities.Source, db_index.source_id)
-        if not db_source:
-            raise ValueError(f"Source not found for index {db_index.id}")
-
-        # Load files for the source
-        files_stmt = select(db_entities.File).where(
-            db_entities.File.source_id == db_source.id
-        )
-        db_files = (await self._session.scalars(files_stmt)).all()
-
-        # Convert files to domain
-        domain_files = []
-        for db_file in db_files:
-            # Load authors for this file
-            authors_stmt = (
-                select(db_entities.Author)
-                .join(db_entities.AuthorFileMapping)
-                .where(db_entities.AuthorFileMapping.file_id == db_file.id)
-            )
-            db_authors = (await self._session.scalars(authors_stmt)).all()
-
-            domain_authors = [
-                domain_entities.Author(
-                    id=author.id, name=author.name, email=author.email
-                )
-                for author in db_authors
-            ]
-
-            domain_file = domain_entities.File(
-                id=db_file.id,
-                created_at=db_file.created_at,
-                updated_at=db_file.updated_at,
-                uri=AnyUrl(db_file.uri),
-                sha256=db_file.sha256,
-                authors=domain_authors,
-                mime_type=db_file.mime_type,
-                file_processing_status=FileProcessingStatus(
-                    db_file.file_processing_status
-                ),
-            )
-            domain_files.append(domain_file)
-
-        # Create working copy
-        working_copy = domain_entities.WorkingCopy(
-            created_at=db_source.created_at,
-            updated_at=db_source.updated_at,
-            remote_uri=AnyUrl(db_source.uri),
-            cloned_path=Path(db_source.cloned_path),
-            source_type=SourceType(db_source.type.value),
-            files=domain_files,
-        )
-
-        # Create source
-        domain_source = domain_entities.Source(
-            id=db_source.id,
-            created_at=db_source.created_at,
-            updated_at=db_source.updated_at,
-            working_copy=working_copy,
-        )
-
-        # Load snippets for this index
-        snippets_stmt = select(db_entities.Snippet).where(
-            db_entities.Snippet.index_id == db_index.id
-        )
-        db_snippets = (await self._session.scalars(snippets_stmt)).all()
-
-        domain_snippets = []
-        for db_snippet in db_snippets:
-            domain_snippet = await self.to_domain_snippet(db_snippet, domain_files)
-            domain_snippets.append(domain_snippet)
-
-        # Create index aggregate
-        return domain_entities.Index(
-            id=db_index.id,
-            created_at=db_index.created_at,
-            updated_at=db_index.updated_at,
-            source=domain_source,
-            snippets=domain_snippets,
-        )
-
-    async def to_domain_source(
-        self, db_source: db_entities.Source
-    ) -> domain_entities.Source:
-        """Convert SQLAlchemy Source to domain Source."""
-        # Load files for the source
-        files_stmt = select(db_entities.File).where(
-            db_entities.File.source_id == db_source.id
-        )
-        db_files = (await self._session.scalars(files_stmt)).all()
-
-        # Convert files to domain
-        domain_files = []
-        for db_file in db_files:
-            # Load authors for this file
-            authors_stmt = (
-                select(db_entities.Author)
-                .join(db_entities.AuthorFileMapping)
-                .where(db_entities.AuthorFileMapping.file_id == db_file.id)
-            )
-            db_authors = (await self._session.scalars(authors_stmt)).all()
-
-            domain_authors = [
-                domain_entities.Author(
-                    id=author.id, name=author.name, email=author.email
-                )
-                for author in db_authors
-            ]
-
-            domain_file = domain_entities.File(
-                id=db_file.id,
-                created_at=db_file.created_at,
-                updated_at=db_file.updated_at,
-                uri=AnyUrl(db_file.uri),
-                sha256=db_file.sha256,
-                authors=domain_authors,
-                mime_type=db_file.mime_type,
-                file_processing_status=FileProcessingStatus(
-                    db_file.file_processing_status
-                ),
-            )
-            domain_files.append(domain_file)
-
-        # Create working copy
-        working_copy = domain_entities.WorkingCopy(
-            created_at=db_source.created_at,
-            updated_at=db_source.updated_at,
-            remote_uri=AnyUrl(db_source.uri),
-            cloned_path=Path(db_source.cloned_path),
-            source_type=SourceType(db_source.type.value),
-            files=domain_files,
-        )
-
-        # Create source
-        return domain_entities.Source(
-            id=db_source.id,
-            created_at=db_source.created_at,
-            updated_at=db_source.updated_at,
-            working_copy=working_copy,
-        )
-
-    async def to_domain_file(self, db_file: db_entities.File) -> domain_entities.File:
-        """Convert SQLAlchemy File to domain File."""
-        # Load authors for this file
-        authors_stmt = (
-            select(db_entities.Author)
-            .join(db_entities.AuthorFileMapping)
-            .where(db_entities.AuthorFileMapping.file_id == db_file.id)
-        )
-        db_authors = (await self._session.scalars(authors_stmt)).all()
-
-        domain_authors = [
-            domain_entities.Author(id=author.id, name=author.name, email=author.email)
-            for author in db_authors
-        ]
-
-        return domain_entities.File(
-            id=db_file.id,
-            created_at=db_file.created_at,
-            updated_at=db_file.updated_at,
-            uri=AnyUrl(db_file.uri),
-            sha256=db_file.sha256,
-            authors=domain_authors,
-            mime_type=db_file.mime_type,
-            file_processing_status=FileProcessingStatus(db_file.file_processing_status),
-        )
-
-    async def to_domain_snippet(
-        self, db_snippet: db_entities.Snippet, domain_files: list[domain_entities.File]
-    ) -> domain_entities.Snippet:
-        """Convert SQLAlchemy Snippet to domain Snippet."""
-        # Find the file this snippet derives from
-        derives_from = []
-        for domain_file in domain_files:
-            if domain_file.id == db_snippet.file_id:
-                derives_from.append(domain_file)
-                break
-
-        # Create domain snippet with original content
-        domain_snippet = domain_entities.Snippet(
-            id=db_snippet.id,
-            created_at=db_snippet.created_at,
-            updated_at=db_snippet.updated_at,
-            derives_from=derives_from,
-        )
-
-        # Add original content
-        if db_snippet.content:
-            domain_snippet.add_original_content(db_snippet.content, "unknown")
-
-        # Add summary content if it exists
-        if db_snippet.summary:
-            domain_snippet.add_summary(db_snippet.summary)
-
-        return domain_snippet
-
-    async def from_domain_index(  # noqa: C901
-        self, domain_index: domain_entities.Index
-    ) -> tuple[
-        db_entities.Index,
-        db_entities.Source,
-        list[db_entities.File],
-        list[db_entities.Author],
-    ]:
-        """Convert domain Index aggregate to SQLAlchemy entities.
-
-        Returns all the entities that need to be persisted.
-        """
-        # Create source entity
-        db_source = db_entities.Source(
-            uri=str(domain_index.source.working_copy.remote_uri),
-            cloned_path=str(domain_index.source.working_copy.cloned_path),
-            source_type=db_entities.SourceType(
-                domain_index.source.working_copy.source_type.value
-            ),
-        )
-        if domain_index.source.id:
-            db_source.id = domain_index.source.id
-        if domain_index.source.created_at:
-            db_source.created_at = domain_index.source.created_at
-        if domain_index.source.updated_at:
-            db_source.updated_at = domain_index.source.updated_at
-
-        # Create index entity
-        # Will be set after source is saved
-        db_index = db_entities.Index(source_id=db_source.id or 0)
-        if domain_index.id:
-            db_index.id = domain_index.id
-        if domain_index.created_at:
-            db_index.created_at = domain_index.created_at
-        if domain_index.updated_at:
-            db_index.updated_at = domain_index.updated_at
-
-        # Create file entities
-        db_files = []
-        all_authors = []
-
-        for domain_file in domain_index.source.working_copy.files:
-            now = datetime.now(UTC)
-            db_file = db_entities.File(
-                created_at=domain_file.created_at or now,
-                updated_at=domain_file.updated_at or now,
-                source_id=db_source.id or 0,  # Will be set after source is saved
-                mime_type="",  # Would need to be determined
-                uri=str(domain_file.uri),
-                # Would need to be determined from working copy + relative path
-                cloned_path="",
-                sha256=domain_file.sha256,
-                size_bytes=0,  # Would need to be determined
-                extension="",  # Would need to be determined
-                file_processing_status=domain_file.file_processing_status.value,
-            )
-            if domain_file.id:
-                db_file.id = domain_file.id
-
-            db_files.append(db_file)
-            all_authors.extend(domain_file.authors)
-
-        # Create unique author entities
-        unique_authors = {}
-        for author in all_authors:
-            key = (author.name, author.email)
-            if key not in unique_authors:
-                db_author = db_entities.Author(name=author.name, email=author.email)
-                if author.id:
-                    db_author.id = author.id
-                unique_authors[key] = db_author
-
-        return db_index, db_source, db_files, list(unique_authors.values())
-
-    async def from_domain_snippet(
-        self, domain_snippet: domain_entities.Snippet, index_id: int
-    ) -> db_entities.Snippet:
-        """Convert domain Snippet to SQLAlchemy Snippet."""
-        # Get file ID from derives_from (use first file if multiple)
-        if not domain_snippet.derives_from:
-            raise ValueError("Snippet must derive from at least one file")
-
-        file_id = domain_snippet.derives_from[0].id
-        if file_id is None:
-            raise ValueError("File must have an ID")
-
-        db_snippet = db_entities.Snippet(
-            file_id=file_id,
-            index_id=index_id,
-            content=domain_snippet.original_text(),
-            summary=domain_snippet.summary_text(),
-        )
-
-        if domain_snippet.id:
-            db_snippet.id = domain_snippet.id
-        if domain_snippet.created_at:
-            db_snippet.created_at = domain_snippet.created_at
-        if domain_snippet.updated_at:
-            db_snippet.updated_at = domain_snippet.updated_at
-
-        return db_snippet
-
-    async def load_snippets_for_index(
-        self, index_id: int, domain_files: list[domain_entities.File]
-    ) -> list[domain_entities.Snippet]:
-        """Load all snippets for an index and convert to domain entities."""
-        stmt = select(db_entities.Snippet).where(
-            db_entities.Snippet.index_id == index_id
-        )
-        db_snippets = (await self._session.scalars(stmt)).all()
-
-        domain_snippets = []
-        for db_snippet in db_snippets:
-            domain_snippet = await self.to_domain_snippet(db_snippet, domain_files)
-            domain_snippets.append(domain_snippet)
-
-        return domain_snippets
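As the TODO at the top of the hunk notes, IndexMapper was not a pure mapper: it issued its own queries and therefore needed a live AsyncSession. A rough sketch of a read path, assuming an async database URL and a persisted index row (the helper function name is illustrative):

# Hypothetical sketch: loading a domain Index aggregate via the removed mapper (kodit 0.4.3).
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine

from kodit.infrastructure.mappers.index_mapper import IndexMapper
from kodit.infrastructure.sqlalchemy import entities as db_entities


async def load_index(index_id: int, database_url: str):
    engine = create_async_engine(database_url)
    async with AsyncSession(engine) as session:
        db_index = await session.get(db_entities.Index, index_id)
        if db_index is None:
            raise ValueError(f"No index with id {index_id}")
        mapper = IndexMapper(session)
        # to_domain_index() loads the Source, WorkingCopy, Files, Authors and Snippets.
        return await mapper.to_domain_index(db_index)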
--- a/kodit/infrastructure/reporting/tdqm_progress.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""TQDM progress."""
-
-from tqdm import tqdm
-
-from kodit.config import ReportingConfig
-from kodit.domain.entities import TaskStatus
-from kodit.domain.protocols import ReportingModule
-from kodit.domain.value_objects import ReportingState
-
-
-class TQDMReportingModule(ReportingModule):
-    """TQDM reporting module."""
-
-    def __init__(self, config: ReportingConfig) -> None:
-        """Initialize the TQDM reporting module."""
-        self.config = config
-        self.pbar = tqdm()
-
-    async def on_change(self, progress: TaskStatus) -> None:
-        """On step changed."""
-        step = progress
-        if step.state == ReportingState.COMPLETED:
-            self.pbar.close()
-            return
-
-        self.pbar.set_description(step.operation)
-        self.pbar.refresh()
-        # Update description if message is provided
-        if step.error:
-            # Fix the event message to a specific size so it's not jumping around
-            # If it's too small, add spaces
-            # If it's too large, truncate
-            if len(step.error) < 30:
-                self.pbar.set_description(step.error + " " * (30 - len(step.error)))
-            else:
-                self.pbar.set_description(step.error[-30:])
-        else:
-            self.pbar.set_description(step.operation)
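The only non-obvious part of the module above is the fixed-width description trick used to stop the bar from resizing. The stand-alone sketch below demonstrates the same technique with plain tqdm and no kodit types; the messages and widths are illustrative.

# Stand-alone sketch of the fixed-width description approach (illustrative values).
from tqdm import tqdm

WIDTH = 30
pbar = tqdm(total=2)
for message in ["cloning", "an unusually long error message that would resize the bar"]:
    # Pad short messages with spaces; keep only the tail of long ones.
    text = message + " " * (WIDTH - len(message)) if len(message) < WIDTH else message[-WIDTH:]
    pbar.set_description(text)
    pbar.update(1)
pbar.close()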
--- a/kodit/infrastructure/slicing/language_detection_service.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Language detection service implementation."""
-
-from pathlib import Path
-
-from kodit.domain.services.index_service import LanguageDetectionService
-
-
-class FileSystemLanguageDetectionService(LanguageDetectionService):
-    """Simple file extension based language detection service."""
-
-    def __init__(self, language_map: dict[str, str]) -> None:
-        """Initialize with a mapping of extensions to languages."""
-        self._language_map = language_map
-
-    async def detect_language(self, file_path: Path) -> str:
-        """Detect language based on file extension."""
-        extension = file_path.suffix.lstrip(".")
-        return self._language_map.get(extension, "unknown")
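The removed detector was a thin dictionary lookup keyed on file extension. A minimal usage sketch follows; the extension-to-language map shown is illustrative, not the map kodit actually shipped.

# Hypothetical sketch: extension-based language detection (kodit 0.4.3).
import asyncio
from pathlib import Path

from kodit.infrastructure.slicing.language_detection_service import (
    FileSystemLanguageDetectionService,
)

detector = FileSystemLanguageDetectionService({"py": "python", "rs": "rust"})
print(asyncio.run(detector.detect_language(Path("src/main.rs"))))   # -> "rust"
print(asyncio.run(detector.detect_language(Path("notes.txt"))))     # -> "unknown"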