kodit 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of kodit has been flagged as potentially problematic.
- kodit/_version.py +2 -2
- kodit/app.py +6 -0
- kodit/cli.py +8 -2
- kodit/embedding/embedding_factory.py +11 -0
- kodit/embedding/embedding_provider/embedding_provider.py +42 -14
- kodit/embedding/embedding_provider/hash_embedding_provider.py +16 -7
- kodit/embedding/embedding_provider/local_embedding_provider.py +43 -11
- kodit/embedding/embedding_provider/openai_embedding_provider.py +18 -22
- kodit/embedding/local_vector_search_service.py +46 -13
- kodit/embedding/vector_search_service.py +18 -1
- kodit/embedding/vectorchord_vector_search_service.py +63 -16
- kodit/enrichment/enrichment_factory.py +3 -0
- kodit/enrichment/enrichment_provider/enrichment_provider.py +21 -1
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +39 -28
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +25 -27
- kodit/enrichment/enrichment_service.py +19 -7
- kodit/indexing/indexing_service.py +50 -23
- kodit/log.py +126 -24
- kodit/migrations/versions/9e53ea8bb3b0_add_authors.py +103 -0
- kodit/source/source_factories.py +356 -0
- kodit/source/source_models.py +17 -5
- kodit/source/source_repository.py +49 -20
- kodit/source/source_service.py +41 -218
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/METADATA +2 -2
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/RECORD +28 -27
- kodit/migrations/versions/42e836b21102_add_authors.py +0 -64
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/WHEEL +0 -0
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/licenses/LICENSE +0 -0
kodit/log.py
CHANGED
@@ -1,20 +1,33 @@
 """Logging configuration for kodit."""

 import logging
+import platform
+import re
+import shutil
+import subprocess
 import sys
 import uuid
 from enum import Enum
 from functools import lru_cache
+from pathlib import Path
 from typing import Any

+import rudderstack.analytics as rudder_analytics
 import structlog
-from posthog import Posthog
 from structlog.types import EventDict

+from kodit import _version
 from kodit.config import AppContext

+_MAC_RE = re.compile(r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}")
+
 log = structlog.get_logger(__name__)

+rudder_analytics.write_key = "2wm1RmV2GnO92NGSs8yYtmSI0mi"
+rudder_analytics.dataPlaneUrl = (
+    "https://danbmedefzavzlslreyxjgcjwlf.dataplane.rudderstack.com"
+)
+

 def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict:  # noqa: ANN001
     """Drop the `color_message` key from the event dict."""
@@ -131,35 +144,124 @@ def configure_logging(app_context: AppContext) -> None:
     sys.excepthook = handle_exception


-
-
-
-)
+def configure_telemetry(app_context: AppContext) -> None:
+    """Configure telemetry for the application."""
+    if app_context.disable_telemetry:
+        structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
+        rudder_analytics.send = False

+    rudder_analytics.identify(
+        anonymous_id=get_stable_mac_str(),
+        traits={},
+    )

-@lru_cache(maxsize=1)
-def get_mac_address() -> str:
-    """Get the MAC address of the primary network interface.

-
-
+def log_event(event: str, properties: dict[str, Any] | None = None) -> None:
+    """Log an event to Rudderstack."""
+    p = properties or {}
+    # Set default posthog properties
+    p["$app_name"] = "kodit"
+    p["$app_version"] = _version.version
+    p["$os"] = sys.platform
+    p["$os_version"] = sys.version
+    rudder_analytics.track(
+        anonymous_id=get_stable_mac_str(),
+        event=event,
+        properties=properties or {},
+    )
+

+# ----------------------------------------------------------------------
+# Helper functions
+# ----------------------------------------------------------------------
+def _mac_int(mac: str) -> int:
+    return int(mac.replace(":", "").replace("-", ""), 16)
+
+
+def _is_globally_administered(mac_int: int) -> bool:
+    first_octet = (mac_int >> 40) & 0xFF
+    return not (first_octet & 0b11)  # both bits must be 0
+
+
+def _from_sysfs() -> list[int]:
+    base = Path("/sys/class/net")
+    if not base.is_dir():
+        return []
+    macs: list[int] = []
+    for iface in base.iterdir():
+        try:
+            with (base / iface / "address").open() as f:
+                content = f.read().strip()
+                if _MAC_RE.fullmatch(content):
+                    macs.append(_mac_int(content))
+        except (FileNotFoundError, PermissionError):
+            pass
+    return macs
+
+
+def _from_command(cmd: str) -> list[int]:
+    try:
+        out = subprocess.check_output(  # noqa: S602
+            cmd,
+            shell=True,
+            text=True,
+            stderr=subprocess.DEVNULL,
+            encoding="utf-8",
+        )
+    except Exception:  # noqa: BLE001
+        return []
+    return [_mac_int(m.group()) for m in _MAC_RE.finditer(out)]
+
+
+@lru_cache(maxsize=1)
+def get_stable_mac_int() -> int | None:
+    """Return a *hardware* MAC as an int, or None if none can be found.
+
+    Search order:
+    1. /sys/class/net (Linux)
+    2. `ip link show` (Linux), `ifconfig -a` (Linux+macOS)
+    3. `getmac` and `wmic nic` (Windows)
+    The first globally-administered, non-multicast address wins.
     """
-
-
-
+    system = platform.system()
+    candidates: list[int] = []
+
+    if system == "Linux":
+        candidates += _from_sysfs()
+        if not candidates and shutil.which("ip"):
+            candidates += _from_command("ip link show")
+        if not candidates:  # fall back to ifconfig
+            candidates += _from_command("ifconfig -a")
+
+    elif system == "Darwin":  # macOS
+        candidates += _from_command("ifconfig -a")
+
+    elif system == "Windows":
+        # getmac is present on every supported Windows version
+        candidates += _from_command("getmac /v /fo list")
+        # wmic still exists through at least Win 11
+        candidates += _from_command(
+            'wmic nic where "MACAddress is not null" get MACAddress /format:list'
+        )

+    # Prefer globally administered, non-multicast addresses
+    for mac in candidates:
+        if _is_globally_administered(mac):
+            return mac

-
-
-
-    structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
-    posthog.disabled = True
+    # If all we saw were locally-administered MACs, just return the first one
+    if candidates:
+        return candidates[0]

+    return None

-
-
-
-
-)
-
+
+def get_stable_mac_str() -> str:
+    """Return a *stable* 12-digit hex string (lower-case, no separators).
+
+    Falls back to uuid.getnode() if necessary, so it never raises.
+    """
+    mac_int = get_stable_mac_int()
+    if mac_int is None:
+        mac_int = uuid.getnode()  # may still be random in VMs
+    return f"{mac_int:012x}"
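
The anonymous telemetry ID above is derived from the first globally administered MAC address, i.e. one whose first octet has both the multicast (I/G) and locally-administered (U/L) bits clear. A standalone sketch of that bit test — the helper names below restate the module's private helpers rather than importing kodit.log, which would configure RudderStack at import time; the MAC values are illustrative:

def mac_int(mac: str) -> int:
    # "aa:bb:cc:dd:ee:ff" -> 48-bit integer
    return int(mac.replace(":", "").replace("-", ""), 16)

def is_globally_administered(value: int) -> bool:
    # Bit 0 of the first octet is the multicast (I/G) bit and bit 1 is the
    # locally-administered (U/L) bit; both must be 0 for a burned-in address.
    return not ((value >> 40) & 0b11)

print(is_globally_administered(mac_int("3c:22:fb:aa:bb:cc")))  # True: vendor-assigned
print(is_globally_administered(mac_int("02:42:ac:11:00:02")))  # False: locally administered (Docker-style)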
kodit/migrations/versions/9e53ea8bb3b0_add_authors.py
ADDED
@@ -0,0 +1,103 @@
+# ruff: noqa
+"""add authors
+
+Revision ID: 9e53ea8bb3b0
+Revises: c3f5137d30f5
+Create Date: 2025-06-14 10:50:36.058114
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "9e53ea8bb3b0"
+down_revision: Union[str, None] = "c3f5137d30f5"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # Define the enum type separately so we can explicitly create it when needed
+    source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
+
+    # Explicitly create the enum type for PostgreSQL (no-op on SQLite)
+    source_type.create(op.get_bind(), checkfirst=True)
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "authors",
+        sa.Column("name", sa.String(length=255), nullable=False),
+        sa.Column("email", sa.String(length=255), nullable=False),
+        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name", "email", name="uix_author"),
+    )
+    op.create_index(op.f("ix_authors_email"), "authors", ["email"], unique=False)
+    op.create_index(op.f("ix_authors_name"), "authors", ["name"], unique=False)
+    op.create_table(
+        "author_file_mappings",
+        sa.Column("author_id", sa.Integer(), nullable=False),
+        sa.Column("file_id", sa.Integer(), nullable=False),
+        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
+        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["author_id"],
+            ["authors.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["file_id"],
+            ["files.id"],
+        ),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("author_id", "file_id", name="uix_author_file_mapping"),
+    )
+    op.create_index(
+        op.f("ix_author_file_mappings_author_id"),
+        "author_file_mappings",
+        ["author_id"],
+        unique=False,
+    )
+    op.create_index(
+        op.f("ix_author_file_mappings_file_id"),
+        "author_file_mappings",
+        ["file_id"],
+        unique=False,
+    )
+    op.add_column(
+        "files", sa.Column("extension", sa.String(length=255), nullable=False)
+    )
+    op.create_index(op.f("ix_files_extension"), "files", ["extension"], unique=False)
+    op.add_column("sources", sa.Column("type", source_type, nullable=False))
+    op.create_index(op.f("ix_sources_type"), "sources", ["type"], unique=False)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(op.f("ix_sources_type"), table_name="sources")
+    op.drop_column("sources", "type")
+    op.drop_index(op.f("ix_files_extension"), table_name="files")
+    op.drop_column("files", "extension")
+    op.drop_index(
+        op.f("ix_author_file_mappings_file_id"), table_name="author_file_mappings"
+    )
+    op.drop_index(
+        op.f("ix_author_file_mappings_author_id"), table_name="author_file_mappings"
+    )
+    op.drop_table("author_file_mappings")
+    op.drop_index(op.f("ix_authors_name"), table_name="authors")
+    op.drop_index(op.f("ix_authors_email"), table_name="authors")
+    op.drop_table("authors")
+
+    # Explicitly drop the enum type (PostgreSQL)
+    source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
+    source_type.drop(op.get_bind(), checkfirst=True)
+    # ### end Alembic commands ###
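
This revision replaces the removed 42e836b21102_add_authors migration (see the file list above). A minimal sketch of applying it programmatically via Alembic's command API, assuming an alembic.ini configured for kodit's database — the config path is an assumption, not something this diff specifies:

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")   # assumed location of the Alembic config
command.upgrade(cfg, "head")  # runs upgrade(), creating the authors tables
# command.downgrade(cfg, "-1")  # would run downgrade(), dropping them again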
kodit/source/source_factories.py
ADDED
@@ -0,0 +1,356 @@
+"""Source factories for creating different types of sources.
+
+This module provides factory classes for creating sources, improving cohesion by
+separating the concerns of different source types.
+"""
+
+import mimetypes
+import shutil
+import tempfile
+from abc import ABC, abstractmethod
+from datetime import UTC, datetime
+from hashlib import sha256
+from pathlib import Path
+from typing import Protocol
+
+import aiofiles
+import git
+import structlog
+from tqdm import tqdm
+
+from kodit.source.ignore import IgnorePatterns
+from kodit.source.source_models import (
+    Author,
+    AuthorFileMapping,
+    File,
+    Source,
+    SourceType,
+)
+from kodit.source.source_repository import SourceRepository
+
+
+class WorkingCopyProvider(Protocol):
+    """Protocol for providing working copies of sources."""
+
+    async def prepare(self, uri: str) -> Path:
+        """Prepare a working copy and return its path."""
+        ...
+
+
+class FileMetadataExtractor(Protocol):
+    """Protocol for extracting file metadata."""
+
+    async def extract(self, path: Path, source: Source) -> File:
+        """Extract metadata from a file."""
+        ...
+
+
+class AuthorExtractor(Protocol):
+    """Protocol for extracting author information."""
+
+    async def extract(self, path: Path, source: Source) -> list[Author]:
+        """Extract authors for a file."""
+        ...
+
+
+class SourceFactory(ABC):
+    """Abstract base class for source factories."""
+
+    def __init__(
+        self,
+        working_copy: WorkingCopyProvider,
+        metadata_extractor: FileMetadataExtractor,
+        author_extractor: AuthorExtractor,
+        repository: SourceRepository,
+    ) -> None:
+        """Initialize the source factory."""
+        self.working_copy = working_copy
+        self.metadata_extractor = metadata_extractor
+        self.author_extractor = author_extractor
+        self.repository = repository
+        self.log = structlog.get_logger(__name__)
+
+    @abstractmethod
+    async def create(self, uri: str) -> Source:
+        """Create a source from a URI."""
+        ...
+
+    async def _process_files(self, source: Source, files: list[Path]) -> None:
+        """Process files for a source."""
+        for path in tqdm(files, total=len(files), leave=False):
+            if not path.is_file():
+                continue
+
+            # Extract file metadata
+            file_record = await self.metadata_extractor.extract(path, source)
+            await self.repository.create_file(file_record)
+
+            # Extract authors
+            authors = await self.author_extractor.extract(path, source)
+            for author in authors:
+                await self.repository.upsert_author_file_mapping(
+                    AuthorFileMapping(
+                        author_id=author.id,
+                        file_id=file_record.id,
+                    )
+                )
+
+
+class GitSourceFactory(SourceFactory):
+    """Factory for creating Git sources."""
+
+    async def create(self, uri: str) -> Source:
+        """Create a git source from a URI."""
+        # Normalize the URI
+        self.log.debug("Normalising git uri", uri=uri)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            git.Repo.clone_from(uri, temp_dir)
+            remote = git.Repo(temp_dir).remote()
+            uri = remote.url
+
+        # Check if source already exists
+        self.log.debug("Checking if source already exists", uri=uri)
+        source = await self.repository.get_source_by_uri(uri)
+
+        if source:
+            self.log.info("Source already exists, reusing...", source_id=source.id)
+            return source
+
+        # Prepare working copy
+        clone_path = await self.working_copy.prepare(uri)
+
+        # Create source record
+        self.log.debug("Creating source", uri=uri, clone_path=str(clone_path))
+        source = await self.repository.create_source(
+            Source(
+                uri=uri,
+                cloned_path=str(clone_path),
+                source_type=SourceType.GIT,
+            )
+        )
+
+        # Get files to process using ignore patterns
+        ignore_patterns = IgnorePatterns(clone_path)
+        files = [
+            f
+            for f in clone_path.rglob("*")
+            if f.is_file() and not ignore_patterns.should_ignore(f)
+        ]
+
+        # Process files
+        self.log.info("Inspecting files", source_id=source.id, num_files=len(files))
+        await self._process_files(source, files)
+
+        return source
+
+
+class FolderSourceFactory(SourceFactory):
+    """Factory for creating folder sources."""
+
+    async def create(self, uri: str) -> Source:
+        """Create a folder source from a path."""
+        directory = Path(uri).expanduser().resolve()
+
+        # Check if source already exists
+        source = await self.repository.get_source_by_uri(directory.as_uri())
+        if source:
+            self.log.info("Source already exists, reusing...", source_id=source.id)
+            return source
+
+        # Validate directory exists
+        if not directory.exists():
+            msg = f"Folder does not exist: {directory}"
+            raise ValueError(msg)
+
+        # Prepare working copy
+        clone_path = await self.working_copy.prepare(directory.as_uri())
+
+        # Create source record
+        source = await self.repository.create_source(
+            Source(
+                uri=directory.as_uri(),
+                cloned_path=str(clone_path),
+                source_type=SourceType.FOLDER,
+            )
+        )
+
+        # Get all files to process
+        files = [f for f in clone_path.rglob("*") if f.is_file()]
+
+        # Process files
+        await self._process_files(source, files)
+
+        return source
+
+
+class GitWorkingCopyProvider:
+    """Working copy provider for Git repositories."""
+
+    def __init__(self, clone_dir: Path) -> None:
+        """Initialize the provider."""
+        self.clone_dir = clone_dir
+        self.log = structlog.get_logger(__name__)
+
+    async def prepare(self, uri: str) -> Path:
+        """Prepare a Git working copy."""
+        # Create a unique directory name for the clone
+        clone_path = self.clone_dir / uri.replace("/", "_").replace(":", "_")
+        clone_path.mkdir(parents=True, exist_ok=True)
+
+        try:
+            self.log.info("Cloning repository", uri=uri, clone_path=str(clone_path))
+            git.Repo.clone_from(uri, clone_path)
+        except git.GitCommandError as e:
+            if "already exists and is not an empty directory" not in str(e):
+                msg = f"Failed to clone repository: {e}"
+                raise ValueError(msg) from e
+            self.log.info("Repository already exists, reusing...", uri=uri)
+
+        return clone_path
+
+
+class FolderWorkingCopyProvider:
+    """Working copy provider for local folders."""
+
+    def __init__(self, clone_dir: Path) -> None:
+        """Initialize the provider."""
+        self.clone_dir = clone_dir
+
+    async def prepare(self, uri: str) -> Path:
+        """Prepare a folder working copy."""
+        # Handle file:// URIs
+        if uri.startswith("file://"):
+            from urllib.parse import urlparse
+
+            parsed = urlparse(uri)
+            directory = Path(parsed.path).expanduser().resolve()
+        else:
+            directory = Path(uri).expanduser().resolve()
+
+        # Clone into a local directory
+        clone_path = self.clone_dir / directory.as_posix().replace("/", "_")
+        clone_path.mkdir(parents=True, exist_ok=True)
+
+        # Copy all files recursively, preserving directory structure, ignoring
+        # hidden files
+        shutil.copytree(
+            directory,
+            clone_path,
+            ignore=shutil.ignore_patterns(".*"),
+            dirs_exist_ok=True,
+        )
+
+        return clone_path
+
+
+class BaseFileMetadataExtractor:
+    """Base class for file metadata extraction with common functionality."""
+
+    async def extract(self, path: Path, source: Source) -> File:
+        """Extract metadata from a file."""
+        # Get timestamps - to be implemented by subclasses
+        created_at, updated_at = await self._get_timestamps(path, source)
+
+        # Read file content and calculate metadata
+        async with aiofiles.open(path, "rb") as f:
+            content = await f.read()
+            mime_type = mimetypes.guess_type(path)
+            sha = sha256(content).hexdigest()
+
+        return File(
+            created_at=created_at,
+            updated_at=updated_at,
+            source_id=source.id,
+            cloned_path=str(path),
+            mime_type=mime_type[0]
+            if mime_type and mime_type[0]
+            else "application/octet-stream",
+            uri=path.as_uri(),
+            sha256=sha,
+            size_bytes=len(content),
+        )
+
+    async def _get_timestamps(
+        self, path: Path, source: Source
+    ) -> tuple[datetime, datetime]:
+        """Get creation and modification timestamps. To be implemented by subclasses."""
+        raise NotImplementedError
+
+
+class GitFileMetadataExtractor(BaseFileMetadataExtractor):
+    """Git-specific implementation for extracting file metadata."""
+
+    async def _get_timestamps(
+        self, path: Path, source: Source
+    ) -> tuple[datetime, datetime]:
+        """Get timestamps from Git history."""
+        git_repo = git.Repo(source.cloned_path)
+        commits = list(git_repo.iter_commits(paths=str(path), all=True))
+
+        if commits:
+            last_modified_at = commits[0].committed_datetime
+            first_modified_at = commits[-1].committed_datetime
+            return first_modified_at, last_modified_at
+        # Fallback to current time if no commits found
+        now = datetime.now(UTC)
+        return now, now
+
+
+class FolderFileMetadataExtractor(BaseFileMetadataExtractor):
+    """Folder-specific implementation for extracting file metadata."""
+
+    async def _get_timestamps(
+        self,
+        path: Path,
+        source: Source,  # noqa: ARG002
+    ) -> tuple[datetime, datetime]:
+        """Get timestamps from file system."""
+        stat = path.stat()
+        file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
+        file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
+        return file_created_at, file_modified_at
+
+
+class GitAuthorExtractor:
+    """Author extractor for Git repositories."""
+
+    def __init__(self, repository: SourceRepository) -> None:
+        """Initialize the extractor."""
+        self.repository = repository
+
+    async def extract(self, path: Path, source: Source) -> list[Author]:
+        """Extract authors from a Git file."""
+        authors: list[Author] = []
+        git_repo = git.Repo(source.cloned_path)
+
+        try:
+            # Get the file's blame
+            blames = git_repo.blame("HEAD", str(path))
+
+            # Extract the blame's authors
+            actors = [
+                commit.author
+                for blame in blames or []
+                for commit in blame
+                if isinstance(commit, git.Commit)
+            ]
+
+            # Get or create the authors in the database
+            for actor in actors:
+                if actor.email:
+                    author = Author.from_actor(actor)
+                    author = await self.repository.upsert_author(author)
+                    authors.append(author)
+        except git.GitCommandError:
+            # Handle cases where file might not be tracked
+            pass
+
+        return authors
+
+
+class NoOpAuthorExtractor:
+    """No-op author extractor for sources that don't have author information."""
+
+    async def extract(self, path: Path, source: Source) -> list[Author]:  # noqa: ARG002
+        """Return empty list of authors."""
+        return []
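
The concrete pieces compose per source type: judging by the class names, GitSourceFactory pairs GitWorkingCopyProvider with GitFileMetadataExtractor and GitAuthorExtractor, while FolderSourceFactory pairs FolderWorkingCopyProvider with FolderFileMetadataExtractor and NoOpAuthorExtractor (the actual wiring lives in source_service.py per the file list). A wiring sketch for the Git case; the async session and the SourceRepository constructor signature are placeholders assumed for illustration:

import asyncio
from pathlib import Path

from kodit.source.source_factories import (
    GitAuthorExtractor,
    GitFileMetadataExtractor,
    GitSourceFactory,
    GitWorkingCopyProvider,
)
from kodit.source.source_repository import SourceRepository

async def main() -> None:
    session = ...  # placeholder: an async SQLAlchemy session from kodit's database setup
    repository = SourceRepository(session)  # assumed constructor signature
    factory = GitSourceFactory(
        working_copy=GitWorkingCopyProvider(clone_dir=Path.home() / ".kodit" / "clones"),
        metadata_extractor=GitFileMetadataExtractor(),
        author_extractor=GitAuthorExtractor(repository),
        repository=repository,
    )
    source = await factory.create("https://example.com/some-repo.git")  # illustrative URI

asyncio.run(main())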
kodit/source/source_models.py
CHANGED
@@ -8,7 +8,8 @@ folders) and their relationships.
 import datetime
 from enum import Enum as EnumType

-from
+from git import Actor
+from sqlalchemy import Enum, ForeignKey, Integer, String, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column

 from kodit.database import Base, CommonMixin
@@ -60,8 +61,15 @@ class Author(Base, CommonMixin):

     __tablename__ = "authors"

-
-
+    __table_args__ = (UniqueConstraint("name", "email", name="uix_author"),)
+
+    name: Mapped[str] = mapped_column(String(255), index=True)
+    email: Mapped[str] = mapped_column(String(255), index=True)
+
+    @staticmethod
+    def from_actor(actor: Actor) -> "Author":
+        """Create an Author from an Actor."""
+        return Author(name=actor.name, email=actor.email)


 class AuthorFileMapping(Base, CommonMixin):
@@ -69,8 +77,12 @@ class AuthorFileMapping(Base, CommonMixin):

     __tablename__ = "author_file_mappings"

-
-
+    __table_args__ = (
+        UniqueConstraint("author_id", "file_id", name="uix_author_file_mapping"),
+    )
+
+    author_id: Mapped[int] = mapped_column(ForeignKey("authors.id"), index=True)
+    file_id: Mapped[int] = mapped_column(ForeignKey("files.id"), index=True)


 class File(Base, CommonMixin):
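
Author.from_actor bridges GitPython's Actor (the name/email pair that git blame reports) into the ORM model; together with the uix_author unique constraint, upsert_author in GitAuthorExtractor can deduplicate authors on (name, email). A one-line illustration with made-up values:

from git import Actor

author = Author.from_actor(Actor("Ada Lovelace", "ada@example.com"))
assert (author.name, author.email) == ("Ada Lovelace", "ada@example.com")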
|