kodit 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic; see the registry's advisory page for more details.

Files changed (29)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +6 -0
  3. kodit/cli.py +8 -2
  4. kodit/embedding/embedding_factory.py +11 -0
  5. kodit/embedding/embedding_provider/embedding_provider.py +42 -14
  6. kodit/embedding/embedding_provider/hash_embedding_provider.py +16 -7
  7. kodit/embedding/embedding_provider/local_embedding_provider.py +43 -11
  8. kodit/embedding/embedding_provider/openai_embedding_provider.py +18 -22
  9. kodit/embedding/local_vector_search_service.py +46 -13
  10. kodit/embedding/vector_search_service.py +18 -1
  11. kodit/embedding/vectorchord_vector_search_service.py +63 -16
  12. kodit/enrichment/enrichment_factory.py +3 -0
  13. kodit/enrichment/enrichment_provider/enrichment_provider.py +21 -1
  14. kodit/enrichment/enrichment_provider/local_enrichment_provider.py +39 -28
  15. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +25 -27
  16. kodit/enrichment/enrichment_service.py +19 -7
  17. kodit/indexing/indexing_service.py +50 -23
  18. kodit/log.py +126 -24
  19. kodit/migrations/versions/9e53ea8bb3b0_add_authors.py +103 -0
  20. kodit/source/source_factories.py +356 -0
  21. kodit/source/source_models.py +17 -5
  22. kodit/source/source_repository.py +49 -20
  23. kodit/source/source_service.py +41 -218
  24. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/METADATA +2 -2
  25. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/RECORD +28 -27
  26. kodit/migrations/versions/42e836b21102_add_authors.py +0 -64
  27. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/WHEEL +0 -0
  28. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/entry_points.txt +0 -0
  29. {kodit-0.2.2.dist-info → kodit-0.2.4.dist-info}/licenses/LICENSE +0 -0
kodit/log.py CHANGED
@@ -1,20 +1,33 @@
1
1
  """Logging configuration for kodit."""
2
2
 
3
3
  import logging
4
+ import platform
5
+ import re
6
+ import shutil
7
+ import subprocess
4
8
  import sys
5
9
  import uuid
6
10
  from enum import Enum
7
11
  from functools import lru_cache
12
+ from pathlib import Path
8
13
  from typing import Any
9
14
 
15
+ import rudderstack.analytics as rudder_analytics
10
16
  import structlog
11
- from posthog import Posthog
12
17
  from structlog.types import EventDict
13
18
 
19
+ from kodit import _version
14
20
  from kodit.config import AppContext
15
21
 
22
+ _MAC_RE = re.compile(r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}")
23
+
16
24
  log = structlog.get_logger(__name__)
17
25
 
26
+ rudder_analytics.write_key = "2wm1RmV2GnO92NGSs8yYtmSI0mi"
27
+ rudder_analytics.dataPlaneUrl = (
28
+ "https://danbmedefzavzlslreyxjgcjwlf.dataplane.rudderstack.com"
29
+ )
30
+
18
31
 
19
32
  def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict: # noqa: ANN001
20
33
  """Drop the `color_message` key from the event dict."""
@@ -131,35 +144,124 @@ def configure_logging(app_context: AppContext) -> None:
131
144
  sys.excepthook = handle_exception
132
145
 
133
146
 
134
- posthog = Posthog(
135
- project_api_key="phc_JsX0yx8NLPcIxamfp4Zc7xyFykXjwmekKUQz060cSt3",
136
- host="https://eu.i.posthog.com",
137
- )
147
+ def configure_telemetry(app_context: AppContext) -> None:
148
+ """Configure telemetry for the application."""
149
+ if app_context.disable_telemetry:
150
+ structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
151
+ rudder_analytics.send = False
138
152
 
153
+ rudder_analytics.identify(
154
+ anonymous_id=get_stable_mac_str(),
155
+ traits={},
156
+ )
139
157
 
140
- @lru_cache(maxsize=1)
141
- def get_mac_address() -> str:
142
- """Get the MAC address of the primary network interface.
143
158
 
144
- Returns:
145
- str: The MAC address or a fallback UUID if not available
159
+ def log_event(event: str, properties: dict[str, Any] | None = None) -> None:
160
+ """Log an event to Rudderstack."""
161
+ p = properties or {}
162
+ # Set default posthog properties
163
+ p["$app_name"] = "kodit"
164
+ p["$app_version"] = _version.version
165
+ p["$os"] = sys.platform
166
+ p["$os_version"] = sys.version
167
+ rudder_analytics.track(
168
+ anonymous_id=get_stable_mac_str(),
169
+ event=event,
170
+ properties=properties or {},
171
+ )
172
+
146
173
 
174
+ # ----------------------------------------------------------------------
175
+ # Helper functions
176
+ # ----------------------------------------------------------------------
177
+ def _mac_int(mac: str) -> int:
178
+ return int(mac.replace(":", "").replace("-", ""), 16)
179
+
180
+
181
+ def _is_globally_administered(mac_int: int) -> bool:
182
+ first_octet = (mac_int >> 40) & 0xFF
183
+ return not (first_octet & 0b11) # both bits must be 0
184
+
185
+
186
+ def _from_sysfs() -> list[int]:
187
+ base = Path("/sys/class/net")
188
+ if not base.is_dir():
189
+ return []
190
+ macs: list[int] = []
191
+ for iface in base.iterdir():
192
+ try:
193
+ with (base / iface / "address").open() as f:
194
+ content = f.read().strip()
195
+ if _MAC_RE.fullmatch(content):
196
+ macs.append(_mac_int(content))
197
+ except (FileNotFoundError, PermissionError):
198
+ pass
199
+ return macs
200
+
201
+
202
+ def _from_command(cmd: str) -> list[int]:
203
+ try:
204
+ out = subprocess.check_output( # noqa: S602
205
+ cmd,
206
+ shell=True,
207
+ text=True,
208
+ stderr=subprocess.DEVNULL,
209
+ encoding="utf-8",
210
+ )
211
+ except Exception: # noqa: BLE001
212
+ return []
213
+ return [_mac_int(m.group()) for m in _MAC_RE.finditer(out)]
214
+
215
+
216
+ @lru_cache(maxsize=1)
217
+ def get_stable_mac_int() -> int | None:
218
+ """Return a *hardware* MAC as an int, or None if none can be found.
219
+
220
+ Search order:
221
+ 1. /sys/class/net (Linux)
222
+ 2. `ip link show` (Linux), `ifconfig -a` (Linux+macOS)
223
+ 3. `getmac` and `wmic nic` (Windows)
224
+ The first globally-administered, non-multicast address wins.
147
225
  """
148
- # Get the MAC address of the primary network interface
149
- mac = uuid.getnode()
150
- return f"{mac:012x}" if mac != uuid.getnode() else str(uuid.uuid4())
226
+ system = platform.system()
227
+ candidates: list[int] = []
228
+
229
+ if system == "Linux":
230
+ candidates += _from_sysfs()
231
+ if not candidates and shutil.which("ip"):
232
+ candidates += _from_command("ip link show")
233
+ if not candidates: # fall back to ifconfig
234
+ candidates += _from_command("ifconfig -a")
235
+
236
+ elif system == "Darwin": # macOS
237
+ candidates += _from_command("ifconfig -a")
238
+
239
+ elif system == "Windows":
240
+ # getmac is present on every supported Windows version
241
+ candidates += _from_command("getmac /v /fo list")
242
+ # wmic still exists through at least Win 11
243
+ candidates += _from_command(
244
+ 'wmic nic where "MACAddress is not null" get MACAddress /format:list'
245
+ )
151
246
 
247
+ # Prefer globally administered, non-multicast addresses
248
+ for mac in candidates:
249
+ if _is_globally_administered(mac):
250
+ return mac
152
251
 
153
- def configure_telemetry(app_context: AppContext) -> None:
154
- """Configure telemetry for the application."""
155
- if app_context.disable_telemetry:
156
- structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
157
- posthog.disabled = True
252
+ # If all we saw were locally-administered MACs, just return the first one
253
+ if candidates:
254
+ return candidates[0]
158
255
 
256
+ return None
159
257
 
160
- def log_event(event: str, properties: dict[str, Any] | None = None) -> None:
161
- """Log an event to PostHog."""
162
- log.debug(
163
- "Logging event", id=get_mac_address(), ph_event=event, ph_properties=properties
164
- )
165
- posthog.capture(get_mac_address(), event, properties or {})
258
+
259
+ def get_stable_mac_str() -> str:
260
+ """Return a *stable* 12-digit hex string (lower-case, no separators).
261
+
262
+ Falls back to uuid.getnode() if necessary, so it never raises.
263
+ """
264
+ mac_int = get_stable_mac_int()
265
+ if mac_int is None:
266
+ mac_int = uuid.getnode() # may still be random in VMs
267
+ return f"{mac_int:012x}"
@@ -0,0 +1,103 @@
1
+ # ruff: noqa
2
+ """add authors
3
+
4
+ Revision ID: 9e53ea8bb3b0
5
+ Revises: c3f5137d30f5
6
+ Create Date: 2025-06-14 10:50:36.058114
7
+
8
+ """
9
+
10
+ from typing import Sequence, Union
11
+
12
+ from alembic import op
13
+ import sqlalchemy as sa
14
+
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = "9e53ea8bb3b0"
18
+ down_revision: Union[str, None] = "c3f5137d30f5"
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade() -> None:
24
+ """Upgrade schema."""
25
+ # Define the enum type separately so we can explicitly create it when needed
26
+ source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
27
+
28
+ # Explicitly create the enum type for PostgreSQL (no-op on SQLite)
29
+ source_type.create(op.get_bind(), checkfirst=True)
30
+ # ### commands auto generated by Alembic - please adjust! ###
31
+ op.create_table(
32
+ "authors",
33
+ sa.Column("name", sa.String(length=255), nullable=False),
34
+ sa.Column("email", sa.String(length=255), nullable=False),
35
+ sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
36
+ sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
37
+ sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
38
+ sa.PrimaryKeyConstraint("id"),
39
+ sa.UniqueConstraint("name", "email", name="uix_author"),
40
+ )
41
+ op.create_index(op.f("ix_authors_email"), "authors", ["email"], unique=False)
42
+ op.create_index(op.f("ix_authors_name"), "authors", ["name"], unique=False)
43
+ op.create_table(
44
+ "author_file_mappings",
45
+ sa.Column("author_id", sa.Integer(), nullable=False),
46
+ sa.Column("file_id", sa.Integer(), nullable=False),
47
+ sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
48
+ sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
49
+ sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
50
+ sa.ForeignKeyConstraint(
51
+ ["author_id"],
52
+ ["authors.id"],
53
+ ),
54
+ sa.ForeignKeyConstraint(
55
+ ["file_id"],
56
+ ["files.id"],
57
+ ),
58
+ sa.PrimaryKeyConstraint("id"),
59
+ sa.UniqueConstraint("author_id", "file_id", name="uix_author_file_mapping"),
60
+ )
61
+ op.create_index(
62
+ op.f("ix_author_file_mappings_author_id"),
63
+ "author_file_mappings",
64
+ ["author_id"],
65
+ unique=False,
66
+ )
67
+ op.create_index(
68
+ op.f("ix_author_file_mappings_file_id"),
69
+ "author_file_mappings",
70
+ ["file_id"],
71
+ unique=False,
72
+ )
73
+ op.add_column(
74
+ "files", sa.Column("extension", sa.String(length=255), nullable=False)
75
+ )
76
+ op.create_index(op.f("ix_files_extension"), "files", ["extension"], unique=False)
77
+ op.add_column("sources", sa.Column("type", source_type, nullable=False))
78
+ op.create_index(op.f("ix_sources_type"), "sources", ["type"], unique=False)
79
+ # ### end Alembic commands ###
80
+
81
+
82
+ def downgrade() -> None:
83
+ """Downgrade schema."""
84
+ # ### commands auto generated by Alembic - please adjust! ###
85
+ op.drop_index(op.f("ix_sources_type"), table_name="sources")
86
+ op.drop_column("sources", "type")
87
+ op.drop_index(op.f("ix_files_extension"), table_name="files")
88
+ op.drop_column("files", "extension")
89
+ op.drop_index(
90
+ op.f("ix_author_file_mappings_file_id"), table_name="author_file_mappings"
91
+ )
92
+ op.drop_index(
93
+ op.f("ix_author_file_mappings_author_id"), table_name="author_file_mappings"
94
+ )
95
+ op.drop_table("author_file_mappings")
96
+ op.drop_index(op.f("ix_authors_name"), table_name="authors")
97
+ op.drop_index(op.f("ix_authors_email"), table_name="authors")
98
+ op.drop_table("authors")
99
+
100
+ # Explicitly drop the enum type (PostgreSQL)
101
+ source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
102
+ source_type.drop(op.get_bind(), checkfirst=True)
103
+ # ### end Alembic commands ###
@@ -0,0 +1,356 @@
1
+ """Source factories for creating different types of sources.
2
+
3
+ This module provides factory classes for creating sources, improving cohesion by
4
+ separating the concerns of different source types.
5
+ """
6
+
7
+ import mimetypes
8
+ import shutil
9
+ import tempfile
10
+ from abc import ABC, abstractmethod
11
+ from datetime import UTC, datetime
12
+ from hashlib import sha256
13
+ from pathlib import Path
14
+ from typing import Protocol
15
+
16
+ import aiofiles
17
+ import git
18
+ import structlog
19
+ from tqdm import tqdm
20
+
21
+ from kodit.source.ignore import IgnorePatterns
22
+ from kodit.source.source_models import (
23
+ Author,
24
+ AuthorFileMapping,
25
+ File,
26
+ Source,
27
+ SourceType,
28
+ )
29
+ from kodit.source.source_repository import SourceRepository
30
+
31
+
32
+ class WorkingCopyProvider(Protocol):
33
+ """Protocol for providing working copies of sources."""
34
+
35
+ async def prepare(self, uri: str) -> Path:
36
+ """Prepare a working copy and return its path."""
37
+ ...
38
+
39
+
40
+ class FileMetadataExtractor(Protocol):
41
+ """Protocol for extracting file metadata."""
42
+
43
+ async def extract(self, path: Path, source: Source) -> File:
44
+ """Extract metadata from a file."""
45
+ ...
46
+
47
+
48
+ class AuthorExtractor(Protocol):
49
+ """Protocol for extracting author information."""
50
+
51
+ async def extract(self, path: Path, source: Source) -> list[Author]:
52
+ """Extract authors for a file."""
53
+ ...
54
+
55
+
56
+ class SourceFactory(ABC):
57
+ """Abstract base class for source factories."""
58
+
59
+ def __init__(
60
+ self,
61
+ working_copy: WorkingCopyProvider,
62
+ metadata_extractor: FileMetadataExtractor,
63
+ author_extractor: AuthorExtractor,
64
+ repository: SourceRepository,
65
+ ) -> None:
66
+ """Initialize the source factory."""
67
+ self.working_copy = working_copy
68
+ self.metadata_extractor = metadata_extractor
69
+ self.author_extractor = author_extractor
70
+ self.repository = repository
71
+ self.log = structlog.get_logger(__name__)
72
+
73
+ @abstractmethod
74
+ async def create(self, uri: str) -> Source:
75
+ """Create a source from a URI."""
76
+ ...
77
+
78
+ async def _process_files(self, source: Source, files: list[Path]) -> None:
79
+ """Process files for a source."""
80
+ for path in tqdm(files, total=len(files), leave=False):
81
+ if not path.is_file():
82
+ continue
83
+
84
+ # Extract file metadata
85
+ file_record = await self.metadata_extractor.extract(path, source)
86
+ await self.repository.create_file(file_record)
87
+
88
+ # Extract authors
89
+ authors = await self.author_extractor.extract(path, source)
90
+ for author in authors:
91
+ await self.repository.upsert_author_file_mapping(
92
+ AuthorFileMapping(
93
+ author_id=author.id,
94
+ file_id=file_record.id,
95
+ )
96
+ )
97
+
98
+
99
+ class GitSourceFactory(SourceFactory):
100
+ """Factory for creating Git sources."""
101
+
102
+ async def create(self, uri: str) -> Source:
103
+ """Create a git source from a URI."""
104
+ # Normalize the URI
105
+ self.log.debug("Normalising git uri", uri=uri)
106
+ with tempfile.TemporaryDirectory() as temp_dir:
107
+ git.Repo.clone_from(uri, temp_dir)
108
+ remote = git.Repo(temp_dir).remote()
109
+ uri = remote.url
110
+
111
+ # Check if source already exists
112
+ self.log.debug("Checking if source already exists", uri=uri)
113
+ source = await self.repository.get_source_by_uri(uri)
114
+
115
+ if source:
116
+ self.log.info("Source already exists, reusing...", source_id=source.id)
117
+ return source
118
+
119
+ # Prepare working copy
120
+ clone_path = await self.working_copy.prepare(uri)
121
+
122
+ # Create source record
123
+ self.log.debug("Creating source", uri=uri, clone_path=str(clone_path))
124
+ source = await self.repository.create_source(
125
+ Source(
126
+ uri=uri,
127
+ cloned_path=str(clone_path),
128
+ source_type=SourceType.GIT,
129
+ )
130
+ )
131
+
132
+ # Get files to process using ignore patterns
133
+ ignore_patterns = IgnorePatterns(clone_path)
134
+ files = [
135
+ f
136
+ for f in clone_path.rglob("*")
137
+ if f.is_file() and not ignore_patterns.should_ignore(f)
138
+ ]
139
+
140
+ # Process files
141
+ self.log.info("Inspecting files", source_id=source.id, num_files=len(files))
142
+ await self._process_files(source, files)
143
+
144
+ return source
145
+
146
+
147
+ class FolderSourceFactory(SourceFactory):
148
+ """Factory for creating folder sources."""
149
+
150
+ async def create(self, uri: str) -> Source:
151
+ """Create a folder source from a path."""
152
+ directory = Path(uri).expanduser().resolve()
153
+
154
+ # Check if source already exists
155
+ source = await self.repository.get_source_by_uri(directory.as_uri())
156
+ if source:
157
+ self.log.info("Source already exists, reusing...", source_id=source.id)
158
+ return source
159
+
160
+ # Validate directory exists
161
+ if not directory.exists():
162
+ msg = f"Folder does not exist: {directory}"
163
+ raise ValueError(msg)
164
+
165
+ # Prepare working copy
166
+ clone_path = await self.working_copy.prepare(directory.as_uri())
167
+
168
+ # Create source record
169
+ source = await self.repository.create_source(
170
+ Source(
171
+ uri=directory.as_uri(),
172
+ cloned_path=str(clone_path),
173
+ source_type=SourceType.FOLDER,
174
+ )
175
+ )
176
+
177
+ # Get all files to process
178
+ files = [f for f in clone_path.rglob("*") if f.is_file()]
179
+
180
+ # Process files
181
+ await self._process_files(source, files)
182
+
183
+ return source
184
+
185
+
186
+ class GitWorkingCopyProvider:
187
+ """Working copy provider for Git repositories."""
188
+
189
+ def __init__(self, clone_dir: Path) -> None:
190
+ """Initialize the provider."""
191
+ self.clone_dir = clone_dir
192
+ self.log = structlog.get_logger(__name__)
193
+
194
+ async def prepare(self, uri: str) -> Path:
195
+ """Prepare a Git working copy."""
196
+ # Create a unique directory name for the clone
197
+ clone_path = self.clone_dir / uri.replace("/", "_").replace(":", "_")
198
+ clone_path.mkdir(parents=True, exist_ok=True)
199
+
200
+ try:
201
+ self.log.info("Cloning repository", uri=uri, clone_path=str(clone_path))
202
+ git.Repo.clone_from(uri, clone_path)
203
+ except git.GitCommandError as e:
204
+ if "already exists and is not an empty directory" not in str(e):
205
+ msg = f"Failed to clone repository: {e}"
206
+ raise ValueError(msg) from e
207
+ self.log.info("Repository already exists, reusing...", uri=uri)
208
+
209
+ return clone_path
210
+
211
+
212
+ class FolderWorkingCopyProvider:
213
+ """Working copy provider for local folders."""
214
+
215
+ def __init__(self, clone_dir: Path) -> None:
216
+ """Initialize the provider."""
217
+ self.clone_dir = clone_dir
218
+
219
+ async def prepare(self, uri: str) -> Path:
220
+ """Prepare a folder working copy."""
221
+ # Handle file:// URIs
222
+ if uri.startswith("file://"):
223
+ from urllib.parse import urlparse
224
+
225
+ parsed = urlparse(uri)
226
+ directory = Path(parsed.path).expanduser().resolve()
227
+ else:
228
+ directory = Path(uri).expanduser().resolve()
229
+
230
+ # Clone into a local directory
231
+ clone_path = self.clone_dir / directory.as_posix().replace("/", "_")
232
+ clone_path.mkdir(parents=True, exist_ok=True)
233
+
234
+ # Copy all files recursively, preserving directory structure, ignoring
235
+ # hidden files
236
+ shutil.copytree(
237
+ directory,
238
+ clone_path,
239
+ ignore=shutil.ignore_patterns(".*"),
240
+ dirs_exist_ok=True,
241
+ )
242
+
243
+ return clone_path
244
+
245
+
246
+ class BaseFileMetadataExtractor:
247
+ """Base class for file metadata extraction with common functionality."""
248
+
249
+ async def extract(self, path: Path, source: Source) -> File:
250
+ """Extract metadata from a file."""
251
+ # Get timestamps - to be implemented by subclasses
252
+ created_at, updated_at = await self._get_timestamps(path, source)
253
+
254
+ # Read file content and calculate metadata
255
+ async with aiofiles.open(path, "rb") as f:
256
+ content = await f.read()
257
+ mime_type = mimetypes.guess_type(path)
258
+ sha = sha256(content).hexdigest()
259
+
260
+ return File(
261
+ created_at=created_at,
262
+ updated_at=updated_at,
263
+ source_id=source.id,
264
+ cloned_path=str(path),
265
+ mime_type=mime_type[0]
266
+ if mime_type and mime_type[0]
267
+ else "application/octet-stream",
268
+ uri=path.as_uri(),
269
+ sha256=sha,
270
+ size_bytes=len(content),
271
+ )
272
+
273
+ async def _get_timestamps(
274
+ self, path: Path, source: Source
275
+ ) -> tuple[datetime, datetime]:
276
+ """Get creation and modification timestamps. To be implemented by subclasses."""
277
+ raise NotImplementedError
278
+
279
+
280
+ class GitFileMetadataExtractor(BaseFileMetadataExtractor):
281
+ """Git-specific implementation for extracting file metadata."""
282
+
283
+ async def _get_timestamps(
284
+ self, path: Path, source: Source
285
+ ) -> tuple[datetime, datetime]:
286
+ """Get timestamps from Git history."""
287
+ git_repo = git.Repo(source.cloned_path)
288
+ commits = list(git_repo.iter_commits(paths=str(path), all=True))
289
+
290
+ if commits:
291
+ last_modified_at = commits[0].committed_datetime
292
+ first_modified_at = commits[-1].committed_datetime
293
+ return first_modified_at, last_modified_at
294
+ # Fallback to current time if no commits found
295
+ now = datetime.now(UTC)
296
+ return now, now
297
+
298
+
299
+ class FolderFileMetadataExtractor(BaseFileMetadataExtractor):
300
+ """Folder-specific implementation for extracting file metadata."""
301
+
302
+ async def _get_timestamps(
303
+ self,
304
+ path: Path,
305
+ source: Source, # noqa: ARG002
306
+ ) -> tuple[datetime, datetime]:
307
+ """Get timestamps from file system."""
308
+ stat = path.stat()
309
+ file_created_at = datetime.fromtimestamp(stat.st_ctime, UTC)
310
+ file_modified_at = datetime.fromtimestamp(stat.st_mtime, UTC)
311
+ return file_created_at, file_modified_at
312
+
313
+
314
+ class GitAuthorExtractor:
315
+ """Author extractor for Git repositories."""
316
+
317
+ def __init__(self, repository: SourceRepository) -> None:
318
+ """Initialize the extractor."""
319
+ self.repository = repository
320
+
321
+ async def extract(self, path: Path, source: Source) -> list[Author]:
322
+ """Extract authors from a Git file."""
323
+ authors: list[Author] = []
324
+ git_repo = git.Repo(source.cloned_path)
325
+
326
+ try:
327
+ # Get the file's blame
328
+ blames = git_repo.blame("HEAD", str(path))
329
+
330
+ # Extract the blame's authors
331
+ actors = [
332
+ commit.author
333
+ for blame in blames or []
334
+ for commit in blame
335
+ if isinstance(commit, git.Commit)
336
+ ]
337
+
338
+ # Get or create the authors in the database
339
+ for actor in actors:
340
+ if actor.email:
341
+ author = Author.from_actor(actor)
342
+ author = await self.repository.upsert_author(author)
343
+ authors.append(author)
344
+ except git.GitCommandError:
345
+ # Handle cases where file might not be tracked
346
+ pass
347
+
348
+ return authors
349
+
350
+
351
+ class NoOpAuthorExtractor:
352
+ """No-op author extractor for sources that don't have author information."""
353
+
354
+ async def extract(self, path: Path, source: Source) -> list[Author]: # noqa: ARG002
355
+ """Return empty list of authors."""
356
+ return []
@@ -8,7 +8,8 @@ folders) and their relationships.
8
8
  import datetime
9
9
  from enum import Enum as EnumType
10
10
 
11
- from sqlalchemy import Enum, ForeignKey, Integer, String
11
+ from git import Actor
12
+ from sqlalchemy import Enum, ForeignKey, Integer, String, UniqueConstraint
12
13
  from sqlalchemy.orm import Mapped, mapped_column
13
14
 
14
15
  from kodit.database import Base, CommonMixin
@@ -60,8 +61,15 @@ class Author(Base, CommonMixin):
60
61
 
61
62
  __tablename__ = "authors"
62
63
 
63
- name: Mapped[str] = mapped_column(String(255), index=True, unique=True)
64
- email: Mapped[str] = mapped_column(String(255), index=True, unique=True)
64
+ __table_args__ = (UniqueConstraint("name", "email", name="uix_author"),)
65
+
66
+ name: Mapped[str] = mapped_column(String(255), index=True)
67
+ email: Mapped[str] = mapped_column(String(255), index=True)
68
+
69
+ @staticmethod
70
+ def from_actor(actor: Actor) -> "Author":
71
+ """Create an Author from an Actor."""
72
+ return Author(name=actor.name, email=actor.email)
65
73
 
66
74
 
67
75
  class AuthorFileMapping(Base, CommonMixin):
@@ -69,8 +77,12 @@ class AuthorFileMapping(Base, CommonMixin):
69
77
 
70
78
  __tablename__ = "author_file_mappings"
71
79
 
72
- author_id: Mapped[int] = mapped_column(ForeignKey("authors.id"))
73
- file_id: Mapped[int] = mapped_column(ForeignKey("files.id"))
80
+ __table_args__ = (
81
+ UniqueConstraint("author_id", "file_id", name="uix_author_file_mapping"),
82
+ )
83
+
84
+ author_id: Mapped[int] = mapped_column(ForeignKey("authors.id"), index=True)
85
+ file_id: Mapped[int] = mapped_column(ForeignKey("files.id"), index=True)
74
86
 
75
87
 
76
88
  class File(Base, CommonMixin):