kodit 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/app.py +6 -0
- kodit/bm25/local_bm25.py +8 -0
- kodit/bm25/vectorchord_bm25.py +4 -1
- kodit/cli.py +8 -2
- kodit/config.py +14 -24
- kodit/embedding/embedding_factory.py +25 -6
- kodit/embedding/embedding_provider/embedding_provider.py +2 -2
- kodit/embedding/embedding_provider/openai_embedding_provider.py +3 -1
- kodit/embedding/local_vector_search_service.py +4 -0
- kodit/embedding/vectorchord_vector_search_service.py +10 -2
- kodit/enrichment/enrichment_factory.py +26 -7
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +4 -0
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +5 -1
- kodit/indexing/indexing_service.py +28 -3
- kodit/log.py +126 -24
- kodit/migrations/versions/9e53ea8bb3b0_add_authors.py +103 -0
- kodit/source/git.py +16 -0
- kodit/source/ignore.py +53 -0
- kodit/source/source_factories.py +356 -0
- kodit/source/source_models.py +52 -2
- kodit/source/source_repository.py +80 -16
- kodit/source/source_service.py +45 -155
- {kodit-0.2.1.dist-info → kodit-0.2.3.dist-info}/METADATA +4 -2
- {kodit-0.2.1.dist-info → kodit-0.2.3.dist-info}/RECORD +28 -24
- {kodit-0.2.1.dist-info → kodit-0.2.3.dist-info}/WHEEL +0 -0
- {kodit-0.2.1.dist-info → kodit-0.2.3.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.1.dist-info → kodit-0.2.3.dist-info}/licenses/LICENSE +0 -0
kodit/log.py
CHANGED
|
@@ -1,20 +1,33 @@
|
|
|
1
1
|
"""Logging configuration for kodit."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import platform
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
4
8
|
import sys
|
|
5
9
|
import uuid
|
|
6
10
|
from enum import Enum
|
|
7
11
|
from functools import lru_cache
|
|
12
|
+
from pathlib import Path
|
|
8
13
|
from typing import Any
|
|
9
14
|
|
|
15
|
+
import rudderstack.analytics as rudder_analytics
|
|
10
16
|
import structlog
|
|
11
|
-
from posthog import Posthog
|
|
12
17
|
from structlog.types import EventDict
|
|
13
18
|
|
|
19
|
+
from kodit import _version
|
|
14
20
|
from kodit.config import AppContext
|
|
15
21
|
|
|
22
|
+
# Matches a MAC address written as six two-digit hex octets separated by ":" or
# "-" (the pattern accepts either separator at each position).
_MAC_RE = re.compile(r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}")

# Module-level structlog logger.
log = structlog.get_logger(__name__)

# Rudderstack client settings used for telemetry: the write key and the
# data-plane endpoint are assigned once, at import time.
rudder_analytics.write_key = "2wm1RmV2GnO92NGSs8yYtmSI0mi"
rudder_analytics.dataPlaneUrl = (
    "https://danbmedefzavzlslreyxjgcjwlf.dataplane.rudderstack.com"
)
|
|
30
|
+
|
|
18
31
|
|
|
19
32
|
def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict: # noqa: ANN001
|
|
20
33
|
"""Drop the `color_message` key from the event dict."""
|
|
@@ -131,35 +144,124 @@ def configure_logging(app_context: AppContext) -> None:
|
|
|
131
144
|
sys.excepthook = handle_exception
|
|
132
145
|
|
|
133
146
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
)
|
|
147
|
+
def configure_telemetry(app_context: AppContext) -> None:
    """Configure telemetry for the application.

    When ``app_context.disable_telemetry`` is set, sending is switched off on
    the Rudderstack client and the function returns immediately. Otherwise the
    current machine is identified to Rudderstack under the anonymous id
    returned by ``get_stable_mac_str()``.

    Args:
        app_context: Application context; only ``disable_telemetry`` is read.

    """
    if app_context.disable_telemetry:
        structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
        rudder_analytics.send = False
        # Opted out: do not fall through to identify(). That call would still
        # resolve the MAC address (possibly spawning subprocesses) only to
        # build a message that must never be sent.
        return

    rudder_analytics.identify(
        anonymous_id=get_stable_mac_str(),
        traits={},
    )
|
|
139
157
|
|
|
140
|
-
@lru_cache(maxsize=1)
|
|
141
|
-
def get_mac_address() -> str:
|
|
142
|
-
"""Get the MAC address of the primary network interface.
|
|
143
158
|
|
|
144
|
-
|
|
145
|
-
|
|
159
|
+
def log_event(event: str, properties: dict[str, Any] | None = None) -> None:
    """Log an event to Rudderstack.

    The default ``$app_name``, ``$app_version``, ``$os`` and ``$os_version``
    properties are always attached to the event. The caller's ``properties``
    dict is copied, never mutated.

    Args:
        event: Name of the event to track.
        properties: Optional extra properties to send with the event.

    """
    # Work on a copy: the defaults must reach track() even when no properties
    # were supplied, and must not leak back into the caller's dict.
    payload: dict[str, Any] = dict(properties or {})

    # Default properties attached to every event.
    payload["$app_name"] = "kodit"
    payload["$app_version"] = _version.version
    payload["$os"] = sys.platform
    # NOTE(review): sys.version is the Python interpreter version string, not
    # the operating-system version - confirm this is what is wanted here.
    payload["$os_version"] = sys.version

    rudder_analytics.track(
        anonymous_id=get_stable_mac_str(),
        event=event,
        properties=payload,
    )
|
|
172
|
+
|
|
146
173
|
|
|
174
|
+
# ----------------------------------------------------------------------
|
|
175
|
+
# Helper functions
|
|
176
|
+
# ----------------------------------------------------------------------
|
|
177
|
+
def _mac_int(mac: str) -> int:
    """Convert a ``:``- or ``-``-separated hex MAC string to its integer value."""
    # Normalise both separator styles to ":" and glue the hex groups together.
    hex_digits = "".join(mac.replace("-", ":").split(":"))
    return int(hex_digits, 16)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _is_globally_administered(mac_int: int) -> bool:
    """Return True when both low bits of the MAC's first octet are clear.

    Those two bits are the multicast flag and the locally-administered flag,
    so a True result means a globally administered, non-multicast address.
    """
    leading_octet = (mac_int >> 40) & 0xFF
    flag_bits = leading_octet & 0b11
    return flag_bits == 0
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _from_sysfs() -> list[int]:
    """Collect interface MAC addresses from ``/sys/class/net``.

    Returns:
        The MACs (as ints) read from each entry's ``address`` file, in sorted
        entry order. Empty when the directory is missing or unreadable.
        Entries whose ``address`` file cannot be read, does not hold a MAC,
        or holds the all-zero address are skipped.

    """
    base = Path("/sys/class/net")
    if not base.is_dir():
        return []

    try:
        # Sort so the candidate order (and hence the chosen MAC) does not
        # depend on the arbitrary order in which the directory is listed.
        ifaces = sorted(base.iterdir())
    except OSError:
        return []

    macs: list[int] = []
    for iface in ifaces:
        try:
            # iterdir() already yields full paths; no need to re-join with base.
            content = (iface / "address").read_text().strip()
        except (OSError, UnicodeDecodeError):
            # Best effort: plain files in this directory (NotADirectoryError),
            # vanished or unreadable interfaces, and devices whose address
            # cannot be read must not abort the scan.
            continue
        if not _MAC_RE.fullmatch(content):
            continue
        mac = _mac_int(content)
        if mac:  # 00:00:00:00:00:00 (e.g. loopback) identifies nothing
            macs.append(mac)
    return macs
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _from_command(cmd: str, timeout: float = 5.0) -> list[int]:
    """Run a shell command and return every MAC address found in its output.

    Args:
        cmd: Command line to run through the shell. Callers pass fixed,
            hard-coded strings; never pass untrusted input because the
            command is executed with ``shell=True``.
        timeout: Seconds to wait for the command before giving up.

    Returns:
        The MACs (as ints) in order of appearance on stdout, or an empty list
        if the command fails, times out, or cannot be started.

    """
    try:
        out = subprocess.check_output(  # noqa: S602
            cmd,
            shell=True,
            text=True,
            stderr=subprocess.DEVNULL,
            encoding="utf-8",
            # Console tools may emit a non-UTF-8 code page; the MACs themselves
            # are ASCII, so replace undecodable bytes instead of losing all
            # of the output to a decode error.
            errors="replace",
            # A hung tool must not block application start-up.
            timeout=timeout,
        )
    except Exception:  # noqa: BLE001
        # Deliberately best effort: any failure just means "no MACs here".
        return []
    return [_mac_int(m.group()) for m in _MAC_RE.finditer(out)]
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@lru_cache(maxsize=1)
def get_stable_mac_int() -> int | None:
    """Return a *hardware* MAC as an int, or None if none can be found.

    Search order:
    1. /sys/class/net (Linux)
    2. `ip link show` (Linux), `ifconfig -a` (Linux+macOS)
    3. `getmac` and `wmic nic` (Windows)
    The first globally-administered, non-multicast address wins. The all-zero
    and all-ones (broadcast) addresses are never considered. If only
    locally-administered or multicast addresses were seen, the first of those
    is returned. The result is cached for the lifetime of the process.
    """
    # Placeholder addresses that show up in tool output (loopback link
    # address, "brd ff:ff:..." fields) but do not identify a machine.
    placeholders = (0x000000000000, 0xFFFFFFFFFFFF)

    def usable(macs: list[int]) -> list[int]:
        return [mac for mac in macs if mac not in placeholders]

    system = platform.system()
    candidates: list[int] = []

    if system == "Linux":
        candidates += usable(_from_sysfs())
        if not candidates and shutil.which("ip"):
            candidates += usable(_from_command("ip link show"))
        if not candidates:  # fall back to ifconfig
            candidates += usable(_from_command("ifconfig -a"))

    elif system == "Darwin":  # macOS
        candidates += usable(_from_command("ifconfig -a"))

    elif system == "Windows":
        # getmac is present on every supported Windows version
        candidates += usable(_from_command("getmac /v /fo list"))
        # wmic still exists through at least Win 11
        candidates += usable(
            _from_command(
                'wmic nic where "MACAddress is not null" get MACAddress /format:list'
            )
        )

    # Prefer globally administered, non-multicast addresses
    for mac in candidates:
        if _is_globally_administered(mac):
            return mac

    # If all we saw were locally-administered MACs, just return the first one
    if candidates:
        return candidates[0]

    return None
|
|
159
257
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
)
|
|
165
|
-
|
|
258
|
+
|
|
259
|
+
def get_stable_mac_str() -> str:
    """Return the machine's MAC as a 12-digit lower-case hex string.

    The string has no separators. When ``get_stable_mac_int()`` finds nothing,
    ``uuid.getnode()`` supplies the value instead, so this never raises.
    """
    node = get_stable_mac_int()
    # uuid.getnode() may still be random in VMs
    chosen = uuid.getnode() if node is None else node
    return format(chosen, "012x")
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# ruff: noqa
|
|
2
|
+
"""add authors
|
|
3
|
+
|
|
4
|
+
Revision ID: 9e53ea8bb3b0
|
|
5
|
+
Revises: c3f5137d30f5
|
|
6
|
+
Create Date: 2025-06-14 10:50:36.058114
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Sequence, Union
|
|
11
|
+
|
|
12
|
+
from alembic import op
|
|
13
|
+
import sqlalchemy as sa
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = "9e53ea8bb3b0"
|
|
18
|
+
down_revision: Union[str, None] = "c3f5137d30f5"
|
|
19
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
20
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade() -> None:
    """Upgrade schema.

    Creates the ``authors`` and ``author_file_mappings`` tables with their
    indexes, and adds the indexed ``files.extension`` and ``sources.type``
    columns. Both new columns are NOT NULL, so each gets a server default
    that back-fills rows which already exist.
    """
    # Define the enum type separately so we can explicitly create it when needed
    source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")

    # Explicitly create the enum type for PostgreSQL (no-op on SQLite)
    source_type.create(op.get_bind(), checkfirst=True)
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "authors",
        sa.Column("name", sa.String(length=255), nullable=False),
        sa.Column("email", sa.String(length=255), nullable=False),
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("name", "email", name="uix_author"),
    )
    op.create_index(op.f("ix_authors_email"), "authors", ["email"], unique=False)
    op.create_index(op.f("ix_authors_name"), "authors", ["name"], unique=False)
    op.create_table(
        "author_file_mappings",
        sa.Column("author_id", sa.Integer(), nullable=False),
        sa.Column("file_id", sa.Integer(), nullable=False),
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
        sa.ForeignKeyConstraint(
            ["author_id"],
            ["authors.id"],
        ),
        sa.ForeignKeyConstraint(
            ["file_id"],
            ["files.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("author_id", "file_id", name="uix_author_file_mapping"),
    )
    op.create_index(
        op.f("ix_author_file_mappings_author_id"),
        "author_file_mappings",
        ["author_id"],
        unique=False,
    )
    op.create_index(
        op.f("ix_author_file_mappings_file_id"),
        "author_file_mappings",
        ["file_id"],
        unique=False,
    )
    # Adding a NOT NULL column to a table that already holds rows fails unless
    # the database has a default to fill those rows with.
    op.add_column(
        "files",
        sa.Column(
            "extension",
            sa.String(length=255),
            nullable=False,
            server_default="",
        ),
    )
    op.create_index(op.f("ix_files_extension"), "files", ["extension"], unique=False)
    # Same reasoning: existing sources are back-filled with the UNKNOWN member.
    op.add_column(
        "sources",
        sa.Column("type", source_type, nullable=False, server_default="UNKNOWN"),
    )
    op.create_index(op.f("ix_sources_type"), "sources", ["type"], unique=False)
    # ### end Alembic commands ###
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def downgrade() -> None:
    """Downgrade schema.

    Drops the ``sources.type`` and ``files.extension`` columns, then the
    ``author_file_mappings`` and ``authors`` tables, and finally the
    ``sourcetype`` enum type. Each index is dropped before the column or
    table it belongs to.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Columns added to pre-existing tables: index first, then the column.
    op.drop_index(op.f("ix_sources_type"), table_name="sources")
    op.drop_column("sources", "type")
    op.drop_index(op.f("ix_files_extension"), table_name="files")
    op.drop_column("files", "extension")
    # The mapping table is dropped before "authors", which it references
    # through a foreign key.
    op.drop_index(
        op.f("ix_author_file_mappings_file_id"), table_name="author_file_mappings"
    )
    op.drop_index(
        op.f("ix_author_file_mappings_author_id"), table_name="author_file_mappings"
    )
    op.drop_table("author_file_mappings")
    op.drop_index(op.f("ix_authors_name"), table_name="authors")
    op.drop_index(op.f("ix_authors_email"), table_name="authors")
    op.drop_table("authors")

    # Explicitly drop the enum type (PostgreSQL)
    # This runs after "sources.type", the column that uses it, has been dropped.
    source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
    source_type.drop(op.get_bind(), checkfirst=True)
    # ### end Alembic commands ###
|
kodit/source/git.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Git utilities."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
|
|
5
|
+
import git
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_valid_clone_target(target: str) -> bool:
    """Report whether ``target`` can be cloned with git.

    The check performs a real clone into a throw-away temporary directory and
    returns False when that clone raises ``git.GitCommandError``.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        try:
            git.Repo.clone_from(target, scratch_dir)
        except git.GitCommandError:
            clonable = False
        else:
            clonable = True
        return clonable
|
kodit/source/ignore.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Ignore patterns."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import git
|
|
6
|
+
import pathspec
|
|
7
|
+
|
|
8
|
+
from kodit.source.git import is_valid_clone_target
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class IgnorePatterns:
    """Decide which files under a base directory should be ignored.

    A file is ignored when it lives inside the top-level ``.git`` directory,
    when git reports it as ignored (only if the base directory is a clonable
    git repository), or when it matches a pattern in the base directory's
    ``.noindex`` file.
    """

    def __init__(self, base_dir: Path) -> None:
        """Initialize the ignore patterns.

        Args:
            base_dir: Root directory that checked paths are relative to.

        Raises:
            ValueError: If ``base_dir`` is not a directory.

        """
        if not base_dir.is_dir():
            msg = f"Base directory is not a directory: {base_dir}"
            raise ValueError(msg)

        self.base_dir = base_dir

        # Check if the base_dir is a valid git repository
        self.git_repo = None
        if is_valid_clone_target(str(base_dir)):
            self.git_repo = git.Repo(base_dir)

    def should_ignore(self, path: Path) -> bool:
        """Check if a path should be ignored.

        Args:
            path: Path to check; it must be located under ``base_dir``.

        Returns:
            True if the file should be ignored. Directories are never ignored.

        Raises:
            ValueError: If ``path`` is not located under ``base_dir``.

        """
        if path.is_dir():
            return False

        # Get the path relative to the base_dir
        relative_path = path.relative_to(self.base_dir)

        # If this file is _part_ of the .git directory, then it should be
        # ignored. Compare the first path component rather than a string
        # prefix, so that ".gitignore", ".gitattributes" and ".github/..." are
        # not mistaken for git internals.
        if relative_path.parts and relative_path.parts[0] == ".git":
            return True

        # If it is a git repository, then we need to check if the file is ignored
        if self.git_repo and len(self.git_repo.ignored(path)) > 0:
            return True

        # If the repo has a .noindex file, apply its gitignore-style patterns
        noindex_path = self.base_dir / ".noindex"
        if noindex_path.exists():
            with noindex_path.open() as f:
                patterns = [line.strip() for line in f if line.strip()]
            if patterns:
                spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)
                if spec.match_file(relative_path.as_posix()):
                    return True

        return False
|