kodit 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic; see the registry's advisory page for more details.

kodit/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.2.2'
21
- __version_tuple__ = version_tuple = (0, 2, 2)
20
+ __version__ = version = '0.2.3'
21
+ __version_tuple__ = version_tuple = (0, 2, 3)
kodit/app.py CHANGED
@@ -21,6 +21,12 @@ async def root() -> dict[str, str]:
21
21
  return {"message": "Hello, World!"}
22
22
 
23
23
 
24
+ @app.get("/healthz")
25
+ async def healthz() -> dict[str, str]:
26
+ """Return a health check for the kodit API."""
27
+ return {"status": "ok"}
28
+
29
+
24
30
  # Add mcp routes last, otherwise previous routes aren't added
25
31
  app.mount("", mcp_app)
26
32
 
kodit/cli.py CHANGED
@@ -81,6 +81,7 @@ async def index(
81
81
  )
82
82
 
83
83
  if not sources:
84
+ log_event("kodit.cli.index.list")
84
85
  # No source specified, list all indexes
85
86
  indexes = await service.list_indexes()
86
87
  headers: list[str | Cell] = [
@@ -108,7 +109,8 @@ async def index(
108
109
  msg = "File indexing is not implemented yet"
109
110
  raise click.UsageError(msg)
110
111
 
111
- # Index directory
112
+ # Index source
113
+ log_event("kodit.cli.index.create")
112
114
  s = await source_service.create(source)
113
115
  index = await service.create(s.id)
114
116
  await service.run(index.id)
@@ -134,6 +136,7 @@ async def code(
134
136
 
135
137
  This works best if your query is code.
136
138
  """
139
+ log_event("kodit.cli.search.code")
137
140
  source_repository = SourceRepository(session)
138
141
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
139
142
  repository = IndexRepository(session)
@@ -177,6 +180,7 @@ async def keyword(
177
180
  top_k: int,
178
181
  ) -> None:
179
182
  """Search for snippets using keyword search."""
183
+ log_event("kodit.cli.search.keyword")
180
184
  source_repository = SourceRepository(session)
181
185
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
182
186
  repository = IndexRepository(session)
@@ -223,6 +227,7 @@ async def text(
223
227
 
224
228
  This works best if your query is text.
225
229
  """
230
+ log_event("kodit.cli.search.text")
226
231
  source_repository = SourceRepository(session)
227
232
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
228
233
  repository = IndexRepository(session)
@@ -270,6 +275,7 @@ async def hybrid( # noqa: PLR0913
270
275
  text: str,
271
276
  ) -> None:
272
277
  """Search for snippets using hybrid search."""
278
+ log_event("kodit.cli.search.hybrid")
273
279
  source_repository = SourceRepository(session)
274
280
  source_service = SourceService(app_context.get_clone_dir(), source_repository)
275
281
  repository = IndexRepository(session)
@@ -321,7 +327,7 @@ def serve(
321
327
  """Start the kodit server, which hosts the MCP server and the kodit API."""
322
328
  log = structlog.get_logger(__name__)
323
329
  log.info("Starting kodit server", host=host, port=port)
324
- log_event("kodit_server_started")
330
+ log_event("kodit.cli.serve")
325
331
 
326
332
  # Configure uvicorn with graceful shutdown
327
333
  config = uvicorn.Config(
@@ -19,6 +19,7 @@ from kodit.embedding.vectorchord_vector_search_service import (
19
19
  TaskName,
20
20
  VectorChordVectorSearchService,
21
21
  )
22
+ from kodit.log import log_event
22
23
 
23
24
 
24
25
  def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
@@ -34,6 +35,7 @@ def embedding_factory(
34
35
  endpoint = _get_endpoint_configuration(app_context)
35
36
 
36
37
  if endpoint and endpoint.type == "openai":
38
+ log_event("kodit.embedding", {"provider": "openai"})
37
39
  from openai import AsyncOpenAI
38
40
 
39
41
  embedding_provider = OpenAIEmbeddingProvider(
@@ -44,11 +46,14 @@ def embedding_factory(
44
46
  model_name=endpoint.model or "text-embedding-3-small",
45
47
  )
46
48
  else:
49
+ log_event("kodit.embedding", {"provider": "local"})
47
50
  embedding_provider = LocalEmbeddingProvider(CODE)
48
51
 
49
52
  if app_context.default_search.provider == "vectorchord":
53
+ log_event("kodit.database", {"provider": "vectorchord"})
50
54
  return VectorChordVectorSearchService(task_name, session, embedding_provider)
51
55
  if app_context.default_search.provider == "sqlite":
56
+ log_event("kodit.database", {"provider": "sqlite"})
52
57
  return LocalVectorSearchService(
53
58
  embedding_repository=embedding_repository,
54
59
  embedding_provider=embedding_provider,
@@ -39,14 +39,14 @@ def split_sub_batches(
39
39
 
40
40
  while data_to_process:
41
41
  next_item = data_to_process[0]
42
- item_tokens = len(encoding.encode(next_item))
42
+ item_tokens = len(encoding.encode(next_item, disallowed_special=()))
43
43
 
44
44
  if item_tokens > max_context_window:
45
45
  # Loop around trying to truncate the snippet until it fits in the max
46
46
  # embedding size
47
47
  while item_tokens > max_context_window:
48
48
  next_item = next_item[:-1]
49
- item_tokens = len(encoding.encode(next_item))
49
+ item_tokens = len(encoding.encode(next_item, disallowed_special=()))
50
50
 
51
51
  data_to_process[0] = next_item
52
52
 
@@ -11,6 +11,7 @@ from kodit.enrichment.enrichment_service import (
11
11
  EnrichmentService,
12
12
  LLMEnrichmentService,
13
13
  )
14
+ from kodit.log import log_event
14
15
 
15
16
 
16
17
  def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
@@ -24,6 +25,7 @@ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
24
25
  endpoint = app_context.enrichment_endpoint or app_context.default_endpoint or None
25
26
 
26
27
  if endpoint and endpoint.type == "openai":
28
+ log_event("kodit.enrichment", {"provider": "openai"})
27
29
  from openai import AsyncOpenAI
28
30
 
29
31
  enrichment_provider = OpenAIEnrichmentProvider(
@@ -34,6 +36,7 @@ def enrichment_factory(app_context: AppContext) -> EnrichmentService:
34
36
  model_name=endpoint.model or "gpt-4o-mini",
35
37
  )
36
38
  else:
39
+ log_event("kodit.enrichment", {"provider": "local"})
37
40
  enrichment_provider = LocalEnrichmentProvider()
38
41
 
39
42
  return LLMEnrichmentService(enrichment_provider=enrichment_provider)
@@ -26,6 +26,7 @@ from kodit.enrichment.enrichment_service import EnrichmentService
26
26
  from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
27
27
  from kodit.indexing.indexing_models import Snippet
28
28
  from kodit.indexing.indexing_repository import IndexRepository
29
+ from kodit.log import log_event
29
30
  from kodit.snippets.snippets import SnippetService
30
31
  from kodit.source.source_service import SourceService
31
32
  from kodit.util.spinner import Spinner
@@ -45,7 +46,7 @@ class IndexView(pydantic.BaseModel):
45
46
  created_at: datetime
46
47
  updated_at: datetime | None = None
47
48
  source: str | None = None
48
- num_snippets: int | None = None
49
+ num_snippets: int
49
50
 
50
51
 
51
52
  class SearchRequest(pydantic.BaseModel):
@@ -119,6 +120,8 @@ class IndexService:
119
120
  ValueError: If the source doesn't exist or already has an index.
120
121
 
121
122
  """
123
+ log_event("kodit.index.create")
124
+
122
125
  # Check if the source exists
123
126
  source = await self.source_service.get(source_id)
124
127
 
@@ -129,6 +132,8 @@ class IndexService:
129
132
  return IndexView(
130
133
  id=index.id,
131
134
  created_at=index.created_at,
135
+ num_snippets=await self.repository.num_snippets_for_index(index.id),
136
+ source=source.uri,
132
137
  )
133
138
 
134
139
  async def list_indexes(self) -> list[IndexView]:
@@ -142,19 +147,33 @@ class IndexService:
142
147
  indexes = await self.repository.list_indexes()
143
148
 
144
149
  # Transform database results into DTOs
145
- return [
150
+ indexes = [
146
151
  IndexView(
147
152
  id=index.id,
148
153
  created_at=index.created_at,
149
154
  updated_at=index.updated_at,
150
- num_snippets=await self.repository.num_snippets_for_index(index.id),
155
+ num_snippets=await self.repository.num_snippets_for_index(index.id)
156
+ or 0,
151
157
  source=source.uri,
152
158
  )
153
159
  for index, source in indexes
154
160
  ]
155
161
 
162
+ # Help Kodit by measuring how much people are using indexes
163
+ log_event(
164
+ "kodit.index.list",
165
+ {
166
+ "num_indexes": len(indexes),
167
+ "num_snippets": sum([index.num_snippets for index in indexes]),
168
+ },
169
+ )
170
+
171
+ return indexes
172
+
156
173
  async def run(self, index_id: int) -> None:
157
174
  """Run the indexing process for a specific index."""
175
+ log_event("kodit.index.run")
176
+
158
177
  # Get and validate index
159
178
  index = await self.repository.get_by_id(index_id)
160
179
  if not index:
@@ -218,6 +237,8 @@ class IndexService:
218
237
 
219
238
  async def search(self, request: SearchRequest) -> list[SearchResult]:
220
239
  """Search for relevant data."""
240
+ log_event("kodit.index.search")
241
+
221
242
  fusion_list: list[list[FusionRequest]] = []
222
243
  if request.keywords:
223
244
  # Gather results for each keyword
kodit/log.py CHANGED
@@ -1,20 +1,33 @@
1
1
  """Logging configuration for kodit."""
2
2
 
3
3
  import logging
4
+ import platform
5
+ import re
6
+ import shutil
7
+ import subprocess
4
8
  import sys
5
9
  import uuid
6
10
  from enum import Enum
7
11
  from functools import lru_cache
12
+ from pathlib import Path
8
13
  from typing import Any
9
14
 
15
+ import rudderstack.analytics as rudder_analytics
10
16
  import structlog
11
- from posthog import Posthog
12
17
  from structlog.types import EventDict
13
18
 
19
+ from kodit import _version
14
20
  from kodit.config import AppContext
15
21
 
22
+ _MAC_RE = re.compile(r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}")
23
+
16
24
  log = structlog.get_logger(__name__)
17
25
 
26
+ rudder_analytics.write_key = "2wm1RmV2GnO92NGSs8yYtmSI0mi"
27
+ rudder_analytics.dataPlaneUrl = (
28
+ "https://danbmedefzavzlslreyxjgcjwlf.dataplane.rudderstack.com"
29
+ )
30
+
18
31
 
19
32
  def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict: # noqa: ANN001
20
33
  """Drop the `color_message` key from the event dict."""
@@ -131,35 +144,124 @@ def configure_logging(app_context: AppContext) -> None:
131
144
  sys.excepthook = handle_exception
132
145
 
133
146
 
134
- posthog = Posthog(
135
- project_api_key="phc_JsX0yx8NLPcIxamfp4Zc7xyFykXjwmekKUQz060cSt3",
136
- host="https://eu.i.posthog.com",
137
- )
147
+ def configure_telemetry(app_context: AppContext) -> None:
148
+ """Configure telemetry for the application."""
149
+ if app_context.disable_telemetry:
150
+ structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
151
+ rudder_analytics.send = False
138
152
 
153
+ rudder_analytics.identify(
154
+ anonymous_id=get_stable_mac_str(),
155
+ traits={},
156
+ )
139
157
 
140
- @lru_cache(maxsize=1)
141
- def get_mac_address() -> str:
142
- """Get the MAC address of the primary network interface.
143
158
 
144
- Returns:
145
- str: The MAC address or a fallback UUID if not available
159
+ def log_event(event: str, properties: dict[str, Any] | None = None) -> None:
160
+ """Log an event to Rudderstack."""
161
+ p = properties or {}
162
+ # Set default posthog properties
163
+ p["$app_name"] = "kodit"
164
+ p["$app_version"] = _version.version
165
+ p["$os"] = sys.platform
166
+ p["$os_version"] = sys.version
167
+ rudder_analytics.track(
168
+ anonymous_id=get_stable_mac_str(),
169
+ event=event,
170
+ properties=properties or {},
171
+ )
172
+
146
173
 
174
+ # ----------------------------------------------------------------------
175
+ # Helper functions
176
+ # ----------------------------------------------------------------------
177
+ def _mac_int(mac: str) -> int:
178
+ return int(mac.replace(":", "").replace("-", ""), 16)
179
+
180
+
181
+ def _is_globally_administered(mac_int: int) -> bool:
182
+ first_octet = (mac_int >> 40) & 0xFF
183
+ return not (first_octet & 0b11) # both bits must be 0
184
+
185
+
186
+ def _from_sysfs() -> list[int]:
187
+ base = Path("/sys/class/net")
188
+ if not base.is_dir():
189
+ return []
190
+ macs: list[int] = []
191
+ for iface in base.iterdir():
192
+ try:
193
+ with (base / iface / "address").open() as f:
194
+ content = f.read().strip()
195
+ if _MAC_RE.fullmatch(content):
196
+ macs.append(_mac_int(content))
197
+ except (FileNotFoundError, PermissionError):
198
+ pass
199
+ return macs
200
+
201
+
202
+ def _from_command(cmd: str) -> list[int]:
203
+ try:
204
+ out = subprocess.check_output( # noqa: S602
205
+ cmd,
206
+ shell=True,
207
+ text=True,
208
+ stderr=subprocess.DEVNULL,
209
+ encoding="utf-8",
210
+ )
211
+ except Exception: # noqa: BLE001
212
+ return []
213
+ return [_mac_int(m.group()) for m in _MAC_RE.finditer(out)]
214
+
215
+
216
+ @lru_cache(maxsize=1)
217
+ def get_stable_mac_int() -> int | None:
218
+ """Return a *hardware* MAC as an int, or None if none can be found.
219
+
220
+ Search order:
221
+ 1. /sys/class/net (Linux)
222
+ 2. `ip link show` (Linux), `ifconfig -a` (Linux+macOS)
223
+ 3. `getmac` and `wmic nic` (Windows)
224
+ The first globally-administered, non-multicast address wins.
147
225
  """
148
- # Get the MAC address of the primary network interface
149
- mac = uuid.getnode()
150
- return f"{mac:012x}" if mac != uuid.getnode() else str(uuid.uuid4())
226
+ system = platform.system()
227
+ candidates: list[int] = []
228
+
229
+ if system == "Linux":
230
+ candidates += _from_sysfs()
231
+ if not candidates and shutil.which("ip"):
232
+ candidates += _from_command("ip link show")
233
+ if not candidates: # fall back to ifconfig
234
+ candidates += _from_command("ifconfig -a")
235
+
236
+ elif system == "Darwin": # macOS
237
+ candidates += _from_command("ifconfig -a")
238
+
239
+ elif system == "Windows":
240
+ # getmac is present on every supported Windows version
241
+ candidates += _from_command("getmac /v /fo list")
242
+ # wmic still exists through at least Win 11
243
+ candidates += _from_command(
244
+ 'wmic nic where "MACAddress is not null" get MACAddress /format:list'
245
+ )
151
246
 
247
+ # Prefer globally administered, non-multicast addresses
248
+ for mac in candidates:
249
+ if _is_globally_administered(mac):
250
+ return mac
152
251
 
153
- def configure_telemetry(app_context: AppContext) -> None:
154
- """Configure telemetry for the application."""
155
- if app_context.disable_telemetry:
156
- structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
157
- posthog.disabled = True
252
+ # If all we saw were locally-administered MACs, just return the first one
253
+ if candidates:
254
+ return candidates[0]
158
255
 
256
+ return None
159
257
 
160
- def log_event(event: str, properties: dict[str, Any] | None = None) -> None:
161
- """Log an event to PostHog."""
162
- log.debug(
163
- "Logging event", id=get_mac_address(), ph_event=event, ph_properties=properties
164
- )
165
- posthog.capture(get_mac_address(), event, properties or {})
258
+
259
+ def get_stable_mac_str() -> str:
260
+ """Return a *stable* 12-digit hex string (lower-case, no separators).
261
+
262
+ Falls back to uuid.getnode() if necessary, so it never raises.
263
+ """
264
+ mac_int = get_stable_mac_int()
265
+ if mac_int is None:
266
+ mac_int = uuid.getnode() # may still be random in VMs
267
+ return f"{mac_int:012x}"
@@ -0,0 +1,103 @@
1
+ # ruff: noqa
2
+ """add authors
3
+
4
+ Revision ID: 9e53ea8bb3b0
5
+ Revises: c3f5137d30f5
6
+ Create Date: 2025-06-14 10:50:36.058114
7
+
8
+ """
9
+
10
+ from typing import Sequence, Union
11
+
12
+ from alembic import op
13
+ import sqlalchemy as sa
14
+
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = "9e53ea8bb3b0"
18
+ down_revision: Union[str, None] = "c3f5137d30f5"
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade() -> None:
24
+ """Upgrade schema."""
25
+ # Define the enum type separately so we can explicitly create it when needed
26
+ source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
27
+
28
+ # Explicitly create the enum type for PostgreSQL (no-op on SQLite)
29
+ source_type.create(op.get_bind(), checkfirst=True)
30
+ # ### commands auto generated by Alembic - please adjust! ###
31
+ op.create_table(
32
+ "authors",
33
+ sa.Column("name", sa.String(length=255), nullable=False),
34
+ sa.Column("email", sa.String(length=255), nullable=False),
35
+ sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
36
+ sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
37
+ sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
38
+ sa.PrimaryKeyConstraint("id"),
39
+ sa.UniqueConstraint("name", "email", name="uix_author"),
40
+ )
41
+ op.create_index(op.f("ix_authors_email"), "authors", ["email"], unique=False)
42
+ op.create_index(op.f("ix_authors_name"), "authors", ["name"], unique=False)
43
+ op.create_table(
44
+ "author_file_mappings",
45
+ sa.Column("author_id", sa.Integer(), nullable=False),
46
+ sa.Column("file_id", sa.Integer(), nullable=False),
47
+ sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
48
+ sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
49
+ sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
50
+ sa.ForeignKeyConstraint(
51
+ ["author_id"],
52
+ ["authors.id"],
53
+ ),
54
+ sa.ForeignKeyConstraint(
55
+ ["file_id"],
56
+ ["files.id"],
57
+ ),
58
+ sa.PrimaryKeyConstraint("id"),
59
+ sa.UniqueConstraint("author_id", "file_id", name="uix_author_file_mapping"),
60
+ )
61
+ op.create_index(
62
+ op.f("ix_author_file_mappings_author_id"),
63
+ "author_file_mappings",
64
+ ["author_id"],
65
+ unique=False,
66
+ )
67
+ op.create_index(
68
+ op.f("ix_author_file_mappings_file_id"),
69
+ "author_file_mappings",
70
+ ["file_id"],
71
+ unique=False,
72
+ )
73
+ op.add_column(
74
+ "files", sa.Column("extension", sa.String(length=255), nullable=False)
75
+ )
76
+ op.create_index(op.f("ix_files_extension"), "files", ["extension"], unique=False)
77
+ op.add_column("sources", sa.Column("type", source_type, nullable=False))
78
+ op.create_index(op.f("ix_sources_type"), "sources", ["type"], unique=False)
79
+ # ### end Alembic commands ###
80
+
81
+
82
+ def downgrade() -> None:
83
+ """Downgrade schema."""
84
+ # ### commands auto generated by Alembic - please adjust! ###
85
+ op.drop_index(op.f("ix_sources_type"), table_name="sources")
86
+ op.drop_column("sources", "type")
87
+ op.drop_index(op.f("ix_files_extension"), table_name="files")
88
+ op.drop_column("files", "extension")
89
+ op.drop_index(
90
+ op.f("ix_author_file_mappings_file_id"), table_name="author_file_mappings"
91
+ )
92
+ op.drop_index(
93
+ op.f("ix_author_file_mappings_author_id"), table_name="author_file_mappings"
94
+ )
95
+ op.drop_table("author_file_mappings")
96
+ op.drop_index(op.f("ix_authors_name"), table_name="authors")
97
+ op.drop_index(op.f("ix_authors_email"), table_name="authors")
98
+ op.drop_table("authors")
99
+
100
+ # Explicitly drop the enum type (PostgreSQL)
101
+ source_type = sa.Enum("UNKNOWN", "FOLDER", "GIT", name="sourcetype")
102
+ source_type.drop(op.get_bind(), checkfirst=True)
103
+ # ### end Alembic commands ###