PyPI - kodit - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

kodit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of kodit might be problematic. Click here for more details.

Files changed (29)

kodit/_version.py +2 -2
kodit/alembic/env.py +5 -4
kodit/app.py +13 -9
kodit/bm25/__init__.py +1 -0
kodit/bm25/bm25.py +71 -0
kodit/cli.py +124 -38
kodit/config.py +94 -2
kodit/database.py +41 -57
kodit/indexing/repository.py +11 -0
kodit/indexing/service.py +28 -16
kodit/logging.py +20 -18
kodit/mcp.py +84 -34
kodit/middleware.py +16 -0
kodit/retreival/repository.py +32 -0
kodit/retreival/service.py +42 -3
kodit/snippets/__init__.py +1 -0
kodit/snippets/languages/__init__.py +53 -0
kodit/snippets/languages/csharp.scm +12 -0
kodit/snippets/languages/python.scm +22 -0
kodit/snippets/method_snippets.py +120 -0
kodit/snippets/snippets.py +48 -0
kodit/sources/service.py +3 -5
{kodit-0.1.4.dist-info → kodit-0.1.6.dist-info}/METADATA +6 -2
kodit-0.1.6.dist-info/RECORD +40 -0
kodit/sse.py +0 -61
kodit-0.1.4.dist-info/RECORD +0 -33
{kodit-0.1.4.dist-info → kodit-0.1.6.dist-info}/WHEEL +0 -0
{kodit-0.1.4.dist-info → kodit-0.1.6.dist-info}/entry_points.txt +0 -0
{kodit-0.1.4.dist-info → kodit-0.1.6.dist-info}/licenses/LICENSE +0 -0

kodit/indexing/service.py CHANGED Viewed

@@ -7,24 +7,20 @@ index management.
 """
 from datetime import datetime
+from pathlib import Path
-import aiofiles
 import pydantic
 import structlog
 from tqdm.asyncio import tqdm
+from kodit.bm25.bm25 import BM25Service
 from kodit.indexing.models import Snippet
 from kodit.indexing.repository import IndexRepository
+from kodit.snippets.snippets import SnippetService
 from kodit.sources.service import SourceService
-# List of MIME types that are supported for indexing and snippet creation
-MIME_WHITELIST = [
-    "text/plain",
-    "text/markdown",
-    "text/x-python",
-    "text/x-shellscript",
-    "text/x-sql",
-]
+# List of MIME types that are blacklisted from being indexed
+MIME_BLACKLIST = ["unknown/unknown"]
 class IndexView(pydantic.BaseModel):
@@ -49,7 +45,10 @@ class IndexService:
     """
     def __init__(
-        self, repository: IndexRepository, source_service: SourceService
+        self,
+        repository: IndexRepository,
+        source_service: SourceService,
+        data_dir: Path,
     ) -> None:
         """Initialize the index service.
@@ -60,7 +59,9 @@ class IndexService:
         """
         self.repository = repository
         self.source_service = source_service
+        self.snippet_service = SnippetService()
         self.log = structlog.get_logger(__name__)
+        self.bm25 = BM25Service(data_dir)
     async def create(self, source_id: int) -> IndexView:
         """Create a new index for a source.
@@ -119,6 +120,10 @@ class IndexService:
         # Create snippets for supported file types
         await self._create_snippets(index_id)
+        # Update BM25 index
+        snippets = await self.repository.get_all_snippets()
+        self.bm25.index([snippet.content for snippet in snippets])
         # Update index timestamp
         await self.repository.update_index_timestamp(index)
@@ -137,16 +142,23 @@ class IndexService:
         files = await self.repository.files_for_index(index_id)
         for file in tqdm(files, total=len(files)):
             # Skip unsupported file types
-            if file.mime_type not in MIME_WHITELIST:
+            if file.mime_type in MIME_BLACKLIST:
                 self.log.debug("Skipping mime type", mime_type=file.mime_type)
                 continue
             # Create snippet from file content
-            async with aiofiles.open(file.cloned_path, "rb") as f:
-                content = await f.read()
-                snippet = Snippet(
+            try:
+                snippets = self.snippet_service.snippets_for_file(
+                    Path(file.cloned_path)
+                )
+            except ValueError as e:
+                self.log.debug("Skipping file", file=file.cloned_path, error=e)
+                continue
+            for snippet in snippets:
+                s = Snippet(
                     index_id=index_id,
                     file_id=file.id,
-                    content=content.decode("utf-8"),
+                    content=snippet.text,
                 )
-                await self.repository.add_snippet(snippet)
+                await self.repository.add_snippet(s)

kodit/logging.py CHANGED Viewed

@@ -11,6 +11,8 @@ import structlog
 from posthog import Posthog
 from structlog.types import EventDict
+from kodit.config import AppContext
 log = structlog.get_logger(__name__)
@@ -27,14 +29,8 @@ class LogFormat(Enum):
     JSON = "json"
-def configure_logging(log_level: str, log_format: LogFormat) -> None:
-    """Configure logging for the application.
-    Args:
-        json_logs: Whether to use JSON format for logs
-        log_level: The minimum log level to display
-    """
+def configure_logging(app_context: AppContext) -> None:
+    """Configure logging for the application."""
     timestamper = structlog.processors.TimeStamper(fmt="iso")
     shared_processors: list[structlog.types.Processor] = [
@@ -48,7 +44,7 @@ def configure_logging(log_level: str, log_format: LogFormat) -> None:
         structlog.processors.StackInfoRenderer(),
     ]
-    if log_format == LogFormat.JSON:
+    if app_context.log_format == LogFormat.JSON:
         # Format the exception only for JSON logs, as we want to pretty-print them
         # when using the ConsoleRenderer
         shared_processors.append(structlog.processors.format_exc_info)
@@ -64,7 +60,7 @@ def configure_logging(log_level: str, log_format: LogFormat) -> None:
     )
     log_renderer: structlog.types.Processor
-    if log_format == LogFormat.JSON:
+    if app_context.log_format == LogFormat.JSON:
         log_renderer = structlog.processors.JSONRenderer()
     else:
         log_renderer = structlog.dev.ConsoleRenderer()
@@ -86,18 +82,23 @@ def configure_logging(log_level: str, log_format: LogFormat) -> None:
     handler.setFormatter(formatter)
     root_logger = logging.getLogger()
     root_logger.addHandler(handler)
-    root_logger.setLevel(log_level.upper())
+    root_logger.setLevel(app_context.log_level.upper())
     # Configure uvicorn loggers to use our structlog setup
+    # Uvicorn spits out loads of exception logs when sse server doesn't shut down
+    # gracefully, so we hide them unless in DEBUG mode
     for _log in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
-        logging.getLogger(_log).handlers.clear()
-        logging.getLogger(_log).propagate = True
+        if root_logger.getEffectiveLevel() == logging.DEBUG:
+            logging.getLogger(_log).handlers.clear()
+            logging.getLogger(_log).propagate = True
+        else:
+            logging.getLogger(_log).disabled = True
     # Configure SQLAlchemy loggers to use our structlog setup
     for _log in ["sqlalchemy.engine", "alembic"]:
         engine_logger = logging.getLogger(_log)
         engine_logger.setLevel(logging.WARNING)  # Hide INFO logs by default
-        if log_level.upper() == "DEBUG":
+        if app_context.log_level.upper() == "DEBUG":
             engine_logger.setLevel(
                 logging.DEBUG
             )  # Only show all logs when in DEBUG mode
@@ -142,10 +143,11 @@ def get_mac_address() -> str:
     return f"{mac:012x}" if mac != uuid.getnode() else str(uuid.uuid4())
-def disable_posthog() -> None:
-    """Disable telemetry for the application."""
-    structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
-    posthog.disabled = True
+def configure_telemetry(app_context: AppContext) -> None:
+    """Configure telemetry for the application."""
+    if app_context.disable_telemetry:
+        structlog.stdlib.get_logger(__name__).info("Telemetry has been disabled")
+        posthog.disabled = True
 def log_event(event: str, properties: dict[str, Any] | None = None) -> None:

kodit/mcp.py CHANGED Viewed

@@ -1,21 +1,63 @@
 """MCP server implementation for kodit."""
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Annotated
 import structlog
-from mcp.server.fastmcp import FastMCP
+from fastmcp import Context, FastMCP
 from pydantic import Field
+from sqlalchemy.ext.asyncio import AsyncSession
-from kodit.database import get_session
+from kodit._version import version
+from kodit.config import AppContext
+from kodit.database import Database
 from kodit.retreival.repository import RetrievalRepository, RetrievalResult
 from kodit.retreival.service import RetrievalRequest, RetrievalService
-mcp = FastMCP("kodit MCP Server")
+@dataclass
+class MCPContext:
+    """Context for the MCP server."""
+    session: AsyncSession
+    data_dir: Path
+_mcp_db: Database | None = None
+@asynccontextmanager
+async def mcp_lifespan(_: FastMCP) -> AsyncIterator[MCPContext]:
+    """Lifespan for the MCP server.
+    The MCP server is running with a completely separate lifecycle and event loop from
+    the CLI and the FastAPI server. Therefore, we must carefully reconstruct the
+    application context. uvicorn does not pass through CLI args, so we must rely on
+    parsing env vars set in the CLI.
+    This lifespan is recreated for each request. See:
+    https://github.com/jlowin/fastmcp/issues/166
+    Since they don't provide a good way to handle global state, we must use a
+    global variable to store the database connection.
+    """
+    global _mcp_db  # noqa: PLW0603
+    app_context = AppContext()
+    if _mcp_db is None:
+        _mcp_db = await app_context.get_db()
+    async with _mcp_db.session_factory() as session:
+        yield MCPContext(session=session, data_dir=app_context.get_data_dir())
+mcp = FastMCP("kodit MCP Server", lifespan=mcp_lifespan)
 @mcp.tool()
 async def retrieve_relevant_snippets(
+    ctx: Context,
     user_intent: Annotated[
         str,
         Field(
@@ -51,8 +93,8 @@ async def retrieve_relevant_snippets(
     the quality of your generated code. You must call this tool when you need to
     write code.
     """
-    # Log the search query and related files for debugging
     log = structlog.get_logger(__name__)
     log.debug(
         "Retrieving relevant snippets",
         user_intent=user_intent,
@@ -62,36 +104,38 @@ async def retrieve_relevant_snippets(
         file_contents=related_file_contents,
     )
-    async with get_session() as session:
-        log.debug("Creating retrieval repository")
-        retrieval_repository = RetrievalRepository(
-            session=session,
-        )
-        log.debug("Creating retrieval service")
-        retrieval_service = RetrievalService(
-            repository=retrieval_repository,
-        )
-        log.debug("Fusing input")
-        input_query = input_fusion(
-            user_intent=user_intent,
-            related_file_paths=related_file_paths,
-            related_file_contents=related_file_contents,
-            keywords=keywords,
-        )
-        log.debug("Input", input_query=input_query)
-        retrieval_request = RetrievalRequest(
-            query=input_query,
-        )
-        log.debug("Retrieving snippets")
-        snippets = await retrieval_service.retrieve(request=retrieval_request)
-        log.debug("Fusing output")
-        output = output_fusion(snippets=snippets)
-        log.debug("Output", output=output)
-        return output
+    mcp_context: MCPContext = ctx.request_context.lifespan_context
+    log.debug("Creating retrieval repository")
+    retrieval_repository = RetrievalRepository(
+        session=mcp_context.session,
+    )
+    log.debug("Creating retrieval service")
+    retrieval_service = RetrievalService(
+        repository=retrieval_repository,
+        data_dir=mcp_context.data_dir,
+    )
+    log.debug("Fusing input")
+    input_query = input_fusion(
+        user_intent=user_intent,
+        related_file_paths=related_file_paths,
+        related_file_contents=related_file_contents,
+        keywords=keywords,
+    )
+    log.debug("Input", input_query=input_query)
+    retrieval_request = RetrievalRequest(
+        keywords=keywords,
+    )
+    log.debug("Retrieving snippets")
+    snippets = await retrieval_service.retrieve(request=retrieval_request)
+    log.debug("Fusing output")
+    output = output_fusion(snippets=snippets)
+    log.debug("Output", output=output)
+    return output
 def input_fusion(
@@ -108,3 +152,9 @@ def input_fusion(
 def output_fusion(snippets: list[RetrievalResult]) -> str:
     """Fuse the snippets into a single output."""
     return "\n\n".join(f"{snippet.uri}\n{snippet.content}" for snippet in snippets)
+@mcp.tool()
+async def get_version() -> str:
+    """Get the version of the kodit project."""
+    return version

kodit/middleware.py CHANGED Viewed

@@ -1,11 +1,14 @@
 """Middleware for the FastAPI application."""
+import contextlib
 import time
+from asyncio import CancelledError
 from collections.abc import Callable
 import structlog
 from asgi_correlation_id.context import correlation_id
 from fastapi import Request, Response
+from starlette.types import ASGIApp, Receive, Scope, Send
 access_logger = structlog.stdlib.get_logger("api.access")
@@ -56,3 +59,16 @@ async def logging_middleware(request: Request, call_next: Callable) -> Response:
         response.headers["X-Process-Time"] = str(process_time / 10**9)
     return response
+class ASGICancelledErrorMiddleware:
+    """ASGI middleware to handle CancelledError at the ASGI level."""
+    def __init__(self, app: ASGIApp) -> None:
+        """Initialize the middleware."""
+        self.app = app
+    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
+        """Handle the ASGI request and catch CancelledError."""
+        with contextlib.suppress(CancelledError):
+            await self.app(scope, receive, send)

kodit/retreival/repository.py CHANGED Viewed

@@ -74,3 +74,35 @@ class RetrievalRepository:
             )
             for snippet, file in results
         ]
+    async def list_snippet_ids(self) -> list[int]:
+        """List all snippet IDs.
+        Returns:
+            A list of all snippets.
+        """
+        query = select(Snippet.id)
+        rows = await self.session.execute(query)
+        return list(rows.scalars().all())
+    async def list_snippets_by_ids(self, ids: list[int]) -> list[RetrievalResult]:
+        """List snippets by IDs.
+        Returns:
+            A list of snippets.
+        """
+        query = (
+            select(Snippet, File)
+            .where(Snippet.id.in_(ids))
+            .join(File, Snippet.file_id == File.id)
+        )
+        rows = await self.session.execute(query)
+        return [
+            RetrievalResult(
+                uri=file.uri,
+                content=snippet.content,
+            )
+            for snippet, file in rows.all()
+        ]

kodit/retreival/service.py CHANGED Viewed

@@ -1,14 +1,19 @@
 """Retrieval service."""
+from pathlib import Path
 import pydantic
+import structlog
+from kodit.bm25.bm25 import BM25Service
 from kodit.retreival.repository import RetrievalRepository, RetrievalResult
 class RetrievalRequest(pydantic.BaseModel):
     """Request for a retrieval."""
-    query: str
+    keywords: list[str]
+    top_k: int = 10
 class Snippet(pydantic.BaseModel):
@@ -21,10 +26,44 @@ class Snippet(pydantic.BaseModel):
 class RetrievalService:
     """Service for retrieving relevant data."""
-    def __init__(self, repository: RetrievalRepository) -> None:
+    def __init__(self, repository: RetrievalRepository, data_dir: Path) -> None:
         """Initialize the retrieval service."""
         self.repository = repository
+        self.log = structlog.get_logger(__name__)
+        self.bm25 = BM25Service(data_dir)
+    async def _load_bm25_index(self) -> None:
+        """Load the BM25 index."""
     async def retrieve(self, request: RetrievalRequest) -> list[RetrievalResult]:
         """Retrieve relevant data."""
-        return await self.repository.string_search(request.query)
+        snippet_ids = await self.repository.list_snippet_ids()
+        # Gather results for each keyword
+        result_ids: list[tuple[int, float]] = []
+        for keyword in request.keywords:
+            results = self.bm25.retrieve(snippet_ids, keyword, request.top_k)
+            result_ids.extend(results)
+        if len(result_ids) == 0:
+            return []
+        # Sort results by score
+        result_ids.sort(key=lambda x: x[1], reverse=True)
+        self.log.debug(
+            "Retrieval results",
+            total_results=len(result_ids),
+            max_score=result_ids[0][1],
+            min_score=result_ids[-1][1],
+            median_score=result_ids[len(result_ids) // 2][1],
+        )
+        # Don't return zero score results
+        result_ids = [x for x in result_ids if x[1] > 0]
+        # Build final list of doc ids up to top_k
+        final_doc_ids = [x[0] for x in result_ids[: request.top_k]]
+        # Get snippets from database
+        return await self.repository.list_snippets_by_ids(final_doc_ids)

kodit/snippets/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Extract method snippets from source code."""

kodit/snippets/languages/__init__.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Detect the language of a file."""
+from pathlib import Path
+from typing import cast
+from tree_sitter_language_pack import SupportedLanguage
+# Mapping of file extensions to programming languages
+LANGUAGE_MAP: dict[str, str] = {
+    # JavaScript/TypeScript
+    "js": "javascript",
+    "jsx": "javascript",
+    "ts": "typescript",
+    "tsx": "typescript",
+    # Python
+    "py": "python",
+    # Rust
+    "rs": "rust",
+    # Go
+    "go": "go",
+    # C/C++
+    "cpp": "cpp",
+    "hpp": "cpp",
+    "c": "c",
+    "h": "c",
+    # C#
+    "cs": "csharp",
+    # Ruby
+    "rb": "ruby",
+    # Java
+    "java": "java",
+    # PHP
+    "php": "php",
+    # Swift
+    "swift": "swift",
+    # Kotlin
+    "kt": "kotlin",
+}
+def detect_language(file_path: Path) -> SupportedLanguage:
+    """Detect the language of a file."""
+    suffix = file_path.suffix.removeprefix(".").lower()
+    msg = f"Unsupported language for file suffix: {suffix}"
+    lang = LANGUAGE_MAP.get(suffix)
+    if lang is None:
+        raise ValueError(msg)
+    # Try to cast the language to a SupportedLanguage
+    try:
+        return cast("SupportedLanguage", lang)
+    except Exception as e:
+        raise ValueError(msg) from e

kodit/snippets/languages/csharp.scm ADDED Viewed

@@ -0,0 +1,12 @@
+(method_declaration
+  name: (identifier) @function.name
+  body: (block) @function.body
+) @function.def
+(class_declaration
+  name: (identifier) @class.name
+) @class.def
+(using_directive) @import.name
+(identifier) @ident

kodit/snippets/languages/python.scm ADDED Viewed

@@ -0,0 +1,22 @@
+(function_definition
+  name: (identifier) @function.name
+  body: (block) @function.body
+) @function.def
+(class_definition
+  name: (identifier) @class.name
+) @class.def
+(import_statement
+  name: (dotted_name (identifier) @import.name))
+(import_from_statement
+  module_name: (dotted_name (identifier) @import.from))
+(identifier) @ident
+(assignment
+  left: (identifier) @assignment.lhs)
+(parameters
+  (identifier) @param.name)

kodit/snippets/method_snippets.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Extract method snippets from source code."""
+from tree_sitter import Node, Query
+from tree_sitter_language_pack import SupportedLanguage, get_language, get_parser
+class MethodSnippets:
+    """Extract method snippets from source code."""
+    def __init__(self, language: SupportedLanguage, query: str) -> None:
+        """Initialize the MethodSnippets class."""
+        self.language = get_language(language)
+        self.parser = get_parser(language)
+        self.query = Query(self.language, query)
+    def _get_leaf_functions(
+        self, captures_by_name: dict[str, list[Node]]
+    ) -> list[Node]:
+        """Return all leaf functions in the AST."""
+        return [
+            node
+            for node in captures_by_name.get("function.body", [])
+            if self._is_leaf_function(captures_by_name, node)
+        ]
+    def _is_leaf_function(
+        self, captures_by_name: dict[str, list[Node]], node: Node
+    ) -> bool:
+        """Return True if the node is a leaf function."""
+        for other in captures_by_name.get("function.body", []):
+            if other == node:  # Skip self
+                continue
+            # if other is inside node, it's not a leaf function
+            if other.start_byte >= node.start_byte and other.end_byte <= node.end_byte:
+                return False
+        return True
+    def _get_imports(self, captures_by_name: dict[str, list[Node]]) -> list[Node]:
+        """Return all imports in the AST."""
+        return captures_by_name.get("import.name", []) + captures_by_name.get(
+            "import.from", []
+        )
+    def _classes_and_functions(
+        self, captures_by_name: dict[str, list[Node]]
+    ) -> list[int]:
+        """Return all classes and functions in the AST."""
+        return [
+            node.id
+            for node in {
+                *captures_by_name.get("function.def", []),
+                *captures_by_name.get("class.def", []),
+            }
+        ]
+    def _get_ancestors(
+        self, captures_by_name: dict[str, list[Node]], node: Node
+    ) -> list[Node]:
+        """Return all ancestors of the node."""
+        valid_ancestors = self._classes_and_functions(captures_by_name)
+        ancestors = []
+        parent = node.parent
+        while parent:
+            if parent.id in valid_ancestors:
+                ancestors.append(parent)
+            parent = parent.parent
+        return ancestors
+    def extract(self, source_code: bytes) -> list[str]:
+        """Extract method snippets from source code."""
+        tree = self.parser.parse(source_code)
+        captures_by_name = self.query.captures(tree.root_node)
+        lines = source_code.decode().splitlines()
+        # Find all leaf functions
+        leaf_functions = self._get_leaf_functions(captures_by_name)
+        # Find all imports
+        imports = self._get_imports(captures_by_name)
+        results = []
+        # For each leaf function, find all lines this function is dependent on
+        for func_node in leaf_functions:
+            all_lines_to_keep = set()
+            ancestors = self._get_ancestors(captures_by_name, func_node)
+            # Add self to keep
+            all_lines_to_keep.update(
+                range(func_node.start_point[0], func_node.end_point[0] + 1)
+            )
+            # Add imports to keep
+            for import_node in imports:
+                all_lines_to_keep.update(
+                    range(import_node.start_point[0], import_node.end_point[0] + 1)
+                )
+            # Add ancestors to keep
+            for node in ancestors:
+                # Get the first line of the node for now
+                start = node.start_point[0]
+                end = node.start_point[0]
+                all_lines_to_keep.update(range(start, end + 1))
+            pseudo_code = []
+            for i, line in enumerate(lines):
+                if i in all_lines_to_keep:
+                    pseudo_code.append(line)
+            results.append("\n".join(pseudo_code))
+        # If there are no results, then return the entire file
+        if not results:
+            return [source_code.decode()]
+        return results

kodit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

Potentially problematic release.

kodit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl