PyPI - kodit - Versions diffs - 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

kodit 0.1.9-py3-none-any.whl → 0.1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of kodit might be problematic. Click here for more details.

Files changed (28)

kodit/_version.py +2 -2
kodit/bm25/bm25.py +1 -1
kodit/cli.py +101 -9
kodit/config.py +2 -0
kodit/database.py +2 -2
kodit/embedding/__init__.py +1 -0
kodit/embedding/embedding.py +52 -0
kodit/embedding/models.py +28 -0
kodit/indexing/repository.py +11 -0
kodit/indexing/service.py +24 -3
kodit/{logging.py → log.py} +7 -1
kodit/mcp.py +3 -9
kodit/{alembic → migrations}/env.py +1 -0
kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +47 -0
kodit/retreival/repository.py +128 -5
kodit/retreival/service.py +82 -27
kodit/sources/service.py +2 -2
{kodit-0.1.9.dist-info → kodit-0.1.11.dist-info}/METADATA +3 -1
kodit-0.1.11.dist-info/RECORD +44 -0
kodit-0.1.9.dist-info/RECORD +0 -40
/kodit/{alembic → migrations}/README +0 -0
/kodit/{alembic → migrations}/__init__.py +0 -0
/kodit/{alembic → migrations}/script.py.mako +0 -0
/kodit/{alembic → migrations}/versions/85155663351e_initial.py +0 -0
/kodit/{alembic → migrations}/versions/__init__.py +0 -0
{kodit-0.1.9.dist-info → kodit-0.1.11.dist-info}/WHEEL +0 -0
{kodit-0.1.9.dist-info → kodit-0.1.11.dist-info}/entry_points.txt +0 -0
{kodit-0.1.9.dist-info → kodit-0.1.11.dist-info}/licenses/LICENSE +0 -0

kodit/_version.py CHANGED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.1.9'
-__version_tuple__ = version_tuple = (0, 1, 9)
+__version__ = version = '0.1.11'
+__version_tuple__ = version_tuple = (0, 1, 11)

kodit/bm25/bm25.py CHANGED Viewed

@@ -38,7 +38,7 @@ class BM25Service:
         self.log.debug("Indexing corpus")
         vocab = self._tokenize(corpus)
         self.retriever = bm25s.BM25()
-        self.retriever.index(vocab)
+        self.retriever.index(vocab, show_progress=False)
         self.retriever.save(self.index_path)
     def retrieve(

kodit/cli.py CHANGED Viewed

@@ -15,6 +15,7 @@ from kodit.config import (
     DEFAULT_BASE_DIR,
     DEFAULT_DB_URL,
     DEFAULT_DISABLE_TELEMETRY,
+    DEFAULT_EMBEDDING_MODEL_NAME,
     DEFAULT_LOG_FORMAT,
     DEFAULT_LOG_LEVEL,
     AppContext,
@@ -23,7 +24,7 @@ from kodit.config import (
 )
 from kodit.indexing.repository import IndexRepository
 from kodit.indexing.service import IndexService
-from kodit.logging import configure_logging, configure_telemetry, log_event
+from kodit.log import configure_logging, configure_telemetry, log_event
 from kodit.retreival.repository import RetrievalRepository
 from kodit.retreival.service import RetrievalRequest, RetrievalService
 from kodit.sources.repository import SourceRepository
@@ -97,7 +98,12 @@ async def index(
     source_repository = SourceRepository(session)
     source_service = SourceService(app_context.get_clone_dir(), source_repository)
     repository = IndexRepository(session)
-    service = IndexService(repository, source_service, app_context.get_data_dir())
+    service = IndexService(
+        repository,
+        source_service,
+        app_context.get_data_dir(),
+        embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
+    )
     if not sources:
         # No source specified, list all indexes
@@ -133,20 +139,106 @@ async def index(
         await service.run(index.id)
-@cli.command()
+@cli.group()
+def search() -> None:
+    """Search for snippets in the database."""
+@search.command()
 @click.argument("query")
 @click.option("--top-k", default=10, help="Number of snippets to retrieve")
 @with_app_context
 @with_session
-async def retrieve(
-    session: AsyncSession, app_context: AppContext, query: str, top_k: int
+async def code(
+    session: AsyncSession,
+    app_context: AppContext,
+    query: str,
+    top_k: int,
+) -> None:
+    """Search for snippets using semantic code search.
+    This works best if your query is code.
+    """
+    repository = RetrievalRepository(session)
+    service = RetrievalService(
+        repository,
+        app_context.get_data_dir(),
+        embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
+    )
+    snippets = await service.retrieve(RetrievalRequest(code_query=query, top_k=top_k))
+    if len(snippets) == 0:
+        click.echo("No snippets found")
+        return
+    for snippet in snippets:
+        click.echo("-" * 80)
+        click.echo(f"{snippet.uri}")
+        click.echo(snippet.content)
+        click.echo("-" * 80)
+        click.echo()
+@search.command()
+@click.argument("keywords", nargs=-1)
+@click.option("--top-k", default=10, help="Number of snippets to retrieve")
+@with_app_context
+@with_session
+async def keyword(
+    session: AsyncSession,
+    app_context: AppContext,
+    keywords: list[str],
+    top_k: int,
 ) -> None:
-    """Retrieve snippets from the database."""
+    """Search for snippets using keyword search."""
     repository = RetrievalRepository(session)
-    service = RetrievalService(repository, app_context.get_data_dir())
-    # Temporary request while we don't have all search capabilities
+    service = RetrievalService(
+        repository,
+        app_context.get_data_dir(),
+        embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
+    )
+    snippets = await service.retrieve(RetrievalRequest(keywords=keywords, top_k=top_k))
+    if len(snippets) == 0:
+        click.echo("No snippets found")
+        return
+    for snippet in snippets:
+        click.echo("-" * 80)
+        click.echo(f"{snippet.uri}")
+        click.echo(snippet.content)
+        click.echo("-" * 80)
+        click.echo()
+@search.command()
+@click.option("--top-k", default=10, help="Number of snippets to retrieve")
+@click.option("--keywords", required=True, help="Comma separated list of keywords")
+@click.option("--code", required=True, help="Semantic code search query")
+@with_app_context
+@with_session
+async def hybrid(
+    session: AsyncSession,
+    app_context: AppContext,
+    top_k: int,
+    keywords: str,
+    code: str,
+) -> None:
+    """Search for snippets using hybrid search."""
+    repository = RetrievalRepository(session)
+    service = RetrievalService(
+        repository,
+        app_context.get_data_dir(),
+        embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
+    )
+    # Parse keywords into a list of strings
+    keywords_list = [k.strip().lower() for k in keywords.split(",")]
     snippets = await service.retrieve(
-        RetrievalRequest(keywords=query.split(","), top_k=top_k)
+        RetrievalRequest(keywords=keywords_list, code_query=code, top_k=top_k)
     )
     if len(snippets) == 0:

kodit/config.py CHANGED Viewed

@@ -11,12 +11,14 @@ from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from kodit.database import Database
+from kodit.embedding.embedding import TINY
 DEFAULT_BASE_DIR = Path.home() / ".kodit"
 DEFAULT_DB_URL = f"sqlite+aiosqlite:///{DEFAULT_BASE_DIR}/kodit.db"
 DEFAULT_LOG_LEVEL = "INFO"
 DEFAULT_LOG_FORMAT = "pretty"
 DEFAULT_DISABLE_TELEMETRY = False
+DEFAULT_EMBEDDING_MODEL_NAME = TINY
 T = TypeVar("T")

kodit/database.py CHANGED Viewed

@@ -15,7 +15,7 @@ from sqlalchemy.ext.asyncio import (
 )
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
-from kodit import alembic
+from kodit import migrations
 class Base(AsyncAttrs, DeclarativeBase):
@@ -57,7 +57,7 @@ class Database:
         # Create Alembic configuration and run migrations
         alembic_cfg = AlembicConfig()
         alembic_cfg.set_main_option(
-            "script_location", str(Path(alembic.__file__).parent)
+            "script_location", str(Path(migrations.__file__).parent)
         )
         alembic_cfg.set_main_option("sqlalchemy.url", db_url)
         self.log.debug("Running migrations", db_url=db_url)

kodit/embedding/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Embedding module."""

kodit/embedding/embedding.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Embedding service."""
+import os
+from collections.abc import Generator
+import structlog
+from sentence_transformers import SentenceTransformer
+TINY = "tiny"
+CODE = "code"
+TEST = "test"
+COMMON_EMBEDDING_MODELS = {
+    TINY: "ibm-granite/granite-embedding-30m-english",
+    CODE: "flax-sentence-embeddings/st-codesearch-distilroberta-base",
+    TEST: "minishlab/potion-base-4M",
+}
+class EmbeddingService:
+    """Service for embeddings."""
+    def __init__(self, model_name: str) -> None:
+        """Initialize the embedding service."""
+        self.log = structlog.get_logger(__name__)
+        self.model_name = COMMON_EMBEDDING_MODELS.get(model_name, model_name)
+        self.embedding_model = None
+    def _model(self) -> SentenceTransformer:
+        """Get the embedding model."""
+        if self.embedding_model is None:
+            os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
+            self.embedding_model = SentenceTransformer(
+                self.model_name,
+                trust_remote_code=True,
+                device="cpu",  # Force CPU so we don't have to install accelerate, etc.
+            )
+        return self.embedding_model
+    def embed(self, snippets: list[str]) -> Generator[list[float], None, None]:
+        """Embed a list of documents."""
+        model = self._model()
+        embeddings = model.encode(snippets, show_progress_bar=False, batch_size=4)
+        for embedding in embeddings:
+            yield [float(x) for x in embedding]
+    def query(self, query: list[str]) -> Generator[list[float], None, None]:
+        """Query the embedding model."""
+        model = self._model()
+        embeddings = model.encode(query, show_progress_bar=False, batch_size=4)
+        for embedding in embeddings:
+            yield [float(x) for x in embedding]

kodit/embedding/models.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Embedding models."""
+from enum import Enum
+from sqlalchemy import JSON, ForeignKey
+from sqlalchemy import Enum as SQLAlchemyEnum
+from sqlalchemy.orm import Mapped, mapped_column
+from kodit.database import Base, CommonMixin
+class EmbeddingType(Enum):
+    """Embedding type."""
+    CODE = 1
+    TEXT = 2
+class Embedding(Base, CommonMixin):
+    """Embedding model."""
+    __tablename__ = "embeddings"
+    snippet_id: Mapped[int] = mapped_column(ForeignKey("snippets.id"), index=True)
+    type: Mapped[EmbeddingType] = mapped_column(
+        SQLAlchemyEnum(EmbeddingType), index=True
+    )
+    embedding: Mapped[list[float]] = mapped_column(JSON)

kodit/indexing/repository.py CHANGED Viewed

@@ -11,6 +11,7 @@ from typing import TypeVar
 from sqlalchemy import delete, func, select
 from sqlalchemy.ext.asyncio import AsyncSession
+from kodit.embedding.models import Embedding
 from kodit.indexing.models import Index, Snippet
 from kodit.sources.models import File, Source
@@ -165,3 +166,13 @@ class IndexRepository:
         query = select(Snippet).order_by(Snippet.id)
         result = await self.session.execute(query)
         return list(result.scalars())
+    async def add_embedding(self, embedding: Embedding) -> None:
+        """Add a new embedding to the database.
+        Args:
+            embedding: The Embedding instance to add.
+        """
+        self.session.add(embedding)
+        await self.session.commit()

kodit/indexing/service.py CHANGED Viewed

@@ -14,6 +14,8 @@ import structlog
 from tqdm.asyncio import tqdm
 from kodit.bm25.bm25 import BM25Service
+from kodit.embedding.embedding import EmbeddingService
+from kodit.embedding.models import Embedding, EmbeddingType
 from kodit.indexing.models import Snippet
 from kodit.indexing.repository import IndexRepository
 from kodit.snippets.snippets import SnippetService
@@ -50,6 +52,7 @@ class IndexService:
         repository: IndexRepository,
         source_service: SourceService,
         data_dir: Path,
+        embedding_model_name: str,
     ) -> None:
         """Initialize the index service.
@@ -63,6 +66,7 @@ class IndexService:
         self.snippet_service = SnippetService()
         self.log = structlog.get_logger(__name__)
         self.bm25 = BM25Service(data_dir)
+        self.code_embedding_service = EmbeddingService(model_name=embedding_model_name)
     async def create(self, source_id: int) -> IndexView:
         """Create a new index for a source.
@@ -128,9 +132,26 @@ class IndexService:
         # Create snippets for supported file types
         await self._create_snippets(index_id)
-        # Update BM25 index
         snippets = await self.repository.get_all_snippets()
-        self.bm25.index([snippet.content for snippet in snippets])
+        self.log.info("Creating keyword index")
+        self.bm25.index(
+            [
+                snippet.content
+                for snippet in tqdm(snippets, total=len(snippets), leave=False)
+            ]
+        )
+        self.log.info("Creating semantic code index")
+        for snippet in tqdm(snippets, total=len(snippets), leave=False):
+            embedding = next(self.code_embedding_service.embed([snippet.content]))
+            await self.repository.add_embedding(
+                Embedding(
+                    snippet_id=snippet.id,
+                    embedding=embedding,
+                    type=EmbeddingType.CODE,
+                )
+            )
         # Update index timestamp
         await self.repository.update_index_timestamp(index)
@@ -148,7 +169,7 @@ class IndexService:
         """
         files = await self.repository.files_for_index(index_id)
-        for file in tqdm(files, total=len(files)):
+        for file in tqdm(files, total=len(files), leave=False):
             # Skip unsupported file types
             if file.mime_type in MIME_BLACKLIST:
                 self.log.debug("Skipping mime type", mime_type=file.mime_type)

kodit/{logging.py → log.py} RENAMED Viewed

@@ -87,7 +87,13 @@ def configure_logging(app_context: AppContext) -> None:
     # Configure uvicorn loggers to use our structlog setup
     # Uvicorn spits out loads of exception logs when sse server doesn't shut down
     # gracefully, so we hide them unless in DEBUG mode
-    for _log in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
+    for _log in [
+        "uvicorn",
+        "uvicorn.error",
+        "uvicorn.access",
+        "bm25s",
+        "sentence_transformers.SentenceTransformer",
+    ]:
         if root_logger.getEffectiveLevel() == logging.DEBUG:
             logging.getLogger(_log).handlers.clear()
             logging.getLogger(_log).propagate = True

kodit/mcp.py CHANGED Viewed

@@ -12,7 +12,7 @@ from pydantic import Field
 from sqlalchemy.ext.asyncio import AsyncSession
 from kodit._version import version
-from kodit.config import AppContext
+from kodit.config import DEFAULT_EMBEDDING_MODEL_NAME, AppContext
 from kodit.database import Database
 from kodit.retreival.repository import RetrievalRepository, RetrievalResult
 from kodit.retreival.service import RetrievalRequest, RetrievalService
@@ -115,18 +115,12 @@ async def retrieve_relevant_snippets(
     retrieval_service = RetrievalService(
         repository=retrieval_repository,
         data_dir=mcp_context.data_dir,
+        embedding_model_name=DEFAULT_EMBEDDING_MODEL_NAME,
     )
-    log.debug("Fusing input")
-    input_query = input_fusion(
-        user_intent=user_intent,
-        related_file_paths=related_file_paths,
-        related_file_contents=related_file_contents,
-        keywords=keywords,
-    )
-    log.debug("Input", input_query=input_query)
     retrieval_request = RetrievalRequest(
         keywords=keywords,
+        code_query="\n".join(related_file_contents),
     )
     log.debug("Retrieving snippets")
     snippets = await retrieval_service.retrieve(request=retrieval_request)

kodit/{alembic → migrations}/env.py RENAMED Viewed

@@ -8,6 +8,7 @@ from sqlalchemy import pool
 from sqlalchemy.engine import Connection
 from sqlalchemy.ext.asyncio import async_engine_from_config
+import kodit.embedding.models
 import kodit.indexing.models
 import kodit.sources.models
 from kodit.database import Base

kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py ADDED Viewed

@@ -0,0 +1,47 @@
+# ruff: noqa
+"""add embeddings table
+Revision ID: 7c3bbc2ab32b
+Revises: 85155663351e
+Create Date: 2025-05-23 17:23:09.924980
+"""
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+# revision identifiers, used by Alembic.
+revision: str = '7c3bbc2ab32b'
+down_revision: Union[str, None] = '85155663351e'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('embeddings',
+    sa.Column('snippet_id', sa.Integer(), nullable=False),
+    sa.Column('type', sa.Enum('CODE', 'TEXT', name='embeddingtype'), nullable=False),
+    sa.Column('embedding', sa.JSON(), nullable=False),
+    sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
+    sa.Column('created_at', sa.DateTime(), nullable=False),
+    sa.Column('updated_at', sa.DateTime(), nullable=False),
+    sa.ForeignKeyConstraint(['snippet_id'], ['snippets.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    op.create_index(op.f('ix_embeddings_snippet_id'), 'embeddings', ['snippet_id'], unique=False)
+    op.create_index(op.f('ix_embeddings_type'), 'embeddings', ['type'], unique=False)
+    # ### end Alembic commands ###
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(op.f('ix_embeddings_type'), table_name='embeddings')
+    op.drop_index(op.f('ix_embeddings_snippet_id'), table_name='embeddings')
+    op.drop_table('embeddings')
+    # ### end Alembic commands ###

kodit/retreival/repository.py CHANGED Viewed

@@ -7,10 +7,14 @@ and their associated file information.
 from typing import TypeVar
+import numpy as np
 import pydantic
-from sqlalchemy import select
+from sqlalchemy import (
+    select,
+)
 from sqlalchemy.ext.asyncio import AsyncSession
+from kodit.embedding.models import Embedding, EmbeddingType
 from kodit.indexing.models import Snippet
 from kodit.sources.models import File
@@ -24,8 +28,10 @@ class RetrievalResult(pydantic.BaseModel):
     and the matching snippet content.
     """
+    id: int
     uri: str
     content: str
+    score: float
 class RetrievalRepository:
@@ -69,8 +75,10 @@ class RetrievalRepository:
         return [
             RetrievalResult(
+                id=snippet.id,
                 uri=file.uri,
                 content=snippet.content,
+                score=1.0,
             )
             for snippet, file in results
         ]
@@ -90,7 +98,7 @@ class RetrievalRepository:
         """List snippets by IDs.
         Returns:
-            A list of snippets.
+            A list of snippets in the same order as the input IDs.
         """
         query = (
@@ -99,10 +107,125 @@ class RetrievalRepository:
             .join(File, Snippet.file_id == File.id)
         )
         rows = await self.session.execute(query)
-        return [
-            RetrievalResult(
+        # Create a dictionary for O(1) lookup of results by ID
+        id_to_result = {
+            snippet.id: RetrievalResult(
+                id=snippet.id,
                 uri=file.uri,
                 content=snippet.content,
+                score=1.0,
             )
             for snippet, file in rows.all()
-        ]
+        }
+        # Return results in the same order as input IDs
+        return [id_to_result[i] for i in ids]
+    async def fetch_embeddings(
+        self, embedding_type: EmbeddingType
+    ) -> list[tuple[int, list[float]]]:
+        """Fetch all embeddings of a given type from the database.
+        Args:
+            embedding_type: The type of embeddings to fetch
+        Returns:
+            List of (snippet_id, embedding) tuples
+        """
+        # Only select the fields we need and use a more efficient query
+        query = select(Embedding.snippet_id, Embedding.embedding).where(
+            Embedding.type == embedding_type
+        )
+        rows = await self.session.execute(query)
+        return [tuple(row) for row in rows.all()]  # Convert Row objects to tuples
+    def prepare_vectors(
+        self, embeddings: list[tuple[int, list[float]]], query_embedding: list[float]
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Convert embeddings to numpy arrays.
+        Args:
+            embeddings: List of (snippet_id, embedding) tuples
+            query_embedding: Query embedding vector
+        Returns:
+            Tuple of (stored_vectors, query_vector) as numpy arrays
+        """
+        stored_vecs = np.array(
+            [emb[1] for emb in embeddings]
+        )  # Use index 1 to get embedding
+        query_vec = np.array(query_embedding)
+        return stored_vecs, query_vec
+    def compute_similarities(
+        self, stored_vecs: np.ndarray, query_vec: np.ndarray
+    ) -> np.ndarray:
+        """Compute cosine similarities between stored vectors and query vector.
+        Args:
+            stored_vecs: Array of stored embedding vectors
+            query_vec: Query embedding vector
+        Returns:
+            Array of similarity scores
+        """
+        stored_norms = np.linalg.norm(stored_vecs, axis=1)
+        query_norm = np.linalg.norm(query_vec)
+        return np.dot(stored_vecs, query_vec) / (stored_norms * query_norm)
+    def get_top_k_results(
+        self,
+        similarities: np.ndarray,
+        embeddings: list[tuple[int, list[float]]],
+        top_k: int,
+    ) -> list[tuple[int, float]]:
+        """Get top-k results by similarity score.
+        Args:
+            similarities: Array of similarity scores
+            embeddings: List of (snippet_id, embedding) tuples
+            top_k: Number of results to return
+        Returns:
+            List of (snippet_id, similarity_score) tuples
+        """
+        top_indices = np.argsort(similarities)[::-1][:top_k]
+        return [
+            (embeddings[i][0], float(similarities[i])) for i in top_indices
+        ]  # Use index 0 to get snippet_id
+    async def list_semantic_results(
+        self, embedding_type: EmbeddingType, embedding: list[float], top_k: int = 10
+    ) -> list[tuple[int, float]]:
+        """List semantic results using cosine similarity.
+        This implementation fetches all embeddings of the given type and computes
+        cosine similarity in Python using NumPy for better performance.
+        Args:
+            embedding_type: The type of embeddings to search
+            embedding: The query embedding vector
+            top_k: Number of results to return
+        Returns:
+            List of (snippet_id, similarity_score) tuples, sorted by similarity
+        """
+        # Step 1: Fetch embeddings from database
+        embeddings = await self.fetch_embeddings(embedding_type)
+        if not embeddings:
+            return []
+        # Step 2: Convert to numpy arrays
+        stored_vecs, query_vec = self.prepare_vectors(embeddings, embedding)
+        # Step 3: Compute similarities
+        similarities = self.compute_similarities(stored_vecs, query_vec)
+        # Step 4: Get top-k results
+        return self.get_top_k_results(similarities, embeddings, top_k)

kodit/retreival/service.py CHANGED Viewed

@@ -6,13 +6,16 @@ import pydantic
 import structlog
 from kodit.bm25.bm25 import BM25Service
+from kodit.embedding.embedding import EmbeddingService
+from kodit.embedding.models import EmbeddingType
 from kodit.retreival.repository import RetrievalRepository, RetrievalResult
 class RetrievalRequest(pydantic.BaseModel):
     """Request for a retrieval."""
-    keywords: list[str]
+    code_query: str | None = None
+    keywords: list[str] | None = None
     top_k: int = 10
@@ -26,44 +29,96 @@ class Snippet(pydantic.BaseModel):
 class RetrievalService:
     """Service for retrieving relevant data."""
-    def __init__(self, repository: RetrievalRepository, data_dir: Path) -> None:
+    def __init__(
+        self,
+        repository: RetrievalRepository,
+        data_dir: Path,
+        embedding_model_name: str,
+    ) -> None:
         """Initialize the retrieval service."""
         self.repository = repository
         self.log = structlog.get_logger(__name__)
         self.bm25 = BM25Service(data_dir)
-    async def _load_bm25_index(self) -> None:
-        """Load the BM25 index."""
+        self.code_embedding_service = EmbeddingService(model_name=embedding_model_name)
     async def retrieve(self, request: RetrievalRequest) -> list[RetrievalResult]:
         """Retrieve relevant data."""
-        snippet_ids = await self.repository.list_snippet_ids()
+        fusion_list = []
+        if request.keywords:
+            snippet_ids = await self.repository.list_snippet_ids()
+            # Gather results for each keyword
+            result_ids: list[tuple[int, float]] = []
+            for keyword in request.keywords:
+                results = self.bm25.retrieve(snippet_ids, keyword, request.top_k)
+                result_ids.extend(results)
+            # Sort results by score
+            result_ids.sort(key=lambda x: x[1], reverse=True)
+            self.log.debug("Retrieval results (BM25)", results=result_ids)
+            bm25_results = [x[0] for x in result_ids]
+            fusion_list.append(bm25_results)
+        # Compute embedding for semantic query
+        semantic_results = []
+        if request.code_query:
+            query_embedding = next(
+                self.code_embedding_service.query([request.code_query])
+            )
+            query_results = await self.repository.list_semantic_results(
+                EmbeddingType.CODE, query_embedding, top_k=request.top_k
+            )
+            # Sort results by score
+            query_results.sort(key=lambda x: x[1], reverse=True)
-        # Gather results for each keyword
-        result_ids: list[tuple[int, float]] = []
-        for keyword in request.keywords:
-            results = self.bm25.retrieve(snippet_ids, keyword, request.top_k)
-            result_ids.extend(results)
+            # Extract the snippet ids from the query results
+            semantic_results = [x[0] for x in query_results]
+            fusion_list.append(semantic_results)
-        if len(result_ids) == 0:
+        if len(fusion_list) == 0:
             return []
-        # Sort results by score
-        result_ids.sort(key=lambda x: x[1], reverse=True)
+        # Combine all results together with RFF if required
+        final_results = reciprocal_rank_fusion(fusion_list, k=60)
+        # Extract ids from final results
+        final_ids = [x[0] for x in final_results]
+        # Get snippets from database (up to top_k)
+        return await self.repository.list_snippets_by_ids(final_ids[: request.top_k])
+def reciprocal_rank_fusion(
+    rankings: list[list[int]], k: float = 60
+) -> list[tuple[int, float]]:
+    """RRF prioritises results that are present in all results.
+    Args:
+        rankings: List of rankers, each containing a list of document ids. Top of the
+        list is considered to be the best result.
+        k: Parameter for RRF.
+    Returns:
+        Dictionary of ids and their scores.
+    """
+    scores = {}
+    for ranker in rankings:
+        for rank in ranker:
+            scores[rank] = float(0)
-        self.log.debug(
-            "Retrieval results",
-            total_results=len(result_ids),
-            max_score=result_ids[0][1],
-            min_score=result_ids[-1][1],
-            median_score=result_ids[len(result_ids) // 2][1],
-        )
+    for ranker in rankings:
+        for i, rank in enumerate(ranker):
+            scores[rank] += 1.0 / (k + i)
-        # Don't return zero score results
-        result_ids = [x for x in result_ids if x[1] > 0]
+    # Create a list of tuples of ids and their scores
+    results = [(rank, scores[rank]) for rank in scores]
-        # Build final list of doc ids up to top_k
-        final_doc_ids = [x[0] for x in result_ids[: request.top_k]]
+    # Sort results by score
+    results.sort(key=lambda x: x[1], reverse=True)
-        # Get snippets from database
-        return await self.repository.list_snippets_by_ids(final_doc_ids)
+    return results

kodit/sources/service.py CHANGED Viewed

@@ -165,7 +165,7 @@ class SourceService:
             file_count = sum(1 for _ in clone_path.rglob("*") if _.is_file())
             # Process each file in the source directory
-            for path in tqdm(clone_path.rglob("*"), total=file_count):
+            for path in tqdm(clone_path.rglob("*"), total=file_count, leave=False):
                 await self._process_file(source.id, path.absolute())
         return SourceView(
@@ -212,7 +212,7 @@ class SourceService:
             file_count = sum(1 for _ in clone_path.rglob("*") if _.is_file())
             # Process each file in the source directory
-            for path in tqdm(clone_path.rglob("*"), total=file_count):
+            for path in tqdm(clone_path.rglob("*"), total=file_count, leave=False):
                 await self._process_file(source.id, path.absolute())
         return SourceView(

{kodit-0.1.9.dist-info → kodit-0.1.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kodit
-Version: 0.1.9
+Version: 0.1.11
 Summary: Code indexing for better AI code generation
 Project-URL: Homepage, https://docs.helixml.tech/kodit/
 Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -29,11 +29,13 @@ Requires-Dist: dotenv>=0.9.9
 Requires-Dist: fastapi[standard]>=0.115.12
 Requires-Dist: fastmcp>=2.3.3
 Requires-Dist: gitpython>=3.1.44
+Requires-Dist: hf-xet>=1.1.2
 Requires-Dist: httpx-retries>=0.3.2
 Requires-Dist: httpx>=0.28.1
 Requires-Dist: posthog>=4.0.1
 Requires-Dist: pydantic-settings>=2.9.1
 Requires-Dist: pytable-formatter>=0.1.1
+Requires-Dist: sentence-transformers>=4.1.0
 Requires-Dist: sqlalchemy[asyncio]>=2.0.40
 Requires-Dist: structlog>=25.3.0
 Requires-Dist: tdqm>=0.0.1

kodit-0.1.11.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,44 @@
+kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
+kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
+kodit/_version.py,sha256=xfwL5IZGNNwnNDAQtGFjpvlNxqYn3U9IM9B98Du9pJw,513
+kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
+kodit/cli.py,sha256=qEQy_Sd64cEV5KzYsKlGLyMxFQ4fFi-as4QO8CRrKYo,8978
+kodit/config.py,sha256=hQshTMW_8jpk94zP-1JaxowgmW_LrT534ipHFaRUGMw,3006
+kodit/database.py,sha256=kekSdyEATdb47jxzQemkSOXMNOwnUwmVVTpn9hYaDK8,2356
+kodit/log.py,sha256=PhyzQktEyyHaNr78W0wmL-RSRuq311DQ-d0l-EKTGmQ,5417
+kodit/mcp.py,sha256=qp16vRb0TY46-xQy179iWgYebr6Ju_Z91ZSzZnWPHuk,4771
+kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
+kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
+kodit/bm25/bm25.py,sha256=NtlcLrgqJja11qDGKz_U6tuYWaS9sfbyS-TcA__rBKs,2284
+kodit/embedding/__init__.py,sha256=h9NXzDA1r-K23nvBajBV-RJzHJN0p3UJ7UQsmdnOoRw,24
+kodit/embedding/embedding.py,sha256=X2Fa-eXhQwp__QFj9yxIhvlCAiYVQSaZ2y18ZtG5_1Y,1810
+kodit/embedding/models.py,sha256=rN90vSs86dYiqoawcp8E9jtwY31JoJXYfaDlsJK7uqc,656
+kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
+kodit/indexing/models.py,sha256=sZIhGwvL4Dw0QTWFxrjfWctSLkAoDT6fv5DlGz8-Fr8,1258
+kodit/indexing/repository.py,sha256=eIaIbqNs9Z3XTVymZ5Zl5uPWveqiEXNo0JTa-y-Tl24,5430
+kodit/indexing/service.py,sha256=hhQ_6vI7J7LnNgOLbsO4B07TOJvEePqqFviiqr3TL_M,6579
+kodit/migrations/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
+kodit/migrations/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
+kodit/migrations/env.py,sha256=bzB6vod_tO-X2F_G671FwYSAn0pyhNw8M1kG4MgidO8,2444
+kodit/migrations/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
+kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py,sha256=-61qol9PfQKILCDQRA5jEaats9aGZs9Wdtp-j-38SF4,1644
+kodit/migrations/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
+kodit/migrations/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
+kodit/retreival/__init__.py,sha256=33PhJU-3gtsqYq6A1UkaLNKbev_Zee9Lq6dYC59-CsA,69
+kodit/retreival/repository.py,sha256=XHkkeUsnXSrrcthJOL9FXgivn5kkaPnC9Qci6ebwjZc,7294
+kodit/retreival/service.py,sha256=gGp74jnqhyCDF5vKOrN2dJKDnhlfR4HZaxADSrjTb4s,3778
+kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
+kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
+kodit/snippets/snippets.py,sha256=QumvhltWoxXw41SyKb-RbSvAr3m6V3lUy9n0AI8jcto,1409
+kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
+kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
+kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
+kodit/sources/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
+kodit/sources/models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
+kodit/sources/repository.py,sha256=mGJrHWH6Uo8YABdoojHFbzaf_jW-2ywJpAHIa1gnc3U,3401
+kodit/sources/service.py,sha256=aV_qiqkU2kMBNPvye5_v4NnZiK-lJ64rQdmFtBtsQaY,9243
+kodit-0.1.11.dist-info/METADATA,sha256=yUO645VYUiVrJMRtwNB71O-6qvC94nS7_ILQ8eQEvoY,2288
+kodit-0.1.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kodit-0.1.11.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
+kodit-0.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+kodit-0.1.11.dist-info/RECORD,,

kodit-0.1.9.dist-info/RECORD DELETED Viewed

@@ -1,40 +0,0 @@
-kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
-kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
-kodit/_version.py,sha256=bhntibG3PKk5Ai3XlSNEV8gj-ffItuKloY6vzWn6swo,511
-kodit/app.py,sha256=Mr5BFHOHx5zppwjC4XPWVvHjwgl1yrKbUjTWXKubJQM,891
-kodit/cli.py,sha256=bsfURvGKZzpHkChnTlatI0nXHV3KV_6vJnUJ2fQEAfM,6637
-kodit/config.py,sha256=nlm9U-nVx5riH2SrU1XY4XcCMhQK4DrwO_1H8bPOBjA,2927
-kodit/database.py,sha256=vtTlmrXHyHJH3Ek-twZTCqEjB0jun-NncALFze2fqhA,2350
-kodit/logging.py,sha256=cFEQXWI27LzWScSxly9ApwkbBDamUG17pA-jEfVakXQ,5316
-kodit/mcp.py,sha256=PxTHVPlIErrruFKzmEPIWZjN6cfEhcQmj6nOU9EsBy4,4905
-kodit/middleware.py,sha256=I6FOkqG9-8RH5kR1-0ZoQWfE4qLCB8lZYv8H_OCH29o,2714
-kodit/alembic/README,sha256=ISVtAOvqvKk_5ThM5ioJE-lMkvf9IbknFUFVU_vPma4,58
-kodit/alembic/__init__.py,sha256=lP5MuwlyWRMO6UcDWnQcQ3G-GYHcFb6rl9gYPHJ1sjo,40
-kodit/alembic/env.py,sha256=kcQiglu2KpNTAf37CsKVs_HXxOe6S7sXJ00pHGSCqno,2414
-kodit/alembic/script.py.mako,sha256=zWziKtiwYKEWuwPV_HBNHwa9LCT45_bi01-uSNFaOOE,703
-kodit/alembic/versions/85155663351e_initial.py,sha256=Cg7zlF871o9ShV5rQMQ1v7hRV7fI59veDY9cjtTrs-8,3306
-kodit/alembic/versions/__init__.py,sha256=9-lHzptItTzq_fomdIRBegQNm4Znx6pVjwD4MiqRIdo,36
-kodit/bm25/__init__.py,sha256=j8zyriNWhbwE5Lbybzg1hQAhANlU9mKHWw4beeUR6og,19
-kodit/bm25/bm25.py,sha256=3wyNRSrTaYqV7s4R1D6X0NpCf22PuFK2_uc8YapzYLE,2263
-kodit/indexing/__init__.py,sha256=cPyi2Iej3G1JFWlWr7X80_UrsMaTu5W5rBwgif1B3xo,75
-kodit/indexing/models.py,sha256=sZIhGwvL4Dw0QTWFxrjfWctSLkAoDT6fv5DlGz8-Fr8,1258
-kodit/indexing/repository.py,sha256=ZicLPXPKQxW6NnY_anmZ4nI1-FGkrJsqjg0NK-vvnTY,5117
-kodit/indexing/service.py,sha256=rLWYI70VytlJAyZtQC5Xpqtj9f3EzbivzgeM_1L9BUU,5751
-kodit/retreival/__init__.py,sha256=33PhJU-3gtsqYq6A1UkaLNKbev_Zee9Lq6dYC59-CsA,69
-kodit/retreival/repository.py,sha256=1lqGgJHsBmvMGMzEYa-hrdXg2q7rqtYPl1cvBb7jMRE,3119
-kodit/retreival/service.py,sha256=9wvURtPPJVvPUWNIC2waIrJMxcm1Ka1J_xDEOEedAFU,2007
-kodit/snippets/__init__.py,sha256=-2coNoCRjTixU9KcP6alpmt7zqf37tCRWH3D7FPJ8dg,48
-kodit/snippets/method_snippets.py,sha256=EVHhSNWahAC5nSXv9fWVFJY2yq25goHdCSCuENC07F8,4145
-kodit/snippets/snippets.py,sha256=QumvhltWoxXw41SyKb-RbSvAr3m6V3lUy9n0AI8jcto,1409
-kodit/snippets/languages/__init__.py,sha256=Bj5KKZSls2MQ8ZY1S_nHg447MgGZW-2WZM-oq6vjwwA,1187
-kodit/snippets/languages/csharp.scm,sha256=gbBN4RiV1FBuTJF6orSnDFi8H9JwTw-d4piLJYsWUsc,222
-kodit/snippets/languages/python.scm,sha256=ee85R9PBzwye3IMTE7-iVoKWd_ViU3EJISTyrFGrVeo,429
-kodit/sources/__init__.py,sha256=1NTZyPdjThVQpZO1Mp1ColVsS7sqYanOVLqnoqV9Ipo,83
-kodit/sources/models.py,sha256=xb42CaNDO1CUB8SIW-xXMrB6Ji8cFw-yeJ550xBEg9Q,2398
-kodit/sources/repository.py,sha256=mGJrHWH6Uo8YABdoojHFbzaf_jW-2ywJpAHIa1gnc3U,3401
-kodit/sources/service.py,sha256=hqAjGFVhvtePhMrK1Aprj__Mq2PLjVq8CsWMBoA3_Qw,9217
-kodit-0.1.9.dist-info/METADATA,sha256=MAqVxrLPrTV3Ihcix_3YHQNq9qyuD1OEavYHV76qli8,2214
-kodit-0.1.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kodit-0.1.9.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
-kodit-0.1.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-kodit-0.1.9.dist-info/RECORD,,