codeembed-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeembed/__init__.py +59 -0
- codeembed/bootstrap/__init__.py +17 -0
- codeembed/bootstrap/services.py +220 -0
- codeembed/cli.py +454 -0
- codeembed/config/__init__.py +5 -0
- codeembed/config/models.py +13 -0
- codeembed/cost_tracking/__init__.py +7 -0
- codeembed/cost_tracking/llm_wrapper.py +39 -0
- codeembed/cost_tracking/models.py +52 -0
- codeembed/delta_computer/__init__.py +5 -0
- codeembed/delta_computer/delta_computer.py +75 -0
- codeembed/doc_embedder/__init__.py +5 -0
- codeembed/doc_embedder/doc_embedder.py +134 -0
- codeembed/doc_provider/__init__.py +10 -0
- codeembed/doc_provider/base.py +14 -0
- codeembed/doc_provider/local_doc_provider.py +58 -0
- codeembed/doc_provider/models.py +20 -0
- codeembed/doc_search_service/__init__.py +5 -0
- codeembed/doc_search_service/doc_search_service.py +48 -0
- codeembed/doc_splitters/__init__.py +8 -0
- codeembed/doc_splitters/generic_splitter.py +165 -0
- codeembed/doc_splitters/models.py +14 -0
- codeembed/llm/__init__.py +13 -0
- codeembed/llm/base.py +31 -0
- codeembed/llm/models.py +27 -0
- codeembed/llm/ollama_adapter.py +64 -0
- codeembed/llm/openai_adapter.py +96 -0
- codeembed/mcp_server.py +45 -0
- codeembed/setup_logger.py +34 -0
- codeembed/utils/__init__.py +9 -0
- codeembed/utils/checksum_utils.py +5 -0
- codeembed/utils/string_utils.py +5 -0
- codeembed/utils/time_utils.py +5 -0
- codeembed/vector_db/__init__.py +9 -0
- codeembed/vector_db/base.py +27 -0
- codeembed/vector_db/chromadb_adapter.py +130 -0
- codeembed/vector_db/models.py +16 -0
- codeembed-0.1.0.dist-info/METADATA +292 -0
- codeembed-0.1.0.dist-info/RECORD +42 -0
- codeembed-0.1.0.dist-info/WHEEL +4 -0
- codeembed-0.1.0.dist-info/entry_points.txt +2 -0
- codeembed-0.1.0.dist-info/licenses/LICENSE +21 -0
codeembed/doc_embedder/doc_embedder.py
ADDED
@@ -0,0 +1,134 @@
```python
import logging
from typing import List
from uuid import uuid4

from codeembed.delta_computer.delta_computer import DeltaComputer
from codeembed.doc_provider.base import DocProviderBase
from codeembed.doc_splitters.generic_splitter import FileSplitter
from codeembed.doc_splitters.models import FileSegment
from codeembed.llm.base import LLMServiceBase
from codeembed.llm.models import ChatMessage
from codeembed.vector_db.base import VectorDbBase
from codeembed.vector_db.models import Chunk

logger = logging.getLogger(__name__)


def _segment_to_chunk(
    llm_service: LLMServiceBase,
    segment: FileSegment,
    full_content: str,
    file_path: str,
    llm_model: str,
) -> str:

    # NOTE: For markdown files we could embed directly without LLM summarization.
    # Just split on ## headers.

    logger.info("Analyzing segment %s in file %s...", segment.content.split("\n")[0], file_path)

    messages: List[ChatMessage] = [
        {"role": "system", "content": "You are an expert at describing code."},
        {
            "role": "user",
            "content": f"""In the context of the following file:
<File Path>{file_path}</File Path>
<FileContent>
{full_content}
</FileContent>
Please describe the purpose of the following code/text segment:
<Segment>
<Line Start>{segment.line_start}</Line Start>
<Content>
{segment.content}
</Content>
<Line End>{segment.line_end}</Line End>
</Segment>
If this is a function or class, please describe what it does and how it interacts with the application.
If it is a text paragraph, explain what it covers.
Focus on the key aspects of the text or code.
Write a succinct summary.
Return the summary only without any additional comments.
Start with, e.g.,
This <segment type> is ...
""",
        },
    ]

    result = llm_service.generate_response(messages, llm_model, max_tokens=1024, temperature=0.3)

    logger.info("Generated summary for segment in file %s: %s", file_path, result.response)

    return result.response


class DocEmbedder:
    def __init__(
        self,
        doc_provider: DocProviderBase,
        vector_db: VectorDbBase,
        llm_service: LLMServiceBase,
        llm_model: str,
        debounce_seconds: int = 10,
    ) -> None:
        self._doc_provider = doc_provider
        self._vector_db = vector_db
        self._llm_service = llm_service
        self._llm_model = llm_model
        self._debounce_seconds = debounce_seconds

    def embed_codebase(self) -> None:
        """Embeds the codebase and prepares it for vector search."""

        logger.info("Computing deltas...")

        chunks_ids_to_remove, files_to_update = DeltaComputer(
            self._doc_provider, self._vector_db, self._debounce_seconds
        ).compute_deltas()

        logger.info(f"Detected {len(chunks_ids_to_remove)} chunks to delete from vector database.")
        logger.info(f"Detected {len(files_to_update)} files to process.")

        if chunks_ids_to_remove:
            logger.info(f"Deleting {len(chunks_ids_to_remove)} chunks from vector database.")
            self._vector_db.delete_chunks(list(chunks_ids_to_remove))

        logger.info(f"Processing {len(files_to_update)} files...")

        num_processed = 0
        num_skipped = 0

        splitter = FileSplitter()

        for i, file in enumerate(files_to_update):
            logger.info(f"Processing file '{file}' ({i + 1}/{len(files_to_update)})...")
            doc = self._doc_provider.get_content(file)
            segments = splitter.split_file(doc.content, file)
            chunks = []
            for segment in segments:
                summary = _segment_to_chunk(self._llm_service, segment, doc.content, file, self._llm_model)
                chunks.append(
                    Chunk(
                        id=uuid4(),
                        modified_at=doc.modified_at,
                        content=summary,
                        file_path=file,
                        line_start=segment.line_start,
                        line_end=segment.line_end,
                        raw_code=segment.content,
                        file_sha256_checksum=doc.sha256_checksum,
                    )
                )
            if not chunks:
                logger.warning(f"No chunks generated for file '{file}'. Skipping embedding for this file.")
                num_skipped += 1
                continue
            logger.info(f"Saving {len(chunks)} chunks to vector database.")
            self._vector_db.add_chunks(chunks)
            num_processed += 1
            logger.info(f"Successfully embedded file: '{file}' ({i + 1}/{len(files_to_update)}).")

        if num_processed > 0:
            logger.info(f"Successfully embedded {num_processed} files.")
        if num_skipped > 0:
            logger.warning(f"Skipped processing {num_skipped} files.")
```

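For orientation, here is a minimal wiring sketch for `DocEmbedder`. The `LocalDocProvider`, `OllamaLLMService`, and `DocEmbedder` signatures come from this diff; the `ChromaDbAdapter` class name and constructor are guessed from the `chromadb_adapter.py` file name (its contents are not shown above), and `llama3.1` is only an example model tag.

```python
# Hypothetical wiring sketch: ChromaDbAdapter's name/constructor and the model
# tag are assumptions; only the other signatures appear in this diff.
from codeembed.doc_embedder.doc_embedder import DocEmbedder
from codeembed.doc_provider.local_doc_provider import LocalDocProvider
from codeembed.llm.ollama_adapter import OllamaLLMService
from codeembed.vector_db.chromadb_adapter import ChromaDbAdapter

provider = LocalDocProvider(base_path=".", supported_file_extensions=[".py", ".md", ".ts"])
vector_db = ChromaDbAdapter()  # assumed constructor
llm_service = OllamaLLMService()

DocEmbedder(provider, vector_db, llm_service, llm_model="llama3.1").embed_codebase()
```
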
codeembed/doc_provider/__init__.py
ADDED
@@ -0,0 +1,10 @@
```python
from codeembed.doc_provider.base import DocProviderBase
from codeembed.doc_provider.local_doc_provider import LocalDocProvider
from codeembed.doc_provider.models import DocumentContent, DocumentMeta

__all__ = [
    "DocProviderBase",
    "DocumentContent",
    "DocumentMeta",
    "LocalDocProvider",
]
```

codeembed/doc_provider/base.py
ADDED
@@ -0,0 +1,14 @@
```python
from abc import ABC, abstractmethod
from typing import Iterator

from codeembed.doc_provider.models import DocumentContent, DocumentMeta


class DocProviderBase(ABC):
    @abstractmethod
    def iter(self) -> Iterator[DocumentMeta]:
        """Iterates metadata of files."""

    @abstractmethod
    def get_content(self, file_path: str) -> DocumentContent:
        """Gets the actual file content."""
```

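Because `DocProviderBase` is a two-method ABC, an in-memory provider for tests takes only a few lines. A hypothetical sketch (the `InMemoryDocProvider` name and behavior are illustrative, not part of the package):

```python
from datetime import datetime, timezone
from typing import Dict, Iterator

from codeembed.doc_provider.base import DocProviderBase
from codeembed.doc_provider.models import DocumentContent, DocumentMeta


class InMemoryDocProvider(DocProviderBase):
    """Hypothetical test double that serves documents from a dict."""

    def __init__(self, files: Dict[str, str]) -> None:
        self._files = files
        self._now = datetime.now(tz=timezone.utc)

    def iter(self) -> Iterator[DocumentMeta]:
        for path in self._files:
            yield DocumentMeta(file_path=path, modified_at=self._now)

    def get_content(self, file_path: str) -> DocumentContent:
        return DocumentContent(content=self._files[file_path], modified_at=self._now)
```
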
codeembed/doc_provider/local_doc_provider.py
ADDED
@@ -0,0 +1,58 @@
```python
import os
import subprocess
from datetime import datetime, timezone
from typing import Iterator, List

from codeembed.doc_provider.base import DocProviderBase
from codeembed.doc_provider.models import DocumentContent, DocumentMeta

_SKIP_DIRS = frozenset({"venv", ".venv", "node_modules", "dist", "build"})
_SKIP_FILES = frozenset({"__init__.py", ".env", ".env.local", "appsettings.json", "appsettings.Development.json"})


def _get_git_files(base_path: str) -> set[str]:
    result = subprocess.run(
        ["git", "ls-files", "--cached", "--others", "--exclude-standard"],
        cwd=base_path,
        capture_output=True,
        text=True,
    )
    return set(result.stdout.splitlines())


class LocalDocProvider(DocProviderBase):
    def __init__(self, base_path: str, supported_file_extensions: List[str]) -> None:
        self._base_path = base_path
        self._supported_file_extensions = [ext.lower().split(".")[-1] for ext in supported_file_extensions]

    def iter(self) -> Iterator[DocumentMeta]:

        file_paths = _get_git_files(self._base_path)

        docs: List[DocumentMeta] = []
        for file_path in file_paths:
            ext = file_path.split(".")[-1]
            if ext.lower() not in self._supported_file_extensions:
                continue

            parts = file_path.split("/")
            if parts[-1] in _SKIP_FILES or any(d in _SKIP_DIRS for d in parts[:-1]):
                continue

            try:
                modified_ts = os.path.getmtime(os.path.join(self._base_path, file_path))  # git paths are relative to base_path
                modified_at = datetime.fromtimestamp(modified_ts, tz=timezone.utc)
            except OSError:
                continue

            docs.append(DocumentMeta(file_path=file_path, modified_at=modified_at))

        docs.sort(key=lambda d: d.modified_at, reverse=True)
        yield from docs

    def get_content(self, file_path: str) -> DocumentContent:
        with open(os.path.join(self._base_path, file_path), "r", encoding="utf-8") as f:
            content = f.read()
        modified_ts = os.path.getmtime(os.path.join(self._base_path, file_path))
        modified_at = datetime.fromtimestamp(modified_ts, tz=timezone.utc)
        return DocumentContent(content=content, modified_at=modified_at)
```

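File discovery goes through `git ls-files`, so `.gitignore` rules apply automatically on top of the `_SKIP_DIRS`/`_SKIP_FILES` filters. A small usage sketch (the base path and extensions are illustrative):

```python
from codeembed.doc_provider.local_doc_provider import LocalDocProvider

provider = LocalDocProvider(base_path=".", supported_file_extensions=[".py", ".md"])
for meta in provider.iter():  # newest files first, per the sort above
    print(meta.modified_at.isoformat(), meta.file_path)
```
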
codeembed/doc_provider/models.py
ADDED
@@ -0,0 +1,20 @@
```python
from dataclasses import dataclass
from datetime import datetime

from codeembed.utils.checksum_utils import string_to_sha256


@dataclass
class DocumentMeta:
    file_path: str
    modified_at: datetime


@dataclass
class DocumentContent:
    content: str
    modified_at: datetime

    @property
    def sha256_checksum(self) -> str:
        return string_to_sha256(self.content)
```

codeembed/doc_search_service/doc_search_service.py
ADDED
@@ -0,0 +1,48 @@
```python
from typing import Dict, List

from codeembed.utils.string_utils import truncate_string
from codeembed.vector_db.base import VectorDbBase
from codeembed.vector_db.models import Chunk


class DocSearchService:
    """
    Searches the vector database for relevant content and formats it for LLM consumption.
    """

    def __init__(
        self,
        vector_db: VectorDbBase,
    ) -> None:
        self._vector_db = vector_db

    def search(self, query: str, top_n: int = 10) -> str:
        """Searches the vector database for relevant content and formats it for LLM consumption."""
        chunks = self._vector_db.search(query, top_n)

        chunks_by_file: Dict[str, List[Chunk]] = {}

        for chunk in chunks:
            if chunk.file_path not in chunks_by_file:
                chunks_by_file[chunk.file_path] = []
            chunks_by_file[chunk.file_path].append(chunk)

        res = f"<SearchQuery>{query}</SearchQuery>\n"
        res += f"<TopN>{top_n}</TopN>\n"
        res += f"<Results chunkCount={len(chunks)} fileCount={len(chunks_by_file)}>\n"
        for file_path, file_chunks in chunks_by_file.items():
            res += f' <File path="{file_path}">\n'
            for chunk in file_chunks:
                # NOTE: Consider truncating by number of tokens.
                raw_code = chunk.raw_code if chunk.raw_code else ""
                res += " <Chunk>\n"
                res += f" <Summary>\n{truncate_string(chunk.content, 4096)}\n </Summary>\n"
                res += (
                    f' <RawCode lines="{chunk.line_start}-{chunk.line_end}">\n'
                    f"{truncate_string(raw_code, 4096)}\n"
                    f" </RawCode>\n"
                )
                res += " </Chunk>\n"
            res += " </File>\n"
        res += "</Results>\n"
        return res
```

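For intuition, a single-file hit produces pseudo-XML along these lines (the query, path, and contents are made up and abridged):

```
<SearchQuery>how are files split?</SearchQuery>
<TopN>10</TopN>
<Results chunkCount=1 fileCount=1>
 <File path="codeembed/doc_splitters/generic_splitter.py">
 <Chunk>
 <Summary>
This function is ...
 </Summary>
 <RawCode lines="41-79">
def _split_by_fixed_length(...):
    ...
 </RawCode>
 </Chunk>
 </File>
</Results>
```
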
codeembed/doc_splitters/generic_splitter.py
ADDED
@@ -0,0 +1,165 @@
```python
from typing import Dict, List

import tiktoken

from codeembed.doc_splitters.models import FileSegment

_encoder = tiktoken.get_encoding("o200k_base")

_SPLIT_KEYWORDS: Dict[str, List[str]] = {
    "py": ["class ", "def "],
    "md": ["## "],
    "ts": [
        "export function ",
        "export class ",
        "export const ",
        "export interface ",
        "export type ",
        "export default ",
        "function ",
        "class ",
    ],
    "tsx": [
        "export function ",
        "export class ",
        "export const ",
        "export interface ",
        "export type ",
        "export default ",
        "function ",
        "class ",
    ],
    "js": ["export function ", "export class ", "export const ", "export default ", "function ", "class "],
    "jsx": ["export function ", "export class ", "export const ", "export default ", "function ", "class "],
}


def _count_tokens(text: str) -> int:
    return len(_encoder.encode(text))


def _split_by_fixed_length(
    content: str,
    max_tokens: int = 512,
    overlap_lines: int = 5,
    line_offset: int = 0,
) -> List[FileSegment]:
    lines = content.splitlines()
    chunks: List[FileSegment] = []
    chunk: List[str] = []
    chunk_tokens = 0
    chunk_start = 0

    for i, line in enumerate(lines):
        line_tokens = _count_tokens(line)
        if chunk_tokens + line_tokens > max_tokens and chunk:
            chunks.append(
                FileSegment(
                    line_start=line_offset + chunk_start,
                    line_end=line_offset + i,
                    content="\n".join(chunk),
                )
            )
            overlap = chunk[-overlap_lines:]
            chunk = overlap
            chunk_tokens = sum(_count_tokens(ln) for ln in chunk)
            chunk_start = i - len(overlap)
        chunk.append(line)
        chunk_tokens += line_tokens

    if chunk:
        chunks.append(
            FileSegment(
                line_start=line_offset + chunk_start,
                line_end=line_offset + len(lines),
                content="\n".join(chunk),
            )
        )

    return chunks


def _detect_splits(content: str, split_keywords: List[str]) -> List[int]:
    split_lines = []
    for i, line in enumerate(content.splitlines()):
        for keyword in split_keywords:
            if line.startswith(keyword):
                split_lines.append(i)
                break
    if not split_lines or split_lines[0] != 0:
        split_lines.insert(0, 0)
    return split_lines


def _apply_splits(
    content: str,
    split_lines: List[int],
    max_tokens: int = 512,
    overlap_lines: int = 5,
) -> List[FileSegment]:
    segments = []
    lines = content.splitlines()

    for i in range(len(split_lines)):
        split_start = split_lines[i]
        split_end = split_lines[i + 1] if i + 1 < len(split_lines) else len(lines)

        # Scan backwards to the nearest empty line so decorators/comments are included
        actual_start = split_start
        for j in range(split_start - 1, -1, -1):
            if not lines[j].strip():
                actual_start = j + 1
                break
        else:
            if split_start > 0:
                actual_start = 0

        if actual_start == split_end:
            continue

        segment_content = "\n".join(lines[actual_start:split_end])

        if _count_tokens(segment_content) <= max_tokens:
            segments.append(
                FileSegment(
                    line_start=actual_start,
                    line_end=split_end,
                    content=segment_content,
                )
            )
        else:
            segments.extend(
                _split_by_fixed_length(
                    segment_content,
                    max_tokens=max_tokens,
                    overlap_lines=overlap_lines,
                    line_offset=actual_start,
                )
            )

    return segments


class FileSplitter:
    def __init__(self, max_tokens: int = 512, overlap_lines: int = 5):
        self._max_tokens = max_tokens
        self._overlap_lines = overlap_lines

    def split_file(self, file_content: str, file_path: str) -> List[FileSegment]:

        file_extension = file_path.split(".")[-1].lower()

        if file_extension not in _SPLIT_KEYWORDS:
            return _split_by_fixed_length(
                file_content,
                max_tokens=self._max_tokens,
                overlap_lines=self._overlap_lines,
            )

        split_lines = _detect_splits(file_content, _SPLIT_KEYWORDS[file_extension])
        return _apply_splits(
            file_content,
            split_lines,
            max_tokens=self._max_tokens,
            overlap_lines=self._overlap_lines,
        )
```

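A small, self-contained demonstration of the splitter; the sample module and printed values are illustrative, and `tiktoken` must be installed since the module builds an `o200k_base` encoder at import time:

```python
from codeembed.doc_splitters.generic_splitter import FileSplitter

source = '''def add(a, b):
    return a + b


def sub(a, b):
    return a - b
'''

splitter = FileSplitter(max_tokens=512, overlap_lines=5)
for seg in splitter.split_file(source, "math_utils.py"):
    # For a .py file, splits land on top-level "def "/"class " lines.
    print(seg.line_start, seg.line_end, repr(seg.content.split("\n")[0]))
# Expected shape: roughly (0, 4, 'def add(a, b):') then (4, 6, 'def sub(a, b):')
```
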
codeembed/llm/__init__.py
ADDED
@@ -0,0 +1,13 @@
```python
from codeembed.llm.base import LLMServiceBase
from codeembed.llm.models import ChatMessage, LLMResponse, StructuredLLMResponse
from codeembed.llm.ollama_adapter import OllamaLLMService
from codeembed.llm.openai_adapter import OpenAILLMService

__all__ = [
    "ChatMessage",
    "LLMResponse",
    "LLMServiceBase",
    "OllamaLLMService",
    "OpenAILLMService",
    "StructuredLLMResponse",
]
```

codeembed/llm/base.py
ADDED
@@ -0,0 +1,31 @@
```python
from abc import ABC, abstractmethod
from typing import List, Optional, Type, TypeVar

from pydantic import BaseModel

from codeembed.llm.models import ChatMessage, LLMResponse, StructuredLLMResponse

T = TypeVar("T", bound=BaseModel)


class LLMServiceBase(ABC):
    @abstractmethod
    def generate_structured_output(
        self,
        messages: List[ChatMessage],
        llm_model: str,
        output_format: Type[T],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> StructuredLLMResponse[T]:
        pass

    @abstractmethod
    def generate_response(
        self,
        messages: List[ChatMessage],
        llm_model: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> LLMResponse:
        pass
```

codeembed/llm/models.py
ADDED
@@ -0,0 +1,27 @@
```python
from dataclasses import dataclass
from typing import Generic, Literal, TypedDict, TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)


class ChatMessage(TypedDict):
    role: Literal["system", "user", "assistant"]
    content: str


@dataclass
class LLMResponse:
    input_tokens: int
    output_tokens: int
    response: str
    llm_model: str


@dataclass
class StructuredLLMResponse(Generic[T]):
    input_tokens: int
    output_tokens: int
    data: T
    llm_model: str
```

codeembed/llm/ollama_adapter.py
ADDED
@@ -0,0 +1,64 @@
```python
from typing import List, Optional, Type, TypeVar

import ollama
from pydantic import BaseModel

from codeembed.llm.base import LLMServiceBase
from codeembed.llm.models import ChatMessage, LLMResponse, StructuredLLMResponse

T = TypeVar("T", bound=BaseModel)


class OllamaLLMService(LLMServiceBase):
    def generate_structured_output(
        self,
        messages: List[ChatMessage],
        llm_model: str,
        output_format: Type[T],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> StructuredLLMResponse[T]:

        options = {}
        if max_tokens is not None:
            options["num_predict"] = max_tokens
        if temperature is not None:
            options["temperature"] = temperature

        resp = ollama.chat(model=llm_model, messages=messages, format="json", options=options)

        data = resp["message"]["content"]

        model = output_format.model_validate_json(data)

        return StructuredLLMResponse(
            input_tokens=resp["prompt_eval_count"] or 0,
            output_tokens=resp["eval_count"] or 0,
            data=model,
            llm_model=llm_model,
        )

    def generate_response(
        self,
        messages: List[ChatMessage],
        llm_model: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> LLMResponse:

        options = {}
        if max_tokens is not None:
            options["num_predict"] = max_tokens
        if temperature is not None:
            options["temperature"] = temperature

        resp = ollama.chat(model=llm_model, messages=messages, options=options)

        content = resp["message"]["content"]

        return LLMResponse(
            input_tokens=resp["prompt_eval_count"] or 0,
            output_tokens=resp["eval_count"] or 0,
            response=content,
            llm_model=llm_model,
        )
```

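A usage sketch for the structured-output path. Note that the adapter requests generic JSON (`format="json"`) rather than passing the Pydantic schema to Ollama, so the prompt itself must spell out the expected fields; the model tag below is only an example:

```python
from pydantic import BaseModel

from codeembed.llm.ollama_adapter import OllamaLLMService


class Sentiment(BaseModel):
    label: str
    confidence: float


llm = OllamaLLMService()
result = llm.generate_structured_output(
    messages=[
        {
            "role": "user",
            "content": (
                'Classify the sentiment of "I love this tool." '
                'Reply with JSON like {"label": "positive", "confidence": 0.9}.'
            ),
        }
    ],
    llm_model="llama3.1",  # example model tag; any local Ollama model works
    output_format=Sentiment,
    temperature=0.0,
)
print(result.data.label, result.input_tokens, result.output_tokens)
```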