codeembed 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. codeembed/__init__.py +59 -0
  2. codeembed/bootstrap/__init__.py +17 -0
  3. codeembed/bootstrap/services.py +220 -0
  4. codeembed/cli.py +454 -0
  5. codeembed/config/__init__.py +5 -0
  6. codeembed/config/models.py +13 -0
  7. codeembed/cost_tracking/__init__.py +7 -0
  8. codeembed/cost_tracking/llm_wrapper.py +39 -0
  9. codeembed/cost_tracking/models.py +52 -0
  10. codeembed/delta_computer/__init__.py +5 -0
  11. codeembed/delta_computer/delta_computer.py +75 -0
  12. codeembed/doc_embedder/__init__.py +5 -0
  13. codeembed/doc_embedder/doc_embedder.py +134 -0
  14. codeembed/doc_provider/__init__.py +10 -0
  15. codeembed/doc_provider/base.py +14 -0
  16. codeembed/doc_provider/local_doc_provider.py +58 -0
  17. codeembed/doc_provider/models.py +20 -0
  18. codeembed/doc_search_service/__init__.py +5 -0
  19. codeembed/doc_search_service/doc_search_service.py +48 -0
  20. codeembed/doc_splitters/__init__.py +8 -0
  21. codeembed/doc_splitters/generic_splitter.py +165 -0
  22. codeembed/doc_splitters/models.py +14 -0
  23. codeembed/llm/__init__.py +13 -0
  24. codeembed/llm/base.py +31 -0
  25. codeembed/llm/models.py +27 -0
  26. codeembed/llm/ollama_adapter.py +64 -0
  27. codeembed/llm/openai_adapter.py +96 -0
  28. codeembed/mcp_server.py +45 -0
  29. codeembed/setup_logger.py +34 -0
  30. codeembed/utils/__init__.py +9 -0
  31. codeembed/utils/checksum_utils.py +5 -0
  32. codeembed/utils/string_utils.py +5 -0
  33. codeembed/utils/time_utils.py +5 -0
  34. codeembed/vector_db/__init__.py +9 -0
  35. codeembed/vector_db/base.py +27 -0
  36. codeembed/vector_db/chromadb_adapter.py +130 -0
  37. codeembed/vector_db/models.py +16 -0
  38. codeembed-0.1.0.dist-info/METADATA +292 -0
  39. codeembed-0.1.0.dist-info/RECORD +42 -0
  40. codeembed-0.1.0.dist-info/WHEEL +4 -0
  41. codeembed-0.1.0.dist-info/entry_points.txt +2 -0
  42. codeembed-0.1.0.dist-info/licenses/LICENSE +21 -0
codeembed/cli.py ADDED
@@ -0,0 +1,454 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ from typing import Any, Dict, Literal, Optional
6
+
7
+ import typer
8
+
9
+ from codeembed.bootstrap.services import get_config, get_llm_service, get_session
10
+ from codeembed.llm.base import LLMServiceBase
11
+ from codeembed.setup_logger import setup_logger
12
+
13
app = typer.Typer()

# Location of CodeEmbed's on-disk data and of the project-level config file.
_CODEEMBED_DIR = ".codeembed"
_CONFIG_FILE = "codeembed.toml"
# Entry that must be present in .gitignore so the data directory is never committed.
_GITIGNORE_ENTRY = ".codeembed/"
# Defaults written into codeembed.toml; presumably seconds — confirm against the embedder.
_DEFAULT_DEBOUNCE = 10
_DEFAULT_SLEEP_INTERVAL = 60

# Curated local Ollama models offered by `codeembed init`: (name, description).
_CURATED_MODELS = [
    ("gpt-oss:20b", "OpenAI's open source model, ~14GB"),
    ("gemma4:e4b", "Google's all-around model, ~9.6GB"),
]

# Curated OpenAI deployments offered by `codeembed init`: (name, description).
_OPENAI_CURATED_MODELS = [
    ("gpt-4.1-mini", "Lightweight and cost-effective"),
    ("gpt-5.4-mini", "Newer, lightweight and cost-effective"),
    ("gpt-5.4-nano", "Newer and super lightweight option"),
]

# Candidate agent-instruction files, in priority order; the first existing one
# receives the CodeEmbed search instructions (see _add_agent_instructions).
_AGENT_INSTRUCTION_FILES = [
    "AGENTS.md",
    "CLAUDE.md",
    os.path.join(".github", "copilot-instructions.md"),
]

# Substring used to detect that the instructions were already added to a file.
_AGENT_INSTRUCTIONS_MARKER = "mcp__codeembed__search"

_AGENT_INSTRUCTIONS_CONTENT = """\
## Codebase search

Use the `mcp__codeembed__search` tool as the first step for any question about how this \
codebase works — how something is implemented, where something is defined, what calls what. \
Prefer it over grep or file reads for exploratory questions.
"""
47
+
48
+
49
def _ensure_gitignore() -> None:
    """Ensure `.codeembed/` is git-ignored before any data is written.

    Exits when no .gitignore exists at all, or when the user declines to add
    the required entry.
    """
    if not os.path.isfile(".gitignore"):
        typer.echo("Error: No .gitignore found. Run 'codeembed init' from the root of your git repository.")
        typer.echo("A .gitignore is required to prevent CodeEmbed from embedding your sensitive files.")
        raise typer.Exit(1)

    with open(".gitignore", "r", encoding="utf-8") as f:
        content = f.read()

    # NOTE(review): plain substring check — a commented-out '.codeembed/' line
    # would also count as present; confirm this is acceptable.
    if _GITIGNORE_ENTRY not in content:
        # ask user for permission to modify .gitignore
        typer.echo(f"CodeEmbed stores its data in the '{_CODEEMBED_DIR}/' directory.")
        typer.echo(f"You must add '{_GITIGNORE_ENTRY}' to your .gitignore to use CodeEmbed safely.")
        if not typer.confirm(f"Add '{_GITIGNORE_ENTRY}' to your .gitignore now?", default=True):
            typer.echo(f"Error: Gitignore is missing '{_GITIGNORE_ENTRY}' entry for safe operation.")
            raise typer.Exit(1)
        with open(".gitignore", "a", encoding="utf-8") as f:
            f.write(f"\n# CodeEmbed\n{_GITIGNORE_ENTRY}\n")
        typer.echo(f"Added '{_GITIGNORE_ENTRY}' to .gitignore. Remember to commit this change.\n")
68
+
69
+
70
def _create_codeembed_dir() -> None:
    """Create the `.codeembed/` data directory when it does not exist yet."""
    if os.path.isdir(_CODEEMBED_DIR):
        return
    os.makedirs(_CODEEMBED_DIR)
    typer.echo(f"Created '{_CODEEMBED_DIR}/' directory.\n")
74
+
75
+
76
def _check_ollama_installed() -> None:
    """Exit with an error when the `ollama` binary is not on PATH."""
    if shutil.which("ollama") is not None:
        return
    typer.echo("Error: Ollama is not installed or not in your PATH.")
    typer.echo("Install it from https://ollama.com/ then re-run 'codeembed init'.")
    raise typer.Exit(1)
81
+
82
+
83
def _check_ollama_model_is_available(model: str) -> None:
    """Exit with an error when `model` has not been pulled locally."""
    if model in _get_downloaded_models():
        return
    typer.echo(f"Error: Ollama model '{model}' is not available.")
    typer.echo(f"Download it with: ollama pull {model}")
    # Alternatively give option to download now.
    raise typer.Exit(1)
90
+
91
+
92
def _check_ollama_running() -> None:
    """Exit with an error when the Ollama server does not answer `ollama list`."""
    probe = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    if probe.returncode == 0:
        return
    typer.echo("Error: Ollama server is not running.")
    typer.echo("Start it with: ollama serve")
    raise typer.Exit(1)
98
+
99
+
100
def _get_downloaded_models() -> list[str]:
    """Return model names reported by `ollama list` (first column, header skipped)."""
    listing = subprocess.run(["ollama", "list"], capture_output=True, text=True)
    rows = listing.stdout.strip().splitlines()
    # The first whitespace-separated token of each data row is the model name;
    # blank rows are skipped via the walrus-bound split.
    return [cols[0] for row in rows[1:] if (cols := row.split())]
109
+
110
+
111
def _select_ollama_llm_model(downloaded_models: list[str]) -> str:
    """Interactively pick a local Ollama model for code summarization."""
    typer.echo("\nSelect a local LLM model for code summarization:\n")

    choices: list[str] = []
    curated_names = dict(_CURATED_MODELS)

    # Curated suggestions first, flagged when already pulled locally.
    for name, blurb in _CURATED_MODELS:
        choices.append(name)
        tag = " [downloaded]" if name in downloaded_models else ""
        typer.echo(f" {len(choices)}. {name} — {blurb}{tag}")

    # Then any other locally downloaded models not in the curated list.
    for name in (m for m in downloaded_models if m not in curated_names):
        choices.append(name)
        typer.echo(f" {len(choices)}. {name} [downloaded]")

    # Final slot is always the free-form entry.
    choices.append("custom")
    typer.echo(f" {len(choices)}. Enter a custom model name\n")

    answer = typer.prompt(f"Choice (1-{len(choices)})")

    try:
        selected = int(answer) - 1
        if not 0 <= selected < len(choices):
            raise ValueError()
    except ValueError:
        typer.echo("Invalid choice. Please re-run 'codeembed init'.")
        raise typer.Exit(1)

    if choices[selected] == "custom":
        return typer.prompt("Model name (e.g. gpt-oss:20b)")

    return choices[selected]
143
+
144
+
145
def _select_openai_model() -> str:
    """Interactively pick an OpenAI deployment for code summarization."""
    typer.echo("\nSelect an OpenAI LLM deployment for code summarization:\n")

    curated = list(_OPENAI_CURATED_MODELS)
    for position, (name, blurb) in enumerate(curated, 1):
        typer.echo(f" {position}. {name} — {blurb}")

    # One extra slot after the curated list for a free-form entry.
    custom_choice = len(curated) + 1
    typer.echo(f" {custom_choice}. Enter a custom model name\n")

    answer = typer.prompt(f"Choice (1-{custom_choice})")

    try:
        picked = int(answer) - 1
    except ValueError:
        picked = -1
    if not 0 <= picked < custom_choice:
        typer.echo("Invalid choice. Please re-run 'codeembed init'.")
        raise typer.Exit(1)

    if picked == len(curated):
        return typer.prompt("Model name (e.g. gpt-4o)")

    return curated[picked][0]
169
+
170
+
171
+ def _check_openai_credentials() -> str:
172
+ if os.getenv("OPENAI_API_KEY"):
173
+ return "[OPENAI_API_KEY set]"
174
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
175
+ if azure_endpoint:
176
+ if os.getenv("AZURE_OPENAI_API_KEY"):
177
+ return "[Azure: endpoint + key set]"
178
+ return "[Azure: endpoint set, using RBAC]"
179
+ return "[no credentials found]"
180
+
181
+
182
def _select_provider() -> Literal["ollama", "openai"]:
    """Interactively choose the LLM backend, showing availability hints."""
    typer.echo("\nSelect an LLM provider for code summarization:\n")

    ollama_installed = shutil.which("ollama") is not None
    ollama_running = False
    if ollama_installed:
        # `ollama list` succeeds only when the local server is reachable.
        ollama_running = subprocess.run(["ollama", "list"], capture_output=True, text=True).returncode == 0

    if ollama_running:
        ollama_status = "[running]"
    elif ollama_installed:
        ollama_status = "[not running]"
    else:
        ollama_status = "[not installed]"

    typer.echo(" 1. ollama " + ollama_status)
    typer.echo(" 2. openai " + _check_openai_credentials())

    answer = typer.prompt("Choice (1-2)")

    try:
        picked = int(answer)
    except ValueError:
        picked = 0
    if picked not in (1, 2):
        typer.echo("Invalid choice. Please re-run 'codeembed init'.")
        raise typer.Exit(1)

    return "ollama" if picked == 1 else "openai"
213
+
214
+
215
def _ensure_model_downloaded(model: str, downloaded_models: list[str]) -> None:
    """Offer to pull `model` via Ollama when it is not downloaded yet.

    Fix: the exit code of `ollama pull` was previously ignored, so a failed
    download looked like success; a warning is now printed on failure.
    """
    if model in downloaded_models:
        return

    typer.echo(f"\nModel '{model}' is not downloaded yet.")
    if not typer.confirm("Download it now?", default=False):
        typer.echo(f"Skipping. You can pull it later with: ollama pull {model}")
        return

    typer.echo(f"Pulling '{model}'... (this may take a while)")
    result = subprocess.run(["ollama", "pull", model])
    if result.returncode != 0:
        # Surface the failure so the user does not discover it at serve time.
        typer.echo(f"Warning: 'ollama pull {model}' failed. Pull it manually before serving.")
225
+
226
+
227
def _write_config(model: str, provider: Literal["ollama", "openai"], env_var_path: Optional[str] = None) -> None:
    """Write `codeembed.toml` with the chosen model/provider and default timings.

    NOTE(review): values are interpolated directly into the TOML text, so a
    model name containing a double quote would yield invalid TOML — confirm
    inputs are restricted upstream.
    """
    config_toml = f"""\
[codeembed]
llm_model = "{model}"
provider = "{provider}"
debounce = {_DEFAULT_DEBOUNCE}
sleep_interval = {_DEFAULT_SLEEP_INTERVAL}
"""
    # Optional key, appended while still inside the [codeembed] table.
    if env_var_path:
        config_toml += f'env_var_path = "{env_var_path}"\n'

    with open(_CONFIG_FILE, "w", encoding="utf-8") as f:
        f.write(config_toml)

    typer.echo(f"Created '{_CONFIG_FILE}'.")
242
+
243
+
244
def _load_env_file(env_var_path: Optional[str]) -> None:
    """Load environment variables from `env_var_path`, when one is configured.

    Exits with an error when the configured file does not exist. No-op when
    `env_var_path` is None or empty.
    """
    if not env_var_path:
        return
    # Imported lazily so python-dotenv is only needed when a .env file is used.
    from dotenv import load_dotenv

    if not os.path.isfile(env_var_path):
        typer.echo(f"Error: Environment variable file '{env_var_path}' not found.")
        raise typer.Exit(1)
    load_dotenv(env_var_path)
253
+
254
+
255
def _check_llm_is_available(llm_service: LLMServiceBase, llm_model: str) -> None:
    # Pings the LLM deployment. Raises exception if it's not available.
    try:
        # Cheapest possible round-trip: single-token, temperature-0 request.
        llm_service.generate_response(
            [{"role": "system", "content": "Ping!"}],
            llm_model,
            temperature=0.0,
            max_tokens=1,
        )
    except Exception as e:
        # Broad catch is deliberate: any failure mode should turn into one
        # actionable CLI error rather than a raw traceback.
        typer.echo(f"Error: Failed to ping LLM model or deployment '{llm_model}'. Details: {e}")
        raise typer.Exit(1)
267
+
268
+
269
# MCP server registration shared by all editor integrations: launches this
# same CLI with the `serve` subcommand (stdio transport, see serve()).
_MCP_SERVER_CONFIG = {
    "command": "codeembed",
    "args": ["serve"],
}
273
+
274
+
275
+ def _read_json(path: str) -> Dict[str, Any]:
276
+ with open(path, "r", encoding="utf-8") as f:
277
+ return json.load(f)
278
+
279
+
280
+ def _write_json(path: str, data: Dict[str, Any]) -> None:
281
+ os.makedirs(os.path.dirname(path), exist_ok=True) if os.path.dirname(path) else None
282
+ with open(path, "w", encoding="utf-8") as f:
283
+ f.write(json.dumps(data, indent=2) + "\n")
284
+
285
+
286
def _add_to_claude_code() -> None:
    """Register the CodeEmbed MCP server for Claude Code.

    Writes the server definition to `.mcp.json` and enables/pre-approves it in
    `.claude/settings.local.json`, preserving any existing content.
    """
    mcp_json_path = ".mcp.json"
    settings_path = os.path.join(".claude", "settings.local.json")

    # Merge into any existing .mcp.json rather than overwriting it.
    data: Dict[str, Any] = _read_json(mcp_json_path) if os.path.isfile(mcp_json_path) else {}
    data.setdefault("mcpServers", {})["codeembed"] = _MCP_SERVER_CONFIG
    _write_json(mcp_json_path, data)
    typer.echo(f" Updated '{mcp_json_path}'.")

    # Enable the server and pre-approve its search tool in local settings,
    # keeping the additions idempotent across re-runs of init.
    data = _read_json(settings_path) if os.path.isfile(settings_path) else {}
    enabled = data.setdefault("enabledMcpjsonServers", [])
    if "codeembed" not in enabled:
        enabled.append("codeembed")
    perms = data.setdefault("permissions", {})
    allowed = perms.setdefault("allow", [])
    if "mcp__codeembed__search" not in allowed:
        allowed.append("mcp__codeembed__search")
    _write_json(settings_path, data)
    typer.echo(f" Updated '{settings_path}'.")
305
+
306
+
307
def _add_agent_instructions() -> None:
    """Append (or create) CodeEmbed usage instructions for coding agents.

    Targets the first existing file from _AGENT_INSTRUCTION_FILES, defaulting
    to AGENTS.md when none exists. Skips files that already contain the marker.
    """
    target = next(
        (f for f in _AGENT_INSTRUCTION_FILES if os.path.isfile(f)),
        "AGENTS.md",
    )

    if os.path.isfile(target):
        with open(target, "r", encoding="utf-8") as f:
            existing = f.read()
        # Idempotence: the tool name doubles as the "already added" marker.
        if _AGENT_INSTRUCTIONS_MARKER in existing:
            typer.echo(f" '{target}' already contains CodeEmbed instructions, skipping.")
            return
        with open(target, "a", encoding="utf-8") as f:
            f.write("\n" + _AGENT_INSTRUCTIONS_CONTENT)
        typer.echo(f" Appended CodeEmbed search instructions to '{target}'.")
    else:
        # Only relevant for nested defaults like .github/copilot-instructions.md.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            f.write(_AGENT_INSTRUCTIONS_CONTENT)
        typer.echo(f" Created '{target}' with CodeEmbed search instructions.")
329
+
330
+
331
def _add_to_github_copilot() -> None:
    """Register the CodeEmbed MCP server in VS Code's `.vscode/mcp.json`."""
    vscode_mcp_path = os.path.join(".vscode", "mcp.json")

    # Merge into any existing file so other configured servers are kept.
    config = _read_json(vscode_mcp_path) if os.path.isfile(vscode_mcp_path) else {}
    servers = config.setdefault("servers", {})
    servers["codeembed"] = _MCP_SERVER_CONFIG
    _write_json(vscode_mcp_path, config)
    typer.echo(f" Updated '{vscode_mcp_path}'.")
338
+
339
+
340
@app.command()
def init():
    """Initialize CodeEmbed in the current project."""
    typer.echo("Initializing CodeEmbed...\n")

    # Re-running init is allowed, but only with explicit consent to overwrite.
    if os.path.isfile(_CONFIG_FILE):
        if not typer.confirm(f"'{_CONFIG_FILE}' already exists. Overwrite?", default=False):
            raise typer.Exit(0)

    # Optional .env file, loaded up-front so provider credentials are visible
    # to the provider-selection step below.
    env_var_path = typer.prompt(
        "Do you have a .env file path? (optional, press Enter to skip)", default="", show_default=False
    )
    _load_env_file(env_var_path or None)

    # Safety first: the data directory must be git-ignored before it is created.
    _ensure_gitignore()
    _create_codeembed_dir()

    provider = _select_provider()

    if provider == "ollama":
        _check_ollama_installed()
        _check_ollama_running()
        downloaded_models = _get_downloaded_models()
        model = _select_ollama_llm_model(downloaded_models)
        _ensure_model_downloaded(model, downloaded_models)
    else:
        model = _select_openai_model()

    # env_var_path may be "" (skipped); _write_config omits the key for falsy values.
    _write_config(model, provider, env_var_path)

    # Optional editor/agent integrations.
    typer.echo("")
    if typer.confirm(
        "Add CodeEmbed to Claude Code? (creates/updates .mcp.json and .claude/settings.local.json)", default=True
    ):
        _add_to_claude_code()

    if typer.confirm("Add CodeEmbed to GitHub Copilot? (creates/updates .vscode/mcp.json)", default=False):
        _add_to_github_copilot()

    if typer.confirm(
        "Add CodeEmbed search instructions to AGENTS.md? (or existing CLAUDE.md / .github/copilot-instructions.md)",
        default=True,
    ):
        _add_agent_instructions()

    typer.echo(
        "\nDone.\n\n"
        "Tip: Run 'codeembed embed' before starting the server to pre-populate the index.\n"
        "The server also embeds in the background automatically, but searches will return\n"
        "empty results until the first file is embedded.\n\n"
        "Then run 'codeembed serve' to start the MCP server."
    )
392
+
393
+
394
@app.command()
def serve():
    """Start the MCP server.

    Fix: the missing-config error previously hardcoded 'codeembed.toml'
    instead of using the `_CONFIG_FILE` constant (same rendered text today,
    but the message could silently drift if the constant changed).
    """
    if not os.path.isfile(_CONFIG_FILE):
        typer.echo(f"Error: '{_CONFIG_FILE}' not found. Run 'codeembed init' first.")
        raise typer.Exit(1)

    config = get_config()
    # Load the configured .env file before any provider client is built.
    _load_env_file(config.env_var_path)

    setup_logger()

    if config.provider == "ollama":
        # Fail fast with actionable messages before starting the server.
        _check_ollama_installed()
        _check_ollama_running()
        _check_ollama_model_is_available(config.llm_model)

    llm_service = get_llm_service()

    # One cheap round-trip to verify the configured model actually answers.
    _check_llm_is_available(llm_service, config.llm_model)

    # Imported lazily: the MCP server pulls in the full service stack.
    from codeembed.mcp_server import mcp

    typer.echo("Starting CodeEmbed MCP server...")
    mcp.run(transport="stdio")
419
+
420
+
421
@app.command()
def embed():
    """Embed codebase into the vector database.

    Fix: the missing-config error previously hardcoded 'codeembed.toml'
    instead of using the `_CONFIG_FILE` constant, unlike the rest of the CLI.
    """
    if not os.path.isfile(_CONFIG_FILE):
        typer.echo(f"Error: '{_CONFIG_FILE}' not found. Run 'codeembed init' first.")
        raise typer.Exit(1)

    config = get_config()
    _load_env_file(config.env_var_path)

    setup_logger()

    if config.provider == "ollama":
        # Fail fast with actionable messages before doing any work.
        _check_ollama_installed()
        _check_ollama_running()
        _check_ollama_model_is_available(config.llm_model)

    try:
        llm_service = get_llm_service()

        _check_llm_is_available(llm_service, config.llm_model)

        typer.echo("Embedding codebase...\n")

        # Imported lazily: the embedder pulls in the full service stack.
        from codeembed.bootstrap.services import get_embedder_service

        embedder = get_embedder_service()
        embedder.embed_codebase()
    finally:
        # Always persist token usage, even when embedding fails midway.
        session = get_session()
        session.save()
        typer.echo(f"\nInput tokens used: {session.input_tokens}. Output tokens used: {session.output_tokens}.")

    typer.echo("\nDone.")
@@ -0,0 +1,5 @@
1
# Public API of the config package: re-export the settings dataclass.
from codeembed.config.models import CodeEmbedConfig

__all__ = [
    "CodeEmbedConfig",
]
@@ -0,0 +1,13 @@
1
+ from dataclasses import dataclass
2
+ from typing import Literal, Optional
3
+
4
+
5
@dataclass
class CodeEmbedConfig:
    """Settings loaded from `codeembed.toml` (see _write_config in the CLI)."""

    # Debounce window; used as seconds by the delta computation — TODO confirm.
    debounce: int
    # Presumably seconds between background embedding runs — confirm in server.
    sleep_interval: int
    # Model name (Ollama) or deployment name (OpenAI) used for summarization.
    llm_model: str
    # LLM backend; local Ollama by default.
    provider: Literal["ollama", "openai"] = "ollama"
    # Names of env vars holding the API endpoint/key — verify against bootstrap.
    llm_api_endpoint_env_var: Optional[str] = None
    llm_api_key_env_var: Optional[str] = None
    # Optional path to a .env file loaded before contacting the provider.
    env_var_path: Optional[str] = None
@@ -0,0 +1,7 @@
1
# Public API of the cost_tracking package.
from codeembed.cost_tracking.llm_wrapper import LLMServiceWithCostTracking
from codeembed.cost_tracking.models import Session

__all__ = [
    "LLMServiceWithCostTracking",
    "Session",
]
@@ -0,0 +1,39 @@
1
+ from typing import TypeVar
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from codeembed.cost_tracking.models import Session
6
+ from codeembed.llm.base import LLMServiceBase
7
+ from codeembed.llm.models import LLMResponse, StructuredLLMResponse
8
+
9
+ T = TypeVar("T", bound=BaseModel)
10
+
11
+
12
class LLMServiceWithCostTracking(LLMServiceBase):
    """Wrapper for LLM service that tracks token usage.

    Delegates every call to the wrapped service and records the reported
    input/output token counts in the session.
    """

    def __init__(self, llm_service: LLMServiceBase, session: Session) -> None:
        self._llm_service = llm_service
        self._session = session

    def _track(self, res):
        # Shared bookkeeping for both call paths (previously duplicated):
        # records usage from any response exposing llm_model / input_tokens /
        # output_tokens, then passes the response through unchanged.
        self._session.add(
            model_name=res.llm_model,
            input_tokens=res.input_tokens,
            output_tokens=res.output_tokens,
        )
        return res

    def generate_structured_output(self, *args, **kwargs) -> StructuredLLMResponse[T]:
        """Delegate to the wrapped service, then record token usage."""
        return self._track(self._llm_service.generate_structured_output(*args, **kwargs))

    def generate_response(self, *args, **kwargs) -> LLMResponse:
        """Delegate to the wrapped service, then record token usage."""
        return self._track(self._llm_service.generate_response(*args, **kwargs))
@@ -0,0 +1,52 @@
1
+ import json
2
+ import os
3
+ from typing import Dict, Literal, Optional
4
+
5
+ from codeembed.utils.time_utils import utc_now
6
+
7
# Per-model token counters, keyed by model name.
_SessionData = Dict[str, Dict[Literal["input_tokens", "output_tokens", "embedding_tokens"], int]]

# Directory where per-run usage reports are written.
_SESSIONS_DIR = ".codeembed/sessions"
10
+
11
+
12
+ class Session:
13
+ """Writes token usage to a file under `.codeembed/sessions/<timestamp>.json`"""
14
+
15
+ def __init__(self):
16
+ self._by_model: _SessionData = {}
17
+ self._session_id = utc_now().strftime("%Y-%m-%dT%H-%M-%S")
18
+
19
+ def add(
20
+ self,
21
+ model_name: str,
22
+ input_tokens: Optional[int] = None,
23
+ output_tokens: Optional[int] = None,
24
+ ) -> None:
25
+ if model_name not in self._by_model:
26
+ self._by_model[model_name] = {
27
+ "input_tokens": 0,
28
+ "output_tokens": 0,
29
+ "embedding_tokens": 0,
30
+ }
31
+ if input_tokens is not None:
32
+ self._by_model[model_name]["input_tokens"] += input_tokens
33
+ if output_tokens is not None:
34
+ self._by_model[model_name]["output_tokens"] += output_tokens
35
+
36
+ def save(self) -> None:
37
+ if not self._by_model:
38
+ return
39
+ os.makedirs(_SESSIONS_DIR, exist_ok=True)
40
+ with open(f"{_SESSIONS_DIR}/{self._session_id}.json", "w") as f:
41
+ f.write(json.dumps(self._by_model, indent=2))
42
+
43
+ def get_usage(self) -> _SessionData:
44
+ return self._by_model
45
+
46
+ @property
47
+ def input_tokens(self) -> int:
48
+ return sum(tokens["input_tokens"] for tokens in self._by_model.values())
49
+
50
+ @property
51
+ def output_tokens(self) -> int:
52
+ return sum(tokens["output_tokens"] for tokens in self._by_model.values())
@@ -0,0 +1,5 @@
1
# Public API of the delta_computer package.
from codeembed.delta_computer.delta_computer import DeltaComputer

__all__ = [
    "DeltaComputer",
]
@@ -0,0 +1,75 @@
1
+ from datetime import datetime, timedelta
2
+ from typing import Dict, List, Set, Tuple
3
+ from uuid import UUID
4
+
5
+ from codeembed.doc_provider.base import DocProviderBase
6
+ from codeembed.utils.time_utils import utc_now
7
+ from codeembed.vector_db.base import VectorDbBase
8
+
9
+
10
+ class DeltaComputer:
11
+ """Figures out which files to add, delete or update."""
12
+
13
+ def __init__(self, doc_provider: DocProviderBase, vector_db: VectorDbBase, debounce_seconds: int = 10) -> None:
14
+ self._doc_provider = doc_provider
15
+ self._vector_db = vector_db
16
+ self._debounce_seconds = debounce_seconds
17
+
18
+ def compute_deltas(self) -> Tuple[Set[UUID], Set[str]]:
19
+ """
20
+ Returns chunk IDs to delete and file paths to process.
21
+
22
+ May not have best perfomance since we iterate each chunk stored in the vector database.
23
+ """
24
+
25
+ file_paths_to_update: Set[str] = set()
26
+
27
+ file_path_to_chunk_ids: Dict[str, List[UUID]] = {}
28
+ chunk_ids_to_delete: Set[UUID] = set()
29
+
30
+ # Collect modified_at stored in our database.
31
+ old_modified_at: Dict[str, datetime] = {}
32
+ old_checksums: Dict[str, str] = {}
33
+ for chunk in self._vector_db.iter_chunks():
34
+ old_modified_at[chunk.file_path] = max(
35
+ old_modified_at.get(chunk.file_path, chunk.modified_at), chunk.modified_at
36
+ )
37
+ old_checksums[chunk.file_path] = chunk.file_sha256_checksum
38
+
39
+ file_path_to_chunk_ids[chunk.file_path] = file_path_to_chunk_ids.get(chunk.file_path, []) + [chunk.id]
40
+
41
+ # Collect current modified_at in file system.
42
+ current: Dict[str, datetime] = {}
43
+ for doc in self._doc_provider.iter():
44
+ current[doc.file_path] = doc.modified_at
45
+
46
+ # Figure out which files have been added or modified.
47
+ for file_path, modified_at in current.items():
48
+ if modified_at > utc_now() - timedelta(seconds=self._debounce_seconds):
49
+ # We skip files modified within the last N seconds
50
+ continue
51
+
52
+ if file_path not in old_modified_at or old_modified_at[file_path] < modified_at:
53
+ if file_path in old_modified_at:
54
+ doc = self._doc_provider.get_content(file_path)
55
+ if doc.sha256_checksum == old_checksums[file_path]:
56
+ # We skip files with same checksum even if modified_at is updated.
57
+ # Some editors update modified_at even without any changes.
58
+ # TODO: We should probably update modified_at in vector database
59
+ # to avoid re-reading this file on every run.
60
+ continue
61
+
62
+ # file updated or added
63
+ file_paths_to_update.add(file_path)
64
+
65
+ # We delete all old chunks for any modified files.
66
+ for chunk_id in file_path_to_chunk_ids.get(file_path, []):
67
+ chunk_ids_to_delete.add(chunk_id)
68
+
69
+ # Figure out which files have been removed.
70
+ for file_path in old_modified_at:
71
+ if file_path not in current:
72
+ for chunk_id in file_path_to_chunk_ids.get(file_path, []):
73
+ chunk_ids_to_delete.add(chunk_id)
74
+
75
+ return chunk_ids_to_delete, file_paths_to_update
@@ -0,0 +1,5 @@
1
# Public API of the doc_embedder package.
from codeembed.doc_embedder.doc_embedder import DocEmbedder

__all__ = [
    "DocEmbedder",
]