PyPI - codebase-retrieval-context-engine - Versions diffs - 2.0.0__py3-none-any.whl - Mend

codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
corbell/__init__.py +6 -0
corbell/cli/__init__.py +1 -0
corbell/cli/commands/__init__.py +1 -0
corbell/cli/commands/index.py +86 -0
corbell/cli/commands/query.py +71 -0
corbell/cli/main.py +57 -0
corbell/core/__init__.py +1 -0
corbell/core/constants.py +52 -0
corbell/core/embeddings/__init__.py +6 -0
corbell/core/embeddings/base.py +68 -0
corbell/core/embeddings/extractor.py +201 -0
corbell/core/embeddings/factory.py +48 -0
corbell/core/embeddings/model.py +401 -0
corbell/core/embeddings/search_cache.py +95 -0
corbell/core/embeddings/sqlite_store.py +271 -0
corbell/core/gitignore.py +76 -0
corbell/core/graph/__init__.py +1 -0
corbell/core/graph/builder.py +696 -0
corbell/core/graph/method_graph.py +1077 -0
corbell/core/graph/providers/__init__.py +6 -0
corbell/core/graph/providers/aws_patterns.py +62 -0
corbell/core/graph/providers/azure_patterns.py +64 -0
corbell/core/graph/providers/gcp_patterns.py +59 -0
corbell/core/graph/schema.py +175 -0
corbell/core/graph/sqlite_store.py +500 -0
corbell/core/indexing/__init__.py +1 -0
corbell/core/indexing/builder.py +608 -0
corbell/core/indexing/lock.py +150 -0
corbell/core/indexing/tracker.py +245 -0
corbell/core/llm_client.py +677 -0
corbell/core/mcp/__init__.py +1 -0
corbell/core/mcp/server.py +214 -0
corbell/core/query/__init__.py +1 -0
corbell/core/query/diagnostics.py +38 -0
corbell/core/query/engine.py +321 -0
corbell/core/query/enhancer.py +102 -0
corbell/core/query/formatter.py +98 -0
corbell/core/query/graph_expander.py +284 -0
corbell/core/query/merger.py +171 -0
corbell/core/query/reranker.py +131 -0
corbell/core/workspace.py +408 -0

corbell/core/mcp/server.py ADDED Viewed

@@ -0,0 +1,214 @@
+"""MCP Server for Corbell code retrieval engine.
+Exposes a single tool `context_engine_codebase_retrieval` via FastMCP,
+supporting both stdio and SSE transports.
+"""
+from __future__ import annotations
+import asyncio
+import os
+import sys
+from typing import Optional
+from mcp.server.fastmcp import FastMCP
+# Module-level FastMCP server instance named "corbell". The tool defined below
+# registers itself on it through the @mcp.tool() decorator, and serve() runs it.
+mcp = FastMCP("corbell", dependencies=["corbell"])
+# ---------------------------------------------------------------------------
+# Tool: context_engine_codebase_retrieval
+# ---------------------------------------------------------------------------
+@mcp.tool()
+def context_engine_codebase_retrieval(
+    query: str,
+    workspace_full_path: str = "",
+    top_k: int = 50,
+    rerank: bool = True,
+) -> str:
+    """Search the indexed codebase and return relevant code snippets.
+    Returns formatted code blocks with absolute file paths and line numbers,
+    ready for injection into an LLM context window.
+    Args:
+        query: Natural language description of the code you're looking for.
+        workspace_full_path: Full path to the workspace (repository) root directory.
+            Falls back to CORBELL_WORKSPACE env var if empty.
+        top_k: Maximum number of code chunks to return (default 50).
+        rerank: Whether to use LLM reranking for better relevance (default true).
+    Returns:
+        Formatted code snippets, or an error string on failure.
+    """
+    # NOTE(review): the docstring above is intentionally left byte-identical.
+    # FastMCP is understood to publish a tool's docstring as its description,
+    # so rewording it would change what MCP clients see — confirm before editing.
+    import logging
+    log = logging.getLogger(__name__)
+    # Every failure is converted into an "Error: ..." string so the MCP client
+    # always receives a textual tool result instead of a transport-level fault.
+    try:
+        workspace_path_str = _resolve_workspace(workspace_full_path)
+        if workspace_path_str is None:
+            return (
+                "Error: workspace_full_path is required. "
+                "Pass the full path to the workspace (repository) root directory."
+            )
+        # Project imports are deferred to call time, as in the rest of this module.
+        from pathlib import Path
+        from corbell.core.workspace import build_config, db_path_for_workspace
+        from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
+        from corbell.core.indexing.tracker import IndexTracker
+        from corbell.core.indexing.builder import IndexBuilder
+        ws_path = Path(workspace_path_str).resolve()
+        if not ws_path.exists():
+            return (
+                f"Error: Workspace directory not found: {ws_path}. "
+                "Ensure the path points to a valid repository root."
+            )
+        cfg = build_config(ws_path)
+        db_path = db_path_for_workspace(ws_path, model=cfg.storage.resolved_model())
+        # Opening the store and counting its rows are both treated as a
+        # database health check; any failure is reported as corruption.
+        try:
+            emb_store = SQLiteEmbeddingStore(db_path)
+        except Exception:
+            return (
+                f"Error: Database corrupted at {db_path}. "
+                "Run 'corbell index build --rebuild' to recreate."
+            )
+        try:
+            chunk_count = emb_store.count()
+        except Exception:
+            return (
+                f"Error: Database corrupted at {db_path}. "
+                "Run 'corbell index build --rebuild' to recreate."
+            )
+        # An empty index triggers a blocking full build. A failure here is not
+        # caught locally; it reaches the outer handler and becomes an error string.
+        if chunk_count == 0:
+            log.info(
+                "Index is empty — running full build now (this may take a while)..."
+            )
+            builder = IndexBuilder()
+            builder.build(cfg, db_path, rebuild=True)
+        # A stale index triggers a blocking incremental rebuild.
+        tracker = IndexTracker(db_path)
+        stale_result = tracker.get_stale_files(cfg.repos, cfg)
+        if stale_result.has_changes:
+            try:
+                builder = IndexBuilder()
+                builder.build(cfg, db_path, rebuild=False)
+            except Exception:
+                # Non-fatal: the existing index is still queried. The failure is
+                # logged so that persistently stale results can be diagnosed.
+                log.warning(
+                    "Incremental index rebuild failed; continuing with the existing index",
+                    exc_info=True,
+                )
+        # Run the retrieval pipeline
+        from corbell.core.query.engine import codebase_retrieval
+        result = codebase_retrieval(
+            query=query,
+            workspace_path=ws_path,
+            top_k=top_k,
+            use_llm=True,
+            rerank=rerank,
+        )
+        return result
+    except Exception as exc:
+        return f"Error: Unexpected failure in codebase_retrieval: {exc}"
+def _resolve_workspace(workspace_full_path: str) -> Optional[str]:
+    """Resolve the workspace path from the explicit argument or the environment.
+    Both sources are whitespace-trimmed, and a value that is empty after
+    trimming counts as not provided.
+    Args:
+        workspace_full_path: Path supplied by the caller; may be empty.
+    Returns:
+        The trimmed explicit path when it is non-blank, otherwise the trimmed
+        CORBELL_WORKSPACE environment value when it is non-blank, otherwise None.
+    """
+    # Order matters: the explicit argument takes precedence over the env var.
+    candidates = (workspace_full_path, os.environ.get("CORBELL_WORKSPACE"))
+    for candidate in candidates:
+        trimmed = candidate.strip() if candidate else ""
+        if trimmed:
+            return trimmed
+    return None
+# ---------------------------------------------------------------------------
+# Filtered stdin wrapper — prevents empty-line crashes in MCP SDK
+# ---------------------------------------------------------------------------
+class _FilteredStdin:
+    """Async iterator over stdin that silently drops empty/whitespace lines.
+    The MCP SDK's stdio transport passes every raw line from sys.stdin to
+    Pydantic's JSONRPCMessage.model_validate_json(). Empty newlines fail
+    validation and crash the server. This wrapper filters them out.
+    Each line is read with a blocking ``sys.stdin.readline`` call run in the
+    event loop's default executor, so the loop itself is never blocked.
+    """
+    def __init__(self) -> None:
+        # Not read anywhere in this class; kept so the attribute stays available.
+        self._reader = None
+    def __aiter__(self):
+        return self
+    async def __anext__(self) -> str:
+        """Return the next non-blank line, including its trailing newline.
+        Raises:
+            StopAsyncIteration: when stdin reaches EOF.
+        """
+        # get_running_loop() is the supported way to obtain the loop from
+        # inside a coroutine.
+        loop = asyncio.get_running_loop()
+        while True:
+            line = await loop.run_in_executor(None, sys.stdin.readline)
+            if not line:  # readline() returns "" only at EOF
+                raise StopAsyncIteration
+            if line.strip():  # forward only lines with visible content
+                return line
+            # Empty/whitespace-only lines are silently dropped.
+# ---------------------------------------------------------------------------
+# Server entry point
+# ---------------------------------------------------------------------------
+def serve(transport: str = "stdio", port: int = 8000) -> None:
+    """Start the Corbell MCP server and block until it stops.
+    Args:
+        transport: 'sse' runs the HTTP/SSE server; any other value runs the
+            stdio transport used for pipe-based IDE integration.
+        port: Port number for SSE transport (ignored for stdio).
+    """
+    if transport != "sse":
+        # Status text goes to stderr; stdout carries the protocol stream.
+        print("Corbell MCP server starting on stdio...", file=sys.stderr)
+        async def _serve_over_stdio():
+            from mcp.server.stdio import stdio_server
+            # Feed the transport through the blank-line filter instead of raw stdin.
+            stdin_lines = _FilteredStdin()
+            async with stdio_server(stdin=stdin_lines) as (incoming, outgoing):
+                low_level = mcp._mcp_server
+                init_options = low_level.create_initialization_options()
+                await low_level.run(incoming, outgoing, init_options)
+        asyncio.run(_serve_over_stdio())
+        return
+    print(f"Corbell MCP server starting on http://localhost:{port}/sse ...", file=sys.stderr)
+    mcp.settings.port = port
+    mcp.run(transport="sse")
+def main() -> None:
+    """Entry point for `uvx codebase-retrieval-context-engine`.
+    Parses ``--transport/-t`` (stdio or sse) and ``--port/-p`` from the command
+    line and hands them to ``serve``.
+    """
+    from argparse import ArgumentParser
+    cli = ArgumentParser(description="Codebase Retrieval Context Engine MCP Server")
+    cli.add_argument(
+        "--transport",
+        "-t",
+        choices=["stdio", "sse"],
+        default="stdio",
+        help="Transport mode (default: stdio)",
+    )
+    cli.add_argument(
+        "--port",
+        "-p",
+        default=8000,
+        type=int,
+        help="Port for SSE transport (default: 8000)",
+    )
+    options = cli.parse_args()
+    serve(transport=options.transport, port=options.port)

corbell/core/query/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Query pipeline module for Corbell code retrieval."""

corbell/core/query/diagnostics.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""Query diagnostics for tracking and surfacing warnings during retrieval."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Optional
+@dataclass
+class QueryDiagnostics:
+    """Warning counters collected while a single query runs.
+    Pipeline stages bump the counters; ``summary()`` then turns them into one
+    human-readable warning once a counter reaches its threshold.
+    """
+    skipped_files: int = 0  # files that no longer exist on disk
+    skipped_methods: int = 0  # method nodes that couldn't be expanded
+    graph_expansion_failures: int = 0  # graph lookups that failed (not reported by summary())
+    # Minimum counter values at which summary() starts reporting.
+    _FILE_THRESHOLD: int = field(default=3, init=False, repr=False)
+    _METHOD_THRESHOLD: int = field(default=5, init=False, repr=False)
+    def summary(self) -> Optional[str]:
+        """Describe every counter that has reached its threshold.
+        Returns:
+            The triggered warnings joined with "; ", or None when neither
+            ``skipped_files`` nor ``skipped_methods`` has reached its threshold.
+        """
+        file_warning = (
+            [f"{self.skipped_files} files missing (index may be stale)"]
+            if self.skipped_files >= self._FILE_THRESHOLD
+            else []
+        )
+        method_warning = (
+            [f"{self.skipped_methods} methods skipped"]
+            if self.skipped_methods >= self._METHOD_THRESHOLD
+            else []
+        )
+        triggered = file_warning + method_warning
+        return "; ".join(triggered) if triggered else None

corbell/core/query/engine.py ADDED Viewed

@@ -0,0 +1,321 @@
+"""Main query engine orchestrator for codebase retrieval."""
+from __future__ import annotations
+import logging
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+logger = logging.getLogger(__name__)
+def codebase_retrieval(
+    query: str,
+    workspace_path: str | Path,
+    top_k: int = 50,
+    use_llm: bool = True,
+    rerank: bool = True,
+) -> str:
+    """Execute the full code retrieval pipeline.
+    Pipeline:
+    1. Load workspace config and open stores.
+    2. Auto-index check (empty → full build, stale → blocking incremental rebuild).
+       Only the stale check is skipped when the last build completed within the
+       past 30 seconds; the empty-index build always runs.
+    3. Embedding search via EmbeddingSearchCache (raw query used directly).
+    4. Graph call-chain expansion.
+    5. Merge + dedup, then cap at ``top_k``.
+    6. LLM rerank (optional).
+    7. Format results, prefixed with a diagnostics warning when one is produced.
+    Args:
+        query: Natural language query string.
+        workspace_path: Path to the workspace (repository) root directory.
+        top_k: Maximum number of embedding hits per query and maximum number of
+            merged chunks passed to the reranker.
+        use_llm: If False, no LLM client is created and reranking is skipped.
+        rerank: If False, skip reranking even when LLM is configured.
+    Returns:
+        Formatted code snippet string ready for LLM context injection.
+        A missing workspace or a failed query embedding returns a string
+        prefixed with "Error:"; a missing index or an empty result returns a
+        plain explanatory message. Exceptions raised by the stores, the index
+        builder or the reranker are not caught here.
+    """
+    from corbell.core.workspace import build_config, db_path_for_workspace
+    from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
+    from corbell.core.embeddings.search_cache import EmbeddingSearchCache
+    from corbell.core.embeddings.model import SentenceTransformerModel, GoogleEmbeddingModel, VoyageEmbeddingModel, EmbeddingModel
+    from corbell.core.graph.sqlite_store import SQLiteGraphStore
+    from corbell.core.indexing.builder import IndexBuilder
+    from corbell.core.indexing.tracker import IndexTracker
+    from corbell.core.query.diagnostics import QueryDiagnostics
+    from corbell.core.query.graph_expander import ScoredChunk, expand_via_graph
+    from corbell.core.query.merger import merge_and_dedup
+    from corbell.core.query.reranker import rerank_chunks
+    from corbell.core.query.formatter import format_results
+    workspace_path = Path(workspace_path).resolve()
+    if not workspace_path.exists():
+        return f"Error: Workspace directory not found: {workspace_path}. Run 'corbell index build' first."
+    cfg = build_config(workspace_path)
+    db_path = db_path_for_workspace(workspace_path, model=cfg.storage.resolved_model())
+    # The embedding store, graph store and tracker all open the same database path.
+    emb_store = SQLiteEmbeddingStore(db_path)
+    graph_store = SQLiteGraphStore(db_path)
+    tracker = IndexTracker(db_path)
+    # --- Auto-index check ---
+    chunk_count = emb_store.count()
+    if chunk_count == 0:
+        logger.info("Index is empty — running full build now (this may take a while)...")
+        builder = IndexBuilder()
+        builder.build(cfg, db_path, rebuild=True, progress_fn=lambda msg: logger.info(msg))
+    # Short-circuit: skip stale check if a build finished within the last 30 seconds
+    last_build = tracker.get_last_build_at()
+    if last_build is None or (time.time() - last_build) >= 30:
+        stale_result = tracker.get_stale_files(cfg.repos, cfg)
+        if stale_result.has_changes:
+            # Always do a blocking incremental rebuild when stale
+            builder = IndexBuilder()
+            builder.build(cfg, db_path, rebuild=False, progress_fn=lambda msg: logger.info(msg))
+    # --- LLM client setup ---
+    llm_client: Optional[Any] = None
+    if use_llm:
+        from corbell.core.llm_client import LLMClient
+        llm_cfg = cfg.llm
+        llm_client = LLMClient(
+            provider=llm_cfg.provider,
+            model=llm_cfg.resolved_model(),
+            api_key=llm_cfg.resolved_api_key(),
+            aws_region=llm_cfg.aws_region,
+            azure_endpoint=llm_cfg.azure_endpoint,
+            azure_deployment=llm_cfg.azure_deployment,
+            azure_api_version=llm_cfg.azure_api_version,
+            gcp_project=llm_cfg.gcp_project,
+            gcp_region=llm_cfg.gcp_region,
+        )
+    # --- Search queries ---
+    # Only the raw query is searched; the loop below still supports several.
+    search_queries = [query]
+    # --- Embedding model ---
+    # The model class is chosen from the configured model-name prefix.
+    model_name = cfg.storage.resolved_model()
+    emb_model: EmbeddingModel
+    if model_name.startswith("gemini-"):
+        emb_model = GoogleEmbeddingModel(model_name)
+    elif model_name.startswith("voyage-"):
+        emb_model = VoyageEmbeddingModel(model_name)
+    else:
+        emb_model = SentenceTransformerModel(model_name)
+    # --- Load search cache ---
+    cache = EmbeddingSearchCache()
+    cache.load(emb_store)
+    if not cache.is_loaded:
+        return "No index found. Run 'corbell index build' first."
+    # Repo id → root path, built once and reused both for resolving relative
+    # chunk paths and for formatting the final output.
+    repo_paths = {r.id: str(r.resolved_path) for r in cfg.repos if r.resolved_path}
+    # --- Embedding search ---
+    import numpy as np
+    all_embedding_results: dict[str, ScoredChunk] = {}
+    query_config = cfg.query
+    for sq in search_queries:
+        try:
+            # Each provider takes its own "this is a query" hint.
+            if isinstance(emb_model, GoogleEmbeddingModel):
+                formatted_query = emb_model.prepare_query(sq) if emb_model.uses_prefix_format else sq
+                q_vecs = emb_model.encode([formatted_query], task_type="RETRIEVAL_QUERY")
+            elif isinstance(emb_model, VoyageEmbeddingModel):
+                q_vecs = emb_model.encode([sq], input_type="query")
+            else:
+                q_vecs = emb_model.encode([sq])
+        except Exception as exc:
+            # NOTE(review): this message names 'sentence-transformers' even when
+            # a Google or Voyage model failed; the text is kept unchanged here.
+            return f"Error: Failed to load embedding model '{model_name}'. Ensure 'sentence-transformers' is installed. ({exc})"
+        q_vec = np.array(q_vecs[0], dtype=np.float32)
+        hits = cache.search(q_vec, top_k=top_k)
+        if not hits:
+            continue
+        # Fetch full records for top hits; each hit is indexed as (id, score)
+        hit_ids = [h[0] for h in hits]
+        hit_scores = {h[0]: h[1] for h in hits}
+        try:
+            records = emb_store.get_chunks_by_ids(hit_ids)
+        except Exception:
+            # A failed fetch drops this query's hits but keeps the pipeline going.
+            continue
+        for record in records:
+            score = hit_scores.get(record.id, 0.0)
+            # Resolve absolute file path
+            abs_path = record.file_path
+            repo_root = repo_paths.get(record.service_id, "")
+            if repo_root and not Path(abs_path).is_absolute():
+                abs_path = str((Path(repo_root) / abs_path).resolve())
+            chunk = ScoredChunk(
+                chunk_id=record.id,
+                score=score,
+                file_path=abs_path,
+                start_line=record.start_line,
+                end_line=record.end_line,
+                content=record.content,
+                repo_id=record.service_id,
+                symbol=record.symbol,
+                chunk_type=record.chunk_type,
+                language=record.language,
+            )
+            # Keep max score for deduplication across queries
+            existing = all_embedding_results.get(record.id)
+            if existing is None or score > existing.score:
+                all_embedding_results[record.id] = chunk
+    if not all_embedding_results:
+        return "No relevant code found for the given query."
+    base_chunks = list(all_embedding_results.values())
+    # --- Graph expansion ---
+    diagnostics = QueryDiagnostics()
+    bonus_chunks = expand_via_graph(
+        embedding_results=base_chunks,
+        graph_store=graph_store,
+        repos=cfg.repos,
+        max_depth=query_config.expand_call_depth,
+        max_chunks=query_config.expand_max_chunks,
+        diagnostics=diagnostics,
+    )
+    all_chunks = base_chunks + bonus_chunks
+    # --- Merge + dedup ---
+    merged = merge_and_dedup(all_chunks)
+    # --- Apply top_k cap ---
+    merged = merged[:top_k]
+    # --- LLM rerank ---
+    # Reranking needs all three switches on: caller's use_llm, caller's rerank,
+    # and the workspace query config.
+    do_rerank = use_llm and rerank and query_config.rerank
+    if do_rerank:
+        # Annotate chunks with graph metadata before sending to the reranker
+        graph_meta = _annotate_with_graph_meta(merged, graph_store, cfg.repos)
+        rerank_start = time.time()
+        reranked_ids = rerank_chunks(query, merged, llm_client, graph_meta=graph_meta)
+        rerank_elapsed = time.time() - rerank_start
+        logger.info(
+            "Rerank complete: %.3fs, %d/%d chunks kept, order: %s",
+            rerank_elapsed,
+            len(reranked_ids),
+            len(merged),
+            reranked_ids,
+        )
+        # Reorder merged, keeping only chunks selected by the reranker
+        id_to_chunk = {c.chunk_id: c for c in merged}
+        merged = [id_to_chunk[cid] for cid in reranked_ids if cid in id_to_chunk]
+    # --- Format output ---
+    output = format_results(merged, repo_paths)
+    # Prepend diagnostics warning if thresholds exceeded
+    warning = diagnostics.summary()
+    if warning:
+        output = f"[warnings: {warning}]\n\n{output}"
+    return output
+def _annotate_with_graph_meta(
+    chunks: List[Any],
+    graph_store: Any,
+    repos: List[Any],
+) -> Dict[str, Dict]:
+    """Build a graph metadata dict keyed by chunk_id for each chunk.
+    For each chunk, finds overlapping MethodNodes (by file_path + line range)
+    and collects:
+      - callers: number of methods that call into this chunk's method
+      - callees: number of method_call edges outgoing from this chunk's method
+      - flow: name of the first FlowNode that includes this method (or None)
+    The annotation is best-effort: every graph lookup failure is logged at
+    debug level and skipped, so this function never raises because of the
+    graph store.
+    Args:
+        chunks: List of ScoredChunk objects.
+        graph_store: SQLiteGraphStore instance.
+        repos: List of RepoConfig objects for path resolution.
+    Returns:
+        Dict mapping chunk_id -> {"callers": int, "callees": int, "flow": str | None}.
+        Chunks with no matching MethodNode are omitted. An empty dict is
+        returned when the service list cannot be read.
+    """
+    from corbell.core.query.graph_expander import _find_matching_methods
+    # Build repo_id → absolute path mapping (same as graph_expander)
+    repo_path_map: Dict[str, Path] = {
+        repo.id: repo.resolved_path for repo in repos if repo.resolved_path
+    }
+    try:
+        service_ids = [service.id for service in graph_store.get_all_services()]
+    except Exception:
+        logger.debug("Graph metadata skipped: could not list services", exc_info=True)
+        return {}
+    graph_meta: Dict[str, Dict] = {}
+    for chunk in chunks:
+        try:
+            matching_methods = _find_matching_methods(
+                chunk, graph_store, repo_path_map, service_ids
+            )
+        except Exception:
+            logger.debug("Method matching failed for chunk %r", chunk, exc_info=True)
+            continue
+        if not matching_methods:
+            continue
+        # Aggregate across all overlapping methods (e.g. nested lambdas)
+        total_callers = 0
+        total_callees = 0
+        flow_name: Optional[str] = None
+        for method in matching_methods:
+            # Each lookup is isolated so one failure does not discard the others.
+            try:
+                total_callers += len(graph_store.get_callers_of_method(method.id))
+            except Exception:
+                logger.debug("Caller lookup failed for method %r", method, exc_info=True)
+            try:
+                outgoing = graph_store.get_dependencies(method.id)
+                total_callees += sum(1 for edge in outgoing if edge.kind == "method_call")
+            except Exception:
+                logger.debug("Callee lookup failed for method %r", method, exc_info=True)
+            # Only the first flow name found is kept.
+            if flow_name is None:
+                try:
+                    flows = graph_store.get_flows_for_method(method.id)
+                    if flows:
+                        flow_name = flows[0].get("flow_name") or None
+                except Exception:
+                    logger.debug("Flow lookup failed for method %r", method, exc_info=True)
+        graph_meta[chunk.chunk_id] = {
+            "callers": total_callers,
+            "callees": total_callees,
+            "flow": flow_name,
+        }
+    return graph_meta

corbell/core/query/enhancer.py ADDED Viewed

@@ -0,0 +1,102 @@
+"""Query enhancement: LLM-based query expansion and keyword extraction."""
+from __future__ import annotations
+import re
+from typing import Any, List, Optional, Tuple
+def enhance_query(
+    query: str,
+    llm_client: Optional[Any],
+) -> Tuple[List[str], List[str]]:
+    """Turn a user query into search queries plus keywords.
+    When ``llm_client`` is present and its ``is_configured`` attribute is truthy,
+    the LLM path is used; otherwise the query is returned unchanged as the only
+    search query. Keywords are extracted by regex in both cases.
+    Args:
+        query: The user's natural language query.
+        llm_client: An LLMClient instance (or None / unconfigured).
+    Returns:
+        Tuple of (search_queries, keywords):
+        - search_queries: List of strings to embed and search with.
+        - keywords: List of extracted keywords for graph expansion hints.
+    """
+    llm_ready = llm_client is not None and getattr(llm_client, "is_configured", False)
+    if not llm_ready:
+        return _enhance_without_llm(query)
+    return _enhance_with_llm(query, llm_client)
+def _enhance_with_llm(
+    query: str, llm_client: Any
+) -> Tuple[List[str], List[str]]:
+    """Use the LLM to generate up to 3 code-oriented search queries.
+    Leading list markers ("1.", "2)", "-", "*", "•") are stripped from each
+    returned line, since a reply may be numbered or bulleted despite the
+    prompt. If the call fails or yields no usable line, the original query is
+    used as the only search query.
+    Args:
+        query: The user's natural language query.
+        llm_client: Object exposing ``call(system, user, max_tokens=..., temperature=...)``.
+    Returns:
+        Tuple of (search_queries, keywords); keywords always come from the
+        original query, not from the LLM output.
+    """
+    system = (
+        "You are a code search assistant. Given a user query about code, "
+        "generate exactly 3 different natural-language search queries that describe "
+        "what the relevant implementation code *does* (not technology names or framework names). "
+        "Each query should describe behavior, logic, or data transformations. "
+        "Return exactly 3 queries, one per line, no numbering, no extra text."
+    )
+    user = f"User query: {query}\n\nGenerate 3 code search queries:"
+    try:
+        response = llm_client.call(system, user, max_tokens=300, temperature=0.1)
+        cleaned = (
+            re.sub(r"^(?:[-*•]|\d+[.)])\s+", "", line.strip()).strip()
+            for line in response.splitlines()
+        )
+        # Take up to 3 non-empty lines
+        search_queries = [line for line in cleaned if line][:3]
+    except Exception:
+        # Any LLM or parsing failure falls back to the raw query below.
+        search_queries = []
+    if not search_queries:
+        search_queries = [query]
+    # Extract keywords from the original query for graph expansion
+    keywords = _extract_keywords(query)
+    return search_queries, keywords
+def _enhance_without_llm(query: str) -> Tuple[List[str], List[str]]:
+    """Return the query unchanged as the sole search query, plus its regex-extracted keywords."""
+    return [query], _extract_keywords(query)
+def _extract_keywords(text: str) -> List[str]:
+    """Pull up to 20 distinct identifier-like keywords out of ``text``.
+    A token starts with an ASCII letter and continues with letters, digits or
+    underscores. Tokens of one or two characters and stop words are discarded.
+    Duplicates are detected case-insensitively, and the first spelling seen is
+    the one returned. Order of first appearance is preserved.
+    """
+    ignored = frozenset((
+        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
+        "of", "with", "by", "from", "up", "about", "into", "through", "during",
+        "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
+        "do", "does", "did", "will", "would", "could", "should", "may", "might",
+        "must", "shall", "can", "how", "what", "when", "where", "which", "who",
+        "that", "this", "these", "those", "it", "its", "get", "set", "use",
+        "new", "return", "class", "function", "method", "var", "let", "const",
+        "def", "import", "from", "as", "if", "else", "while", "for", "try",
+        "except", "raise", "pass", "break", "continue", "not", "and", "or",
+    ))
+    # Maps lowercase token -> first-seen spelling; dicts keep insertion order.
+    first_spelling: dict = {}
+    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9_]*", text):
+        folded = token.lower()
+        if len(token) <= 2 or folded in ignored:
+            continue
+        first_spelling.setdefault(folded, token)
+    # cap at 20 keywords
+    return list(first_spelling.values())[:20]