codebrain-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. codebrain/__init__.py +3 -0
  2. codebrain/__main__.py +6 -0
  3. codebrain/agent_bridge.py +162 -0
  4. codebrain/analyzer.py +943 -0
  5. codebrain/api.py +578 -0
  6. codebrain/api_models.py +102 -0
  7. codebrain/cli.py +1927 -0
  8. codebrain/comprehension.py +1939 -0
  9. codebrain/config.py +46 -0
  10. codebrain/context.py +276 -0
  11. codebrain/export.py +334 -0
  12. codebrain/graph/__init__.py +0 -0
  13. codebrain/graph/query.py +656 -0
  14. codebrain/graph/schema.py +113 -0
  15. codebrain/graph/store.py +295 -0
  16. codebrain/hook_runner.py +71 -0
  17. codebrain/hooks.py +107 -0
  18. codebrain/indexer.py +450 -0
  19. codebrain/llm.py +676 -0
  20. codebrain/logging.py +42 -0
  21. codebrain/mcp_server.py +1635 -0
  22. codebrain/memory/__init__.py +5 -0
  23. codebrain/memory/store.py +270 -0
  24. codebrain/parser/__init__.py +0 -0
  25. codebrain/parser/base.py +27 -0
  26. codebrain/parser/config_parser.py +228 -0
  27. codebrain/parser/models.py +44 -0
  28. codebrain/parser/python_parser.py +658 -0
  29. codebrain/parser/registry.py +144 -0
  30. codebrain/parser/typescript_parser.py +1189 -0
  31. codebrain/parser/typescript_treesitter.py +535 -0
  32. codebrain/py.typed +0 -0
  33. codebrain/resolver.py +171 -0
  34. codebrain/settings.py +88 -0
  35. codebrain/utils.py +59 -0
  36. codebrain/validator.py +563 -0
  37. codebrain/watcher/__init__.py +0 -0
  38. codebrain/watcher/file_watcher.py +173 -0
  39. codebrain-0.1.0.dist-info/METADATA +360 -0
  40. codebrain-0.1.0.dist-info/RECORD +44 -0
  41. codebrain-0.1.0.dist-info/WHEEL +5 -0
  42. codebrain-0.1.0.dist-info/entry_points.txt +6 -0
  43. codebrain-0.1.0.dist-info/licenses/LICENSE +21 -0
  44. codebrain-0.1.0.dist-info/top_level.txt +1 -0
codebrain/indexer.py ADDED
@@ -0,0 +1,450 @@
"""Orchestrator: scan a repository, parse source files via the parser registry, and store the results in the graph."""

from __future__ import annotations

import fnmatch
import re
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, Callable

from codebrain.config import INDEXABLE_EXTENSIONS, SKIP_DIRS
from codebrain.graph.store import GraphStore
from codebrain.logging import get_logger
from codebrain.parser.registry import get_registry
from codebrain.settings import Settings, load_settings
from codebrain.utils import file_hash, normalize_path

if TYPE_CHECKING:
    # Editor's note: assumed home of ParsedFile (codebrain/parser/models.py ships in this wheel).
    from codebrain.parser.models import ParsedFile

_log = get_logger("indexer")

# Default threshold: use parallel parsing when the file count reaches this
_PARALLEL_THRESHOLD = 50


def _parse_file(file_path: Path, repo_root: Path) -> ParsedFile:
    """Route to the correct parser via the plugin registry."""
    return get_registry().parse(file_path, repo_root)


def _parse_one(args: tuple[str, str]) -> tuple[ParsedFile | None, str | None]:
    """Module-level function for ProcessPoolExecutor (must be picklable).

    Returns (ParsedFile, None) on success, or (None, error_string) on failure.
    """
    file_path_str, repo_root_str = args
    file_path = Path(file_path_str)
    repo_root = Path(repo_root_str)
    try:
        pf = _parse_file(file_path, repo_root)
        return (pf, None)
    except Exception as exc:
        rel = normalize_path(file_path, repo_root)
        return (None, f"{rel}: {exc}")


_PARSE_SCRIPT = """\
import sys, pickle, base64
from pathlib import Path
from codebrain.parser.registry import get_registry
r = get_registry()
pf = r.parse(Path(sys.argv[1]), Path(sys.argv[2]))
data = base64.b64encode(pickle.dumps(pf)).decode()
print(data)
"""


def _parse_with_timeout(
    file_path: Path, repo_root: Path, timeout: int = 30,
) -> tuple[ParsedFile | None, str | None]:
    """Parse a file in a subprocess with a hard timeout.

    Uses subprocess.run so we can actually kill the process if the parser
    hangs (e.g. a tree-sitter C extension holding the GIL on Windows).

    Returns (ParsedFile, None) on success or (None, error_string) on failure/timeout.
    """
    import base64
    import pickle
    import subprocess

    rel = normalize_path(file_path, repo_root)
    try:
        result = subprocess.run(
            [sys.executable, "-c", _PARSE_SCRIPT, str(file_path), str(repo_root)],
            capture_output=True, text=True, timeout=timeout,
        )
        if result.returncode != 0:
            err_msg = result.stderr.strip().split("\n")[-1] if result.stderr else "unknown error"
            return (None, f"{rel}: {err_msg}")
        data = result.stdout.strip()
        if not data:
            return (None, f"{rel}: parser returned no output")
        pf = pickle.loads(base64.b64decode(data))
        return (pf, None)
    except subprocess.TimeoutExpired:
        _log.warning("Parse timeout: %s (>%ds), killed", rel, timeout)
        return (None, f"{rel}: parse timeout (>{timeout}s, killed)")
    except Exception as exc:
        return (None, f"{rel}: subprocess error: {exc}")


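# Editor's sketch (not part of the package): the child process above serializes
# the ParsedFile to stdout as base64-wrapped pickle, so the object survives a
# text-mode pipe. The round trip is just:
#
#     payload = base64.b64encode(pickle.dumps(pf)).decode()   # child, printed
#     pf2 = pickle.loads(base64.b64decode(payload))           # parent
#
# Anything else the parser prints to stdout would corrupt the payload; stderr
# stays separate thanks to capture_output=True, so log output there is safe.

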
def _load_gitignore_patterns(repo_root: Path) -> list[tuple[str, bool]]:
    """Read .gitignore and return a list of (pattern, negated) tuples."""
    gitignore = repo_root / ".gitignore"
    if not gitignore.is_file():
        return []
    patterns: list[tuple[str, bool]] = []
    for line in gitignore.read_text(errors="replace").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        negated = line.startswith("!")
        if negated:
            line = line[1:]
        patterns.append((line, negated))
    return patterns


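# Editor's sketch (not part of the package): for a .gitignore containing:
#
#     # build artifacts
#     *.pyc
#     !keep.pyc
#
# the loader above returns [("*.pyc", False), ("keep.pyc", True)]: the comment
# and blank lines are dropped, and a leading "!" is stripped with negated=True.

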
def _should_skip_dir(dir_name: str, skip_dirs: frozenset[str] = SKIP_DIRS) -> bool:
    """Return True if this directory should be skipped entirely."""
    for pattern in skip_dirs:
        if fnmatch.fnmatch(dir_name, pattern):
            return True
    return False


def _gitignore_to_regex(pattern: str) -> re.Pattern[str]:
    """Convert a single .gitignore pattern to a compiled regex.

    Supports ``**`` (any path segment), anchored patterns (leading ``/``),
    directory-only patterns (trailing ``/``), and standard globs.
    """
    clean = pattern.rstrip("/")
    anchored = clean.startswith("/")
    if anchored:
        clean = clean[1:]

    # Escape regex metacharacters and convert glob tokens in one pass
    parts: list[str] = []
    i = 0
    while i < len(clean):
        c = clean[i]
        if c == "*":
            if i + 1 < len(clean) and clean[i + 1] == "*":
                # ** matches across path segments
                if i + 2 < len(clean) and clean[i + 2] == "/":
                    parts.append("(?:.+/)?")
                    i += 3
                    continue
                parts.append(".*")
                i += 2
                continue
            parts.append("[^/]*")
        elif c == "?":
            parts.append("[^/]")
        elif c in r"\.+^${}()|[]":
            parts.append(f"\\{c}")
        else:
            parts.append(c)
        i += 1

    regex = "".join(parts)
    if anchored:
        regex = f"^{regex}"
    else:
        regex = f"(?:^|/){regex}"
    # Match both the exact path and anything beneath it
    regex = f"(?:{regex})(?:/.*)?$"
    return re.compile(regex)


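# Editor's sketch (not part of the package): how a few common patterns behave
# under the conversion above:
#
#     _gitignore_to_regex("build/").search("build/main.o")     # match: dir contents ignored
#     _gitignore_to_regex("/dist").search("src/dist")          # no match: anchored to root
#     _gitignore_to_regex("**/cache").search("a/b/cache/x")    # match: ** spans segments
#     _gitignore_to_regex("*.log").search("logs/app.log")      # match: [^/]* stays in one segment

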
def _matches_gitignore(rel_path: str, patterns: list[tuple[str, bool]]) -> bool:
    """Check if *rel_path* matches the .gitignore patterns.

    Processes patterns in order; negation patterns (``!``) can un-ignore
    previously matched paths.
    """
    matched = False
    for pattern, negated in patterns:
        regex = _gitignore_to_regex(pattern)
        if regex.search(rel_path):
            matched = not negated
    return matched


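# Editor's sketch (not part of the package): later patterns win, so a negation
# can rescue one file from an ignored directory:
#
#     pats = [("build/", False), ("build/keep.txt", True)]
#     _matches_gitignore("build/main.o", pats)    # True  -> ignored
#     _matches_gitignore("build/keep.txt", pats)  # False -> re-included
#
# Note that _gitignore_to_regex is recompiled for every (pattern, path) pair.
# If profiling warrants it, a cheap optimization is to memoize it:
#
#     import functools
#     _gitignore_to_regex = functools.lru_cache(maxsize=None)(_gitignore_to_regex)

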
def discover_files(
    repo_root: Path,
    settings: Settings | None = None,
) -> list[Path]:
    """Walk the repo and return all indexable files.

    Uses os.walk with directory pruning for performance on large repos.
    Uses *settings* for skip_dirs and indexable_extensions when provided,
    otherwise falls back to config.py defaults.
    """
    import os

    extensions = settings.indexable_extensions if settings else INDEXABLE_EXTENSIONS
    skip_dirs = settings.skip_dirs if settings else SKIP_DIRS

    gitignore_patterns = _load_gitignore_patterns(repo_root)
    files: list[Path] = []
    root_str = str(repo_root)

    # Name-based parseable files (e.g. docker-compose.yml)
    name_parseable = get_registry().supported_names

    repo_resolved = repo_root.resolve()

    for dirpath, dirnames, filenames in os.walk(root_str, followlinks=False):
        # Prune skipped directories IN-PLACE (prevents os.walk from descending)
        dirnames[:] = [
            d for d in dirnames
            if not _should_skip_dir(d, skip_dirs)
        ]

        for fname in filenames:
            # Check extension first (cheapest filter)
            ext = os.path.splitext(fname)[1]
            if ext not in extensions and fname not in name_parseable:
                continue

            full_path = os.path.join(dirpath, fname)
            item = Path(full_path)

            # Skip symlinks pointing outside the repo (security + avoids cycles).
            # is_relative_to avoids the prefix-string pitfall where /repo2 would
            # pass a startswith("/repo") check.
            if item.is_symlink():
                try:
                    target = item.resolve()
                    if not target.is_relative_to(repo_resolved):
                        _log.debug("Skipping external symlink: %s -> %s", fname, target)
                        continue
                except OSError:
                    # Broken symlink
                    _log.debug("Skipping broken symlink: %s", fname)
                    continue

            # Check gitignore
            rel = normalize_path(item, repo_root)
            if _matches_gitignore(rel, gitignore_patterns):
                continue

            # Skip files exceeding max_file_size_kb
            try:
                size_kb = item.stat().st_size / 1024
                if settings and size_kb > settings.max_file_size_kb:
                    _log.warning(
                        "Skipping %s (%.1fMB) — exceeds max_file_size_kb=%d",
                        rel, size_kb / 1024, settings.max_file_size_kb,
                    )
                    continue
            except OSError:
                _log.debug("Skipping inaccessible file: %s", fname)
                continue

            files.append(item)

    return sorted(files)


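# Editor's sketch (not part of the package): typical standalone use, assuming a
# checkout at ./myrepo (the path is hypothetical):
#
#     from pathlib import Path
#     from codebrain.indexer import discover_files
#
#     for f in discover_files(Path("myrepo")):
#         print(f)   # every indexable file, sorted, with .gitignore respected

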
def full_index(
    repo_root: Path,
    db_path: Path | None = None,
    *,
    progress_callback: Callable[[int, int], None] | None = None,
    parallel_threshold: int | None = None,
    max_workers: int | None = None,
) -> dict:
    """Perform a full index of the repository.

    Args:
        progress_callback: Called with (current, total) after each file.
        parallel_threshold: Use ProcessPoolExecutor when the file count reaches this.
            Defaults to value from .codebrain.toml or 50.
        max_workers: Max worker processes (None = from settings or cpu_count).

    Returns a summary dict with counts and timing.
    """
    settings = load_settings(repo_root)

    if db_path is None:
        db_path = repo_root / settings.codebrain_dir / settings.db_filename
    if parallel_threshold is None:
        parallel_threshold = settings.parallel_threshold
    if max_workers is None:
        max_workers = settings.max_workers

    db_path.parent.mkdir(parents=True, exist_ok=True)

    files = discover_files(repo_root, settings)

    # Mark index as in-progress so interrupted runs can be detected
    with GraphStore(db_path) as _meta_store:
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_status", "in_progress"),
        )
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_files_total", str(len(files))),
        )
        _meta_store.conn.commit()

    start = time.perf_counter()

    parsed_count = 0
    skipped_count = 0
    total_nodes = 0
    total_edges = 0
    errors: list[str] = []

    use_parallel = (
        len(files) >= parallel_threshold
        and sys.platform != "win32"
    )

    # Per-file timeout (seconds). C extensions like tree-sitter can hold the
    # GIL, making thread-based timeouts useless. On Windows we stay on the
    # serial path and run those parsers via _parse_with_timeout, which uses a
    # killable subprocess.
    per_file_timeout = 30

    # File extensions where C-extension parsers may hang (tree-sitter).
    # Python uses the stdlib ast module, which does not hang.
    _NEEDS_ISOLATION = frozenset({".ts", ".tsx", ".js", ".jsx"})

    if use_parallel:
        _log.debug("Parallel parsing %d files (threshold=%d)", len(files), parallel_threshold)
        args_list = [(str(fp), str(repo_root)) for fp in files]
        parsed_results = []
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for i, (pf, err) in enumerate(executor.map(_parse_one, args_list)):
                if err:
                    errors.append(err)
                elif pf is not None:
                    parsed_results.append(pf)
                if progress_callback:
                    progress_callback(i + 1, len(files))

        # Serial database writes
        with GraphStore(db_path) as store:
            for pf in parsed_results:
                store.upsert_file(pf)
                parsed_count += 1
                total_nodes += len(pf.nodes)
                total_edges += len(pf.edges)
    else:
        # Serial path with subprocess isolation for non-Python files (Windows)
        with GraphStore(db_path) as store:
            for i, file_path in enumerate(files):
                rel = normalize_path(file_path, repo_root)
                try:
                    # File may have been deleted between discover and parse
                    if not file_path.is_file():
                        _log.debug("Skipping deleted file: %s", rel)
                        skipped_count += 1
                        if progress_callback:
                            progress_callback(i + 1, len(files))
                        continue

                    needs_isolation = (
                        sys.platform == "win32"
                        and file_path.suffix in _NEEDS_ISOLATION
                    )
                    if needs_isolation:
                        pf, err = _parse_with_timeout(
                            file_path, repo_root, timeout=per_file_timeout,
                        )
                        if err:
                            errors.append(err)
                            if "timeout" in err:
                                skipped_count += 1
                            # pf is None here, so the upsert below is skipped
                            # and the loop-end progress callback still fires.
                    else:
                        _log.debug("Parsing %s", rel)
                        pf = _parse_file(file_path, repo_root)

                    if pf is not None:
                        store.upsert_file(pf)
                        parsed_count += 1
                        total_nodes += len(pf.nodes)
                        total_edges += len(pf.edges)
                except Exception as exc:
                    _log.debug("Error parsing %s: %s", rel, exc)
                    errors.append(f"{rel}: {exc}")
                if progress_callback:
                    progress_callback(i + 1, len(files))

    elapsed = time.perf_counter() - start

    # Mark index as complete
    with GraphStore(db_path) as _meta_store:
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_status", "complete"),
        )
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_files_parsed", str(parsed_count)),
        )
        _meta_store.conn.execute(
            "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)",
            ("index_timestamp", str(time.time())),
        )
        _meta_store.conn.commit()

    return {
        "files_found": len(files),
        "files_parsed": parsed_count,
        "files_skipped": skipped_count,
        "total_nodes": total_nodes,
        "total_edges": total_edges,
        "errors": errors,
        "elapsed_seconds": round(elapsed, 3),
    }


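# Editor's sketch (not part of the package): minimal programmatic use, assuming
# a checkout at ./myrepo (the path and progress printer are hypothetical):
#
#     from pathlib import Path
#     from codebrain.indexer import full_index
#
#     summary = full_index(
#         Path("myrepo"),
#         progress_callback=lambda cur, tot: print(f"{cur}/{tot}", end="\r"),
#     )
#     print(summary["files_parsed"], "parsed,", len(summary["errors"]), "errors")
#
# Because full_index writes index_status to the meta table before and after the
# run, a reader that finds index_status == "in_progress" on startup can treat
# the index as the leftover of an interrupted run and trigger a re-index.

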
def incremental_update(
    repo_root: Path,
    changed_files: list[Path],
    deleted_files: list[Path],
    store: GraphStore,
) -> dict:
    """Re-parse only changed files and remove deleted ones.

    Returns a summary dict.
    """
    start = time.perf_counter()
    updated = 0
    removed = 0
    errors: list[str] = []

    for file_path in deleted_files:
        rel = normalize_path(file_path, repo_root)
        store.remove_file(rel)
        removed += 1

    for file_path in changed_files:
        rel = normalize_path(file_path, repo_root)
        try:
            # Check hash to skip unchanged
            current_hash = file_hash(file_path)
            stored_hash = store.get_file_hash(rel)
            if current_hash == stored_hash:
                continue

            pf = _parse_file(file_path, repo_root)
            store.upsert_file(pf)
            updated += 1
        except Exception as exc:
            errors.append(f"{rel}: {exc}")

    elapsed = time.perf_counter() - start
    return {
        "files_updated": updated,
        "files_removed": removed,
        "errors": errors,
        "elapsed_seconds": round(elapsed, 3),
    }
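

# Editor's sketch (not part of the package): a file watcher would call
# incremental_update with an open GraphStore; db_path, changed, and deleted
# are hypothetical here:
#
#     with GraphStore(db_path) as store:
#         summary = incremental_update(repo_root, changed, deleted, store)
#         for line in summary["errors"]:
#             _log.warning("incremental parse failed: %s", line)
#
# The file-hash short-circuit means callers can pass a noisy changed_files list
# (e.g. raw editor save events) without paying for redundant re-parses.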