PyPI - codexlr8 - Versions diffs - 0.0.1__py3-none-any.whl - Mend

codexlr8 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

codexlr8/__init__.py +3 -0
codexlr8/cli.py +515 -0
codexlr8/config.py +47 -0
codexlr8/mcp_server.py +163 -0
codexlr8/meta.py +110 -0
codexlr8/scanner.py +82 -0
codexlr8/search.py +405 -0
codexlr8-0.0.1.dist-info/METADATA +152 -0
codexlr8-0.0.1.dist-info/RECORD +13 -0
codexlr8-0.0.1.dist-info/WHEEL +5 -0
codexlr8-0.0.1.dist-info/entry_points.txt +3 -0
codexlr8-0.0.1.dist-info/licenses/LICENSE +201 -0
codexlr8-0.0.1.dist-info/top_level.txt +1 -0

codexlr8/search.py ADDED Viewed

@@ -0,0 +1,405 @@
+"""Search engine — SQLite FTS5 index with custom ranking for code search."""
+from __future__ import annotations
+import fnmatch
+import os
+import re
+import sqlite3
+from datetime import datetime, timezone
+from .config import load_config
+from .meta import META_EXTENSION, read_meta
+from .scanner import scan_project
+INDEX_DB_NAME = ".codexlr8_index.db"
+def _is_init_file(path: str) -> bool:
+    return os.path.basename(path) == "__init__.py"
+def _tokenize(text: str) -> list[str]:
+    if not text:
+        return []
+    # Capture identifiers (letter-starting) and standalone numbers
+    tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*|\d+", text.lower())
+    return [t for t in tokens if len(t) > 1 or t.isdigit()]  # skip single letters
+def _token_match_ratio(tokens: list[str], text: str) -> float:
+    """What fraction of query tokens appear in the document text?"""
+    if not tokens:
+        return 0.0
+    text_lower = text.lower()
+    matched = sum(1 for t in tokens if t in text_lower)
+    return matched / len(tokens)
+def _matches_exclude(path: str, excludes: list[str]) -> bool:
+    """Check if a file path matches any exclude pattern."""
+    basename = os.path.basename(path)
+    for pattern in excludes:
+        if fnmatch.fnmatch(path, pattern):
+            return True
+        if fnmatch.fnmatch(basename, pattern):
+            return True
+    return False
+class SearchEngine:
+    """SQLite FTS5-backed search engine for a codebase."""
+    def __init__(self, project_path: str):
+        self.project_path = os.path.abspath(project_path)
+        self.db_path = os.path.join(self.project_path, INDEX_DB_NAME)
+        self._config = None
+    @property
+    def config(self) -> dict:
+        if self._config is None:
+            self._config = load_config(self.project_path)
+        return self._config
+    def _get_connection(self) -> sqlite3.Connection:
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA synchronous=NORMAL")
+        return conn
+    def build_index(self, incremental: bool = False,
+                    exclude: list[str] | None = None,
+                    include: list[str] | None = None) -> int:
+        """Build the full search index.
+        If incremental=True, only re-index changed/new/removed files.
+        include/exclude are glob patterns; fall back to config defaults.
+        Returns number of files indexed/mutated.
+        """
+        if exclude is None:
+            exclude = self.config.get("exclude", [])
+        if include is None:
+            include = self.config.get("include", [])
+        root = self.config.get("root", ".")
+        scan_root = os.path.join(self.project_path, root)
+        files_data = scan_project(
+            scan_root,
+            extensions=self.config.get("extensions"),
+            ignore_dirs=self.config.get("ignore_dirs"),
+            include=include,
+            exclude=exclude,
+        )
+        conn = self._get_connection()
+        if not incremental:
+            conn.execute("DROP TABLE IF EXISTS files")
+            conn.execute("""
+                CREATE VIRTUAL TABLE IF NOT EXISTS files USING fts5(
+                    path, summary, tags, public_api, content,
+                    tokenize='porter unicode61'
+                )
+            """)
+            conn.execute("DROP TABLE IF EXISTS file_meta")
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS file_meta (
+                    path TEXT PRIMARY KEY,
+                    content_size INTEGER,
+                    has_meta BOOLEAN,
+                    is_init BOOLEAN,
+                    file_mtime REAL,
+                    index_built_at TEXT
+                )
+            """)
+        conn.execute("""
+            CREATE VIRTUAL TABLE IF NOT EXISTS files USING fts5(
+                path, summary, tags, public_api, content,
+                tokenize='porter unicode61'
+            )
+        """)
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS file_meta (
+                path TEXT PRIMARY KEY,
+                content_size INTEGER,
+                has_meta BOOLEAN,
+                is_init BOOLEAN,
+                file_mtime REAL,
+                index_built_at TEXT
+            )
+        """)
+        now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+        if incremental:
+            count = self._incremental_update(conn, files_data, now)
+        else:
+            count = self._full_rebuild(conn, files_data, now)
+        conn.commit()
+        conn.close()
+        return count
+    def _full_rebuild(self, conn: sqlite3.Connection, files_data: list[dict], now: str) -> int:
+        conn.execute("DELETE FROM files")
+        conn.execute("DELETE FROM file_meta")
+        count = 0
+        for entry in files_data:
+            self._index_file(conn, entry, now)
+            count += 1
+        return count
+    def _incremental_update(self, conn: sqlite3.Connection, files_data: list[dict], now: str) -> int:
+        current_files: dict[str, float] = {}
+        file_data_map: dict[str, dict] = {}
+        for entry in files_data:
+            abspath = os.path.join(self.project_path, entry["path"])
+            mtime = os.path.getmtime(abspath)
+            current_files[entry["path"]] = mtime
+            file_data_map[entry["path"]] = entry
+        indexed = conn.execute("SELECT path, file_mtime FROM file_meta").fetchall()
+        indexed_map = {row["path"]: row["file_mtime"] for row in indexed}
+        count = 0
+        removed = set(indexed_map) - set(current_files)
+        for path in removed:
+            conn.execute("DELETE FROM files WHERE path = ?", (path,))
+            conn.execute("DELETE FROM file_meta WHERE path = ?", (path,))
+            count += 1
+        for path, mtime in current_files.items():
+            if path not in indexed_map or mtime > indexed_map[path]:
+                self._index_file(conn, file_data_map[path], now, replace=True)
+                count += 1
+        return count
+    def _index_file(self, conn: sqlite3.Connection, entry: dict, now: str, replace: bool = False):
+        path = entry["path"]
+        content = entry.get("content", "")
+        abspath = os.path.join(self.project_path, path)
+        meta = read_meta(abspath + META_EXTENSION) or {}
+        mtime = os.path.getmtime(abspath)
+        summary = meta.get("summary", "")
+        tags = " ".join(meta.get("tags", []))
+        public_api = " ".join(meta.get("public_api", []))
+        if replace:
+            conn.execute("DELETE FROM files WHERE path = ?", (path,))
+        conn.execute(
+            "INSERT INTO files (path, summary, tags, public_api, content) "
+            "VALUES (?, ?, ?, ?, ?)",
+            (path, summary, tags, public_api, content),
+        )
+        line_count = content.count('\n')
+        if content and not content.endswith('\n'):
+            line_count += 1
+        conn.execute(
+            "INSERT OR REPLACE INTO file_meta "
+            "(path, content_size, has_meta, is_init, file_mtime, index_built_at) "
+            "VALUES (?, ?, ?, ?, ?, ?)",
+            (path, line_count, bool(meta), _is_init_file(path), mtime, now),
+        )
+    def search(self, query: str, limit: int = 10,
+               exclude: list[str] | None = None) -> list[dict]:
+        """Search the codebase and return ranked results.
+        Uses AND semantics: all query tokens must match (like Google).
+        Falls back to OR if AND returns nothing, with a post-filter
+        requiring at least 50% of query tokens to match the document.
+        """
+        if not os.path.exists(self.db_path):
+            return []
+        tokens = _tokenize(query)
+        if not tokens:
+            return []
+        if exclude is None:
+            exclude = self.config.get("exclude", [])
+        conn = self._get_connection()
+        # Stage 1: try AND (best precision)
+        and_query = " AND ".join(tokens)
+        cursor = conn.execute(
+            "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
+            "       m.is_init, rank "
+            "FROM files f "
+            "JOIN file_meta m ON f.path = m.path "
+            "WHERE files MATCH ? "
+            "ORDER BY rank "
+            "LIMIT ?",
+            (and_query, limit * 5),
+        )
+        rows = cursor.fetchall()
+        # Stage 2: fall back to OR if AND found nothing
+        if not rows and len(tokens) > 1:
+            or_query = " OR ".join(tokens)
+            cursor = conn.execute(
+                "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
+                "       m.is_init, rank "
+                "FROM files f "
+                "JOIN file_meta m ON f.path = m.path "
+                "WHERE files MATCH ? "
+                "ORDER BY rank "
+                "LIMIT ?",
+                (or_query, limit * 10),
+            )
+            rows = cursor.fetchall()
+        # Stage 3: post-filter by token coverage
+        min_ratio = 0.5 if len(tokens) >= 4 else 0.0
+        results = []
+        for row in rows:
+            if _matches_exclude(row["path"], exclude):
+                continue
+            content = row["content"] or ""
+            ratio = _token_match_ratio(tokens, content + (row["summary"] or "") + (row["tags"] or ""))
+            if ratio < min_ratio:
+                continue
+            score = self._compute_score(tokens, dict(row), ratio)
+            if row["is_init"]:
+                score *= 0.6
+            results.append({
+                "path": row["path"],
+                "summary": row["summary"] or None,
+                "tags": (row["tags"] or "").split(),
+                "public_api": row["public_api"] or "",
+                "score": score,
+            })
+        conn.close()
+        results.sort(key=lambda r: r["score"], reverse=True)
+        final = []
+        for r in results[:limit]:
+            preview, line_range = self._get_preview(r["path"], tokens)
+            final.append({
+                "path": r["path"],
+                "line_start": line_range[0],
+                "line_end": line_range[1],
+                "summary": r["summary"],
+                "tags": r["tags"],
+                "score": r["score"],
+                "preview": preview,
+            })
+        return final
+    def _compute_score(self, tokens: list[str], row: dict, match_ratio: float = 1.0) -> float:
+        """Compute relevance score.
+        Core ranking: BM25 from FTS5 (via 'rank') provides the base score.
+        On top of that:
+        - Metadata boost: public_api (1.0) > tags (0.8) > summary (0.6)
+        - Match ratio: fraction of query tokens found in the document
+        - init.py penalty: 0.6x (applied in search())
+        """
+        score = 0.0
+        public_api = (row.get("public_api") or "").lower()
+        summary = (row.get("summary") or "").lower()
+        tags = (row.get("tags") or "").lower()
+        api_tokens = set(_tokenize(public_api))
+        tag_tokens = set(tags.split())
+        summary_tokens = set(_tokenize(summary))
+        for token in tokens:
+            if token in api_tokens:
+                score += 1.0
+            elif token in tag_tokens:
+                score += 0.8
+            elif token in summary_tokens:
+                score += 0.6
+            else:
+                # Content match via BM25 — base weight
+                score += 0.3
+        # Multiply by match ratio: files matching more query terms rank higher
+        score *= match_ratio
+        return round(score, 4)
+    def _get_preview(self, relpath: str, tokens: list[str]) -> tuple[str, tuple[int, int]]:
+        filepath = os.path.join(self.project_path, relpath)
+        if not os.path.exists(filepath):
+            return "", (0, 0)
+        try:
+            with open(filepath, "r", encoding="utf-8", errors="replace") as f:
+                lines = f.readlines()
+        except Exception:
+            return "", (0, 0)
+        if not lines:
+            return "", (0, 0)
+        best_line = 0
+        best_matches = 0
+        for i, line in enumerate(lines):
+            line_lower = line.lower()
+            matches = sum(1 for t in tokens if t in line_lower)
+            if matches > best_matches:
+                best_matches = matches
+                best_line = i
+        start = max(0, best_line - 2)
+        end = min(len(lines), best_line + 8)
+        snippet = "".join(lines[start:end])
+        return snippet, (start + 1, end)
+    def status(self) -> dict:
+        result = {
+            "project_path": self.project_path,
+            "files_indexed": 0,
+            "files_with_meta": 0,
+            "files_without_meta": 0,
+            "total_lines": 0,
+            "index_age": "No index yet",
+        }
+        if not os.path.exists(self.db_path):
+            return result
+        conn = self._get_connection()
+        row = conn.execute("SELECT COUNT(*) as cnt FROM files").fetchone()
+        result["files_indexed"] = row["cnt"] if row else 0
+        row = conn.execute("SELECT COUNT(*) as cnt FROM file_meta WHERE has_meta = 1").fetchone()
+        result["files_with_meta"] = row["cnt"] if row else 0
+        result["files_without_meta"] = result["files_indexed"] - result["files_with_meta"]
+        row = conn.execute("SELECT SUM(content_size) as total FROM file_meta").fetchone()
+        result["total_lines"] = row["total"] or 0
+        mtime = os.path.getmtime(self.db_path)
+        mtime_dt = datetime.fromtimestamp(mtime)
+        age = datetime.now() - mtime_dt
+        if age.seconds < 60:
+            result["index_age"] = f"{age.seconds}s ago"
+        elif age.seconds < 3600:
+            result["index_age"] = f"{age.seconds // 60}m ago"
+        else:
+            result["index_age"] = f"{age.seconds // 3600}h ago"
+        conn.close()
+        return result

codexlr8-0.0.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,152 @@
+Metadata-Version: 2.4
+Name: codexlr8
+Version: 0.0.1
+Summary: A codebase search engine for LLM coding agents
+Author-email: Sadig Akhund <sadigaxund@gmail.com>
+License: Apache-2.0
+Project-URL: Homepage, https://github.com/sadigaxund/codexlr8
+Project-URL: Repository, https://github.com/sadigaxund/codexlr8
+Project-URL: Issues, https://github.com/sadigaxund/codexlr8/issues
+Keywords: code-search,llm,agent,navigation,mcp
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: click>=8.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: mcp>=1.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0; extra == "dev"
+Dynamic: license-file
+# CodeXLR8
+[![PyPI version](https://img.shields.io/pypi/v/codexlr8)](https://pypi.org/project/codexlr8/)
+[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/downloads/)
+[![License](https://img.shields.io/badge/license-Apache%202.0-green)](LICENSE)
+[![CI](https://github.com/sadigaxund/codexlr8/actions/workflows/test.yml/badge.svg)](https://github.com/sadigaxund/codexlr8/actions)
+A codebase search engine for LLM coding agents. **One query, precise results, no noise.**
+## Setup
+```bash
+pip install codexlr8
+codexlr8 setup
+```
+`setup` auto-detects MCP clients (Claude Code, Cursor) and injects the server config, then walks you through project configuration. After setup, build the search index:
+```bash
+codexlr8 index .
+```
+Your agents now have `codebase_search` and `codebase_index` tools. Search from the CLI yourself:
+```bash
+codexlr8 search . "login auth"
+# 1. auth/session.py:14-27  [score: 1.60]
+#    meta: User authentication — login, logout, session management
+#    tags: auth, login, session, security
+```
+## How It Works
+CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `.meta.yaml` sidecar files that boost ranking precision:
+| Layer | Source | Boost |
+|---|---|---|
+| 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
+| 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
+| 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
+Search uses AND semantics (like Google): all query tokens must match. If that returns no results, it falls back to OR semantics with a ≥50% token-match threshold.
+## .meta.yaml Sidecars
+Optional YAML files next to source files, created by `codexlr8 init`:
+```yaml
+public_api: [login, logout, reset_password]
+summary: "User auth: login, session, password reset"
+tags: [auth, security, session]
+invariants:
+  - "db.connect() must be called first"
+```
+Files without `.meta.yaml` still get indexed — metadata just produces higher ranking scores.
+## Configuration
+Optional `.codexlr8.yaml` at the project root:
+```yaml
+root: "."
+include: []                     # scope: only scan these
+exclude:                        # skip these
+  - tests/*
+  - test_*
+extensions:                     # file types to index
+  - .py
+  - .js
+ignore_dirs:                    # skip entirely
+  - .git
+  - __pycache__
+```
+All fields have defaults. Use `codexlr8 setup` to create one interactively, or edit by hand.
+## Agent Integration
+Works with **Claude Code, Cursor, Windsurf, Continue.dev** and any MCP-compatible client.
+`codexlr8 setup` auto-detects installed clients and offers to inject the MCP server config. For manual setup, add this to your client's config:
+```json
+{
+  "mcpServers": {
+    "codexlr8": {
+      "command": "uvx",
+      "args": ["codexlr8", "mcp-server"]
+    }
+  }
+}
+```
+Tools available to agents:
+| Tool | Description |
+|---|---|
+| `codebase_search(query, path?, limit?, exclude?)` | Search the codebase, return ranked results |
+| `codebase_index(path?, incremental?, exclude?)` | Build or update the search index |
+The included agent skill ([SKILL.md](SKILL.md)) teaches agents to search before reading files, maintain `.meta.yaml` sidecars, and keep the index fresh.
+## Commands
+```
+codexlr8 setup            Interactive project + MCP config
+codexlr8 scan <path>      List source files and line counts
+codexlr8 init <path>      Bootstrap .meta.yaml sidecars
+codexlr8 index <path>     Build the search index
+codexlr8 search <path> <q> Search the codebase
+codexlr8 status <path>    Show index coverage and age
+codexlr8 install-skill    Install agent skill for Claude Code
+codexlr8 mcp-config       Print MCP client config JSON
+```
+## Contributing
+See [AGENTS.md](AGENTS.md) for principles and development guidelines.
+## License
+Apache 2.0. See [LICENSE](LICENSE).

codexlr8-0.0.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+codexlr8/__init__.py,sha256=QddVPI3SmCeQq8QyXSCgrO5tBQTnWaigxknfP-iKzao,90
+codexlr8/cli.py,sha256=yJTm_Z0BJU1t4PTTBMjgVScmEy9osF5dYfiTYi2Tb9U,18216
+codexlr8/config.py,sha256=zlsxAnRhbpK-SJ2uN-t5O14mRX5wbzMWWB4Jg0CHPXw,1333
+codexlr8/mcp_server.py,sha256=VqvtUpHMv6XRuRRjU2cuQuT-hnAiGtoYgz3W-bSsnaU,5584
+codexlr8/meta.py,sha256=OyjAqD6OBdaC3gfAGpiuE9QSpHOa2gKS4jPUMXsRSw4,3270
+codexlr8/scanner.py,sha256=AyTO5EtDlOwWXjtnd5_7kMUx22XEP9X4kl0UX4khoro,2882
+codexlr8/search.py,sha256=ZmwkAnB02ZbFUcp1zfp5DUIYOYwLc1ubT-TOrcCL7iI,13851
+codexlr8-0.0.1.dist-info/licenses/LICENSE,sha256=wAtXn9YalS-tNHgydkrIPFDouPpqARf2ObixOWadQUo,11342
+codexlr8-0.0.1.dist-info/METADATA,sha256=7HxYFaju53MLFuLI6_v-vmEg7Nx9WU1DKgekL6yMIuE,4942
+codexlr8-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+codexlr8-0.0.1.dist-info/entry_points.txt,sha256=hOg94qhE4Emf4OyHUye9m5_KhfF-hcqzOkE5gfXbVrQ,87
+codexlr8-0.0.1.dist-info/top_level.txt,sha256=GxUlzPqgBl_1BIcFTCuzzD_hR3vX6DweiueEksKq0zg,9
+codexlr8-0.0.1.dist-info/RECORD,,

codexlr8-0.0.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

codexlr8-0.0.1.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+codexlr8 = codexlr8.cli:main
+codexlr8-mcp = codexlr8.mcp_server:main