codespine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from fastmcp import FastMCP
4
+
5
+ from codespine.analysis.community import detect_communities, symbol_community
6
+ from codespine.analysis.context import build_symbol_context
7
+ from codespine.analysis.coupling import get_coupling
8
+ from codespine.analysis.deadcode import detect_dead_code as detect_dead_code_analysis
9
+ from codespine.analysis.flow import trace_execution_flows as trace_flows_analysis
10
+ from codespine.analysis.impact import analyze_impact
11
+ from codespine.diff.branch_diff import compare_branches as compare_branches_analysis
12
+ from codespine.search.hybrid import hybrid_search
13
+
14
+
15
def build_mcp_server(store, repo_path_provider):
    """Build a FastMCP server exposing codespine's code-analysis tools.

    Args:
        store: graph store exposing ``query_records(cypher, params=None)``.
        repo_path_provider: zero-argument callable returning the repository
            path; called lazily per tool invocation so the path can change.

    Returns:
        A configured ``FastMCP`` instance with all tools registered.
    """
    mcp = FastMCP("codespine")

    @mcp.tool()
    def search_hybrid(query: str, k: int = 20):
        # BM25 + fuzzy + vector rankings fused via RRF (codespine.search.hybrid).
        return hybrid_search(store, query, k=k)

    @mcp.tool()
    def get_impact(symbol: str, max_depth: int = 4):
        return analyze_impact(store, symbol, max_depth=max_depth)

    @mcp.tool()
    def detect_dead_code(limit: int = 200):
        # Thin wrapper; the analysis lives in codespine.analysis.deadcode.
        return detect_dead_code_analysis(store, limit=limit)

    @mcp.tool()
    def trace_execution_flows(entry_symbol: str | None = None, max_depth: int = 6):
        flows = trace_flows_analysis(store, entry_symbol=entry_symbol, max_depth=max_depth)
        return flows

    @mcp.tool()
    def get_symbol_community(symbol: str):
        # Recompute communities before the lookup so the answer is never stale.
        detect_communities(store)
        return symbol_community(store, symbol)

    @mcp.tool()
    def get_change_coupling(symbol: str | None = None, months: int = 6, min_strength: float = 0.3, min_cochanges: int = 3):
        return get_coupling(store, symbol=symbol, months=months, min_strength=min_strength, min_cochanges=min_cochanges)

    @mcp.tool()
    def compare_branches(base_ref: str, head_ref: str):
        # repo_path_provider() is invoked per call, not captured at build time.
        return compare_branches_analysis(repo_path_provider(), base_ref, head_ref)

    @mcp.tool()
    def get_codebase_stats():
        # Three small aggregate queries; cheap enough to run on every call.
        projects = store.query_records("MATCH (p:Project) RETURN p.id as project, p.path as path")
        symbols = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
        calls = store.query_records("MATCH (:Method)-[r:CALLS]->(:Method) RETURN count(r) as count")
        return {
            "projects": projects,
            "symbols": symbols[0]["count"] if symbols else 0,
            "calls": calls[0]["count"] if calls else 0,
        }

    @mcp.tool()
    def get_symbol_context(query: str, max_depth: int = 3):
        return build_symbol_context(store, query, max_depth=max_depth)

    @mcp.tool()
    def run_cypher(query: str):
        # SECURITY NOTE(review): executes arbitrary Cypher, including writes.
        # Acceptable only for a trusted local client — confirm before exposing
        # this server to untrusted callers.
        return store.query_records(query)

    return mcp
@@ -0,0 +1 @@
1
+ """Noise filtering rules."""
@@ -0,0 +1,37 @@
1
+ """Noise filters for call graph generation."""
2
+
3
# Call targets too generic to carry signal in a call graph: JDK Object
# plumbing, collection/stream chaining, and logging calls.
NOISE_METHOD_NAMES = set(
    (
        # printing / logging
        "print println printf log debug info warn error "
        # java.lang.Object
        "hashCode equals toString getClass notify notifyAll wait clone finalize "
        # comparison / size probes
        "compareTo isEmpty size length "
        # streams
        "stream parallelStream forEach map filter collect orElse orElseGet "
        # collection mutation / membership
        "add append remove contains"
    ).split()
)
@@ -0,0 +1 @@
1
+ """Search layer."""
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import re
5
+ from collections import Counter
6
+
7
# Tokens are runs of ASCII letters, digits, and underscores (identifier-ish).
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lower-cased identifier tokens; None/empty -> []."""
    return [match.group(0).lower() for match in TOKEN_RE.finditer(text or "")]


def rank_bm25(query: str, docs: list[tuple[str, str]], k1: float = 1.2, b: float = 0.75) -> list[tuple[str, float]]:
    """Simple BM25 ranker over ``docs``, a list of ``(doc_id, text)`` pairs.

    Returns every doc id paired with its score, best first. An empty corpus
    or a query with no tokens yields [].
    """
    if not docs:
        return []

    query_terms = tokenize(query)
    if not query_terms:
        return []

    corpus = [(doc_id, tokenize(text)) for doc_id, text in docs]
    avg_len = sum(len(tokens) for _, tokens in corpus) / max(len(corpus), 1)

    # Per-document term frequencies plus document frequency of each term.
    df: Counter[str] = Counter()
    tf_by_doc: dict[str, Counter[str]] = {}
    for doc_id, tokens in corpus:
        counts = Counter(tokens)
        tf_by_doc[doc_id] = counts
        df.update(counts.keys())

    total_docs = len(corpus)
    scores = dict.fromkeys((doc_id for doc_id, _ in corpus), 0.0)
    for term in query_terms:
        term_df = df.get(term, 0)
        if not term_df:
            continue
        idf = math.log(1 + (total_docs - term_df + 0.5) / (term_df + 0.5))
        for doc_id, tokens in corpus:
            freq = tf_by_doc[doc_id].get(term, 0)
            if not freq:
                continue
            doc_len = len(tokens)
            # Length normalization; max(..., 1e-9) guards degenerate corpora.
            norm = freq + k1 * (1 - b + b * (doc_len / max(avg_len, 1e-9)))
            scores[doc_id] += idf * ((freq * (k1 + 1)) / max(norm, 1e-9))

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+
4
def levenshtein(a: str, b: str) -> int:
    """Edit distance between *a* and *b* (insert/delete/substitute, unit cost)."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)

    # Rolling single-row DP: row[j] holds the distance between a[:i] and b[:j].
    row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        next_row = [i]
        for j, char_b in enumerate(b, start=1):
            cost_insert = next_row[j - 1] + 1
            cost_delete = row[j] + 1
            cost_subst = row[j - 1] + (char_a != char_b)
            next_row.append(min(cost_insert, cost_delete, cost_subst))
        row = next_row
    return row[-1]
22
+
23
+
24
def normalized_similarity(a: str, b: str) -> float:
    """Case-insensitive similarity in [0, 1]; 1.0 means identical (or both empty)."""
    left = (a or "").lower()
    right = (b or "").lower()
    if not (left or right):
        return 1.0
    longest = max(len(left), len(right), 1)
    return 1.0 - levenshtein(left, right) / longest
31
+
32
+
33
def rank_fuzzy(query: str, docs: list[tuple[str, str]]) -> list[tuple[str, float]]:
    """Rank ``(doc_id, text)`` pairs by fuzzy similarity to *query*, best first."""
    return sorted(
        ((doc_id, normalized_similarity(query, text)) for doc_id, text in docs),
        key=lambda item: item[1],
        reverse=True,
    )
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from codespine.config import SETTINGS
4
+ from codespine.search.bm25 import rank_bm25
5
+ from codespine.search.fuzzy import rank_fuzzy
6
+ from codespine.search.rrf import reciprocal_rank_fusion
7
+ from codespine.search.vector import rank_semantic
8
+
9
+
10
def hybrid_search(store, query: str, k: int = 20) -> list[dict]:
    """Hybrid symbol search: BM25 + semantic + fuzzy rankings fused via RRF.

    Pulls a bounded candidate pool of symbols from the store, ranks it three
    ways, fuses the rankings, applies light heuristics (down-weight tests,
    up-weight methods/classes), and attaches community/flow context to the
    top ``k`` hits.
    """
    candidates = store.query_records(
        """
        MATCH (s:Symbol), (f:File {id: s.file_id})
        RETURN s.id as id,
               s.kind as kind,
               s.name as name,
               s.fqname as fqname,
               s.embedding as embedding,
               f.path as file_path,
               f.is_test as is_test
        LIMIT $lim
        """,
        {"lim": SETTINGS.semantic_candidate_pool},
    )

    if not candidates:
        return []

    lexical = [(row["id"], f"{row.get('name', '')} {row.get('fqname', '')}") for row in candidates]
    names_only = [(row["id"], row.get("name", "")) for row in candidates]
    embeddings = [(row["id"], row.get("embedding")) for row in candidates]

    # Ranking order matters for tie-breaking in the fused dict's insertion order.
    fused = reciprocal_rank_fusion(
        [rank_bm25(query, lexical), rank_semantic(query, embeddings), rank_fuzzy(query, names_only)],
        k=SETTINGS.rrf_k,
    )
    by_id = {row["id"]: row for row in candidates}

    hits = []
    # Consider 3x the requested size so boosts/penalties can reorder the tail.
    for doc_id, fused_score in fused[: max(k * 3, k)]:
        row = by_id.get(doc_id)
        if not row:
            continue

        # Heuristics: halve test symbols, boost methods/classes by 20%.
        weight = 1.0
        if row.get("is_test"):
            weight *= 0.5
        if row.get("kind") in {"method", "class"}:
            weight *= 1.2

        hits.append(
            {
                "id": doc_id,
                "kind": row.get("kind"),
                "name": row.get("name"),
                "fqname": row.get("fqname"),
                "file_path": row.get("file_path"),
                "score": fused_score * weight,
            }
        )

    hits.sort(key=lambda item: item["score"], reverse=True)

    # Attach architectural context in same response.
    for hit in hits[:k]:
        hit["context"] = store.query_records(
            """
            MATCH (s:Symbol {id: $sid})-[:IN_COMMUNITY]->(c:Community)
            OPTIONAL MATCH (s)-[f:IN_FLOW]->(fl:Flow)
            RETURN c.id as community_id, c.label as community_label,
                   fl.id as flow_id, fl.kind as flow_kind, f.depth as flow_depth
            LIMIT 3
            """,
            {"sid": hit["id"]},
        )

    return hits[:k]
@@ -0,0 +1,9 @@
1
+ from __future__ import annotations
2
+
3
+
4
def reciprocal_rank_fusion(rankings: list[list[tuple[str, float]]], k: int = 60) -> list[tuple[str, float]]:
    """Fuse several ``(doc_id, score)`` rankings with reciprocal rank fusion.

    Only rank positions matter — the input scores are ignored. Each list
    contributes ``1 / (k + rank)`` per doc; the fused list is best-first.
    """
    fused: dict[str, float] = {}
    for ranked_list in rankings:
        for position, (doc_id, _score) in enumerate(ranked_list, start=1):
            fused[doc_id] = fused.get(doc_id, 0.0) + 1.0 / (k + position)
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import math
5
+ import sqlite3
6
+ from functools import lru_cache
7
+
8
+ from codespine.config import SETTINGS
9
+
10
+
11
+ def _hash_vector(text: str, dim: int) -> list[float]:
12
+ """Deterministic fallback embedding when sentence-transformers is unavailable."""
13
+ vec = [0.0] * dim
14
+ if not text:
15
+ return vec
16
+ tokens = text.lower().split()
17
+ for token in tokens:
18
+ digest = hashlib.sha1(token.encode("utf-8")).digest()
19
+ idx = int.from_bytes(digest[:2], "big") % dim
20
+ sign = 1.0 if digest[2] % 2 == 0 else -1.0
21
+ vec[idx] += sign
22
+ norm = math.sqrt(sum(v * v for v in vec)) or 1.0
23
+ return [v / norm for v in vec]
24
+
25
+
26
+ @lru_cache(maxsize=1)
27
+ def _load_model():
28
+ try:
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ return SentenceTransformer(SETTINGS.embedding_model)
32
+ except Exception:
33
+ return None
34
+
35
+
36
@lru_cache(maxsize=1)
def _embedding_cache_conn():
    """Open the on-disk embedding cache once and ensure its table exists."""
    connection = sqlite3.connect(SETTINGS.embedding_cache_db)
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS embedding_cache (
            cache_key TEXT PRIMARY KEY,
            dim INTEGER NOT NULL,
            vector_json TEXT NOT NULL
        )
        """
    )
    # NOTE(review): sqlite3 connections are single-thread by default
    # (check_same_thread=True) — confirm all callers share one thread.
    return connection
49
+
50
+
51
def _cache_key(text: str, dim: int) -> str:
    """Stable cache key: sha1 over the model name, dimension, and raw text."""
    payload = f"{SETTINGS.embedding_model}|{dim}|{text}"
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()
53
+
54
+
55
def _get_cached_embedding(text: str, dim: int) -> list[float] | None:
    """Return the cached vector for ``(text, dim)``, or None on a cache miss."""
    import json

    row = _embedding_cache_conn().execute(
        "SELECT vector_json FROM embedding_cache WHERE cache_key = ? AND dim = ?",
        (_cache_key(text, dim), dim),
    ).fetchone()
    if row is None:
        return None
    return [float(component) for component in json.loads(row[0])]
64
+
65
+
66
def _set_cached_embedding(text: str, dim: int, vec: list[float]) -> None:
    """Upsert the vector for ``(text, dim)`` into the sqlite cache."""
    import json

    connection = _embedding_cache_conn()
    connection.execute(
        "INSERT OR REPLACE INTO embedding_cache(cache_key, dim, vector_json) VALUES (?, ?, ?)",
        (_cache_key(text, dim), dim, json.dumps(vec)),
    )
    connection.commit()
76
+
77
+
78
def embed_text(text: str, dim: int | None = None) -> list[float]:
    """Embed *text*, consulting the sqlite cache first.

    Uses the sentence-transformers model when available, otherwise the
    deterministic hash fallback. Results are cached under (model, dim, text).
    ``dim`` defaults to ``SETTINGS.vector_dim``.
    """
    dim = dim or SETTINGS.vector_dim
    normalized = text or ""

    cached = _get_cached_embedding(normalized, dim)
    if cached is not None:
        return cached

    model = _load_model()
    if model is None:
        # Offline / optional-dependency fallback.
        vec = _hash_vector(text, dim)
    else:
        # NOTE(review): the model's output dimension comes from the model, not
        # ``dim`` — presumably they agree via SETTINGS; confirm.
        vec = [float(x) for x in model.encode([normalized], normalize_embeddings=True)[0]]
    _set_cached_embedding(normalized, dim, vec)
    return vec
93
+
94
+
95
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Cosine similarity over the overlapping prefix of the two vectors.

    Mismatched lengths are truncated to the shorter; empty input or an
    all-zero vector yields 0.0 (the `or 1.0` guards division by zero).
    """
    if not vec_a or not vec_b:
        return 0.0
    pairs = list(zip(vec_a, vec_b))
    dot = sum(x * y for x, y in pairs)
    norm_a = math.sqrt(sum(x * x for x, _ in pairs)) or 1.0
    norm_b = math.sqrt(sum(y * y for _, y in pairs)) or 1.0
    return dot / (norm_a * norm_b)
103
+
104
+
105
def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
    """Rank ``(doc_id, embedding)`` pairs by cosine similarity to *query*.

    Docs without an embedding (None) are dropped. Best first.
    """
    query_vec = embed_text(query)
    scored = [
        (doc_id, cosine_similarity(query_vec, embedding))
        for doc_id, embedding in docs
        if embedding is not None
    ]
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored
@@ -0,0 +1 @@
1
+ """Watch mode layer."""
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ from codespine.analysis.community import detect_communities
6
+ from codespine.analysis.coupling import compute_coupling
7
+ from codespine.analysis.deadcode import detect_dead_code
8
+ from codespine.analysis.flow import trace_execution_flows
9
+ from codespine.config import SETTINGS
10
+ from codespine.indexer.engine import JavaIndexer
11
+
12
+
13
def run_watch_mode(store, path: str, global_interval: int = SETTINGS.default_global_interval_s) -> None:
    """Watch *path* and incrementally re-index whenever ``.java`` files change.

    Global analyses (communities, dead code, flows, coupling) refresh at most
    once per ``global_interval`` seconds. Blocks forever.

    Raises:
        RuntimeError: when the optional ``watchfiles`` dependency is missing.
    """
    try:
        from watchfiles import watch
    except Exception as exc:  # pragma: no cover
        raise RuntimeError("watchfiles is required for watch mode") from exc

    indexer = JavaIndexer(store)
    # Start at 0.0 so the first batch of changes also triggers the global pass.
    last_global = 0.0
    print(f"Watching {path} for changes...")

    for changes in watch(path):
        java_files = [changed_path for _, changed_path in changes if changed_path.endswith(".java")]
        if not java_files:
            continue

        started = time.time()
        result = indexer.index_project(path, full=False)
        elapsed = time.time() - started
        print(f"[{time.strftime('%H:%M:%S')}] {len(java_files)} file(s) modified -> re-indexed ({elapsed:.1f}s)")

        if time.time() - last_global >= global_interval:
            detect_communities(store)
            detect_dead_code(store, limit=200)
            trace_execution_flows(store)
            compute_coupling(store, path, result.project_id)
            last_global = time.time()