codespine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from fastmcp import FastMCP
4
+
5
+ from codespine.analysis.community import detect_communities, symbol_community
6
+ from codespine.analysis.context import build_symbol_context
7
+ from codespine.analysis.coupling import get_coupling
8
+ from codespine.analysis.deadcode import detect_dead_code as detect_dead_code_analysis
9
+ from codespine.analysis.flow import trace_execution_flows as trace_flows_analysis
10
+ from codespine.analysis.impact import analyze_impact
11
+ from codespine.diff.branch_diff import compare_branches as compare_branches_analysis
12
+ from codespine.search.hybrid import hybrid_search
13
+
14
+
15
def build_mcp_server(store, repo_path_provider):
    """Build a FastMCP server exposing codespine's code-analysis tools.

    Args:
        store: graph store exposing ``query_records(cypher, params=None)``.
        repo_path_provider: zero-argument callable returning the repository
            path; called lazily per tool invocation so the path can change.

    Returns:
        A configured ``FastMCP`` instance with all tools registered.
    """
    mcp = FastMCP("codespine")

    @mcp.tool()
    def search_hybrid(query: str, k: int = 20):
        # BM25 + fuzzy + vector rankings fused via RRF (codespine.search.hybrid).
        return hybrid_search(store, query, k=k)

    @mcp.tool()
    def get_impact(symbol: str, max_depth: int = 4):
        return analyze_impact(store, symbol, max_depth=max_depth)

    @mcp.tool()
    def detect_dead_code(limit: int = 200):
        # Thin wrapper; the analysis lives in codespine.analysis.deadcode.
        return detect_dead_code_analysis(store, limit=limit)

    @mcp.tool()
    def trace_execution_flows(entry_symbol: str | None = None, max_depth: int = 6):
        flows = trace_flows_analysis(store, entry_symbol=entry_symbol, max_depth=max_depth)
        return flows

    @mcp.tool()
    def get_symbol_community(symbol: str):
        # Recompute communities before the lookup so the answer is never stale.
        detect_communities(store)
        return symbol_community(store, symbol)

    @mcp.tool()
    def get_change_coupling(symbol: str | None = None, months: int = 6, min_strength: float = 0.3, min_cochanges: int = 3):
        return get_coupling(store, symbol=symbol, months=months, min_strength=min_strength, min_cochanges=min_cochanges)

    @mcp.tool()
    def compare_branches(base_ref: str, head_ref: str):
        # repo_path_provider() is invoked per call, not captured at build time.
        return compare_branches_analysis(repo_path_provider(), base_ref, head_ref)

    @mcp.tool()
    def get_codebase_stats():
        # Three small aggregate queries; cheap enough to run on every call.
        projects = store.query_records("MATCH (p:Project) RETURN p.id as project, p.path as path")
        symbols = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
        calls = store.query_records("MATCH (:Method)-[r:CALLS]->(:Method) RETURN count(r) as count")
        return {
            "projects": projects,
            "symbols": symbols[0]["count"] if symbols else 0,
            "calls": calls[0]["count"] if calls else 0,
        }

    @mcp.tool()
    def get_symbol_context(query: str, max_depth: int = 3):
        return build_symbol_context(store, query, max_depth=max_depth)

    @mcp.tool()
    def run_cypher(query: str):
        # SECURITY NOTE(review): executes arbitrary Cypher, including writes.
        # Acceptable only for a trusted local client — confirm before exposing
        # this server to untrusted callers.
        return store.query_records(query)

    return mcp
@@ -0,0 +1 @@
1
+ """Noise filtering rules."""
@@ -0,0 +1,37 @@
1
+ """Noise filters for call graph generation."""
2
+
3
# Call targets too generic to carry signal in a call graph: JDK Object
# plumbing, collection/stream chaining, and logging calls.
NOISE_METHOD_NAMES = set(
    (
        # printing / logging
        "print println printf log debug info warn error "
        # java.lang.Object
        "hashCode equals toString getClass notify notifyAll wait clone finalize "
        # comparison / size probes
        "compareTo isEmpty size length "
        # streams
        "stream parallelStream forEach map filter collect orElse orElseGet "
        # collection mutation / membership
        "add append remove contains"
    ).split()
)
@@ -0,0 +1 @@
1
+ """Search layer."""
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import re
5
+ from collections import Counter
6
+
7
# Tokens are runs of ASCII letters, digits, and underscores (identifier-ish).
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lower-cased identifier tokens; None/empty -> []."""
    return [match.group(0).lower() for match in TOKEN_RE.finditer(text or "")]


def rank_bm25(query: str, docs: list[tuple[str, str]], k1: float = 1.2, b: float = 0.75) -> list[tuple[str, float]]:
    """Simple BM25 ranker over ``docs``, a list of ``(doc_id, text)`` pairs.

    Returns every doc id paired with its score, best first. An empty corpus
    or a query with no tokens yields [].
    """
    if not docs:
        return []

    query_terms = tokenize(query)
    if not query_terms:
        return []

    corpus = [(doc_id, tokenize(text)) for doc_id, text in docs]
    avg_len = sum(len(tokens) for _, tokens in corpus) / max(len(corpus), 1)

    # Per-document term frequencies plus document frequency of each term.
    df: Counter[str] = Counter()
    tf_by_doc: dict[str, Counter[str]] = {}
    for doc_id, tokens in corpus:
        counts = Counter(tokens)
        tf_by_doc[doc_id] = counts
        df.update(counts.keys())

    total_docs = len(corpus)
    scores = dict.fromkeys((doc_id for doc_id, _ in corpus), 0.0)
    for term in query_terms:
        term_df = df.get(term, 0)
        if not term_df:
            continue
        idf = math.log(1 + (total_docs - term_df + 0.5) / (term_df + 0.5))
        for doc_id, tokens in corpus:
            freq = tf_by_doc[doc_id].get(term, 0)
            if not freq:
                continue
            doc_len = len(tokens)
            # Length normalization; max(..., 1e-9) guards degenerate corpora.
            norm = freq + k1 * (1 - b + b * (doc_len / max(avg_len, 1e-9)))
            scores[doc_id] += idf * ((freq * (k1 + 1)) / max(norm, 1e-9))

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+
4
def levenshtein(a: str, b: str) -> int:
    """Edit distance between *a* and *b* (insert/delete/substitute, unit cost)."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)

    # Rolling single-row DP: row[j] holds the distance between a[:i] and b[:j].
    row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        next_row = [i]
        for j, char_b in enumerate(b, start=1):
            cost_insert = next_row[j - 1] + 1
            cost_delete = row[j] + 1
            cost_subst = row[j - 1] + (char_a != char_b)
            next_row.append(min(cost_insert, cost_delete, cost_subst))
        row = next_row
    return row[-1]
22
+
23
+
24
def normalized_similarity(a: str, b: str) -> float:
    """Case-insensitive similarity in [0, 1]; 1.0 means identical (or both empty)."""
    left = (a or "").lower()
    right = (b or "").lower()
    if not (left or right):
        return 1.0
    longest = max(len(left), len(right), 1)
    return 1.0 - levenshtein(left, right) / longest
31
+
32
+
33
def rank_fuzzy(query: str, docs: list[tuple[str, str]]) -> list[tuple[str, float]]:
    """Rank ``(doc_id, text)`` pairs by fuzzy similarity to *query*, best first."""
    return sorted(
        ((doc_id, normalized_similarity(query, text)) for doc_id, text in docs),
        key=lambda item: item[1],
        reverse=True,
    )
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from codespine.config import SETTINGS
4
+ from codespine.search.bm25 import rank_bm25
5
+ from codespine.search.fuzzy import rank_fuzzy
6
+ from codespine.search.rrf import reciprocal_rank_fusion
7
+ from codespine.search.vector import rank_semantic
8
+
9
+
10
def hybrid_search(store, query: str, k: int = 20) -> list[dict]:
    """Hybrid symbol search: BM25 + semantic + fuzzy rankings fused via RRF.

    Pulls a bounded candidate pool of symbols from the store, ranks it three
    ways, fuses the rankings, applies light heuristics (down-weight tests,
    up-weight methods/classes), and attaches community/flow context to the
    top ``k`` hits.
    """
    candidates = store.query_records(
        """
        MATCH (s:Symbol), (f:File {id: s.file_id})
        RETURN s.id as id,
               s.kind as kind,
               s.name as name,
               s.fqname as fqname,
               s.embedding as embedding,
               f.path as file_path,
               f.is_test as is_test
        LIMIT $lim
        """,
        {"lim": SETTINGS.semantic_candidate_pool},
    )

    if not candidates:
        return []

    lexical = [(row["id"], f"{row.get('name', '')} {row.get('fqname', '')}") for row in candidates]
    names_only = [(row["id"], row.get("name", "")) for row in candidates]
    embeddings = [(row["id"], row.get("embedding")) for row in candidates]

    # Ranking order matters for tie-breaking in the fused dict's insertion order.
    fused = reciprocal_rank_fusion(
        [rank_bm25(query, lexical), rank_semantic(query, embeddings), rank_fuzzy(query, names_only)],
        k=SETTINGS.rrf_k,
    )
    by_id = {row["id"]: row for row in candidates}

    hits = []
    # Consider 3x the requested size so boosts/penalties can reorder the tail.
    for doc_id, fused_score in fused[: max(k * 3, k)]:
        row = by_id.get(doc_id)
        if not row:
            continue

        # Heuristics: halve test symbols, boost methods/classes by 20%.
        weight = 1.0
        if row.get("is_test"):
            weight *= 0.5
        if row.get("kind") in {"method", "class"}:
            weight *= 1.2

        hits.append(
            {
                "id": doc_id,
                "kind": row.get("kind"),
                "name": row.get("name"),
                "fqname": row.get("fqname"),
                "file_path": row.get("file_path"),
                "score": fused_score * weight,
            }
        )

    hits.sort(key=lambda item: item["score"], reverse=True)

    # Attach architectural context in same response.
    for hit in hits[:k]:
        hit["context"] = store.query_records(
            """
            MATCH (s:Symbol {id: $sid})-[:IN_COMMUNITY]->(c:Community)
            OPTIONAL MATCH (s)-[f:IN_FLOW]->(fl:Flow)
            RETURN c.id as community_id, c.label as community_label,
                   fl.id as flow_id, fl.kind as flow_kind, f.depth as flow_depth
            LIMIT 3
            """,
            {"sid": hit["id"]},
        )

    return hits[:k]
@@ -0,0 +1,9 @@
1
+ from __future__ import annotations
2
+
3
+
4
def reciprocal_rank_fusion(rankings: list[list[tuple[str, float]]], k: int = 60) -> list[tuple[str, float]]:
    """Fuse several ``(doc_id, score)`` rankings with reciprocal rank fusion.

    Only rank positions matter — the input scores are ignored. Each list
    contributes ``1 / (k + rank)`` per doc; the fused list is best-first.
    """
    fused: dict[str, float] = {}
    for ranked_list in rankings:
        for position, (doc_id, _score) in enumerate(ranked_list, start=1):
            fused[doc_id] = fused.get(doc_id, 0.0) + 1.0 / (k + position)
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import math
5
+ import sqlite3
6
+ from functools import lru_cache
7
+
8
+ from codespine.config import SETTINGS
9
+
10
+
11
+ def _hash_vector(text: str, dim: int) -> list[float]:
12
+ """Deterministic fallback embedding when sentence-transformers is unavailable."""
13
+ vec = [0.0] * dim
14
+ if not text:
15
+ return vec
16
+ tokens = text.lower().split()
17
+ for token in tokens:
18
+ digest = hashlib.sha1(token.encode("utf-8")).digest()
19
+ idx = int.from_bytes(digest[:2], "big") % dim
20
+ sign = 1.0 if digest[2] % 2 == 0 else -1.0
21
+ vec[idx] += sign
22
+ norm = math.sqrt(sum(v * v for v in vec)) or 1.0
23
+ return [v / norm for v in vec]
24
+
25
+
26
+ @lru_cache(maxsize=1)
27
+ def _load_model():
28
+ try:
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ return SentenceTransformer(SETTINGS.embedding_model)
32
+ except Exception:
33
+ return None
34
+
35
+
36
@lru_cache(maxsize=1)
def _embedding_cache_conn():
    """Open the on-disk embedding cache once and ensure its table exists."""
    connection = sqlite3.connect(SETTINGS.embedding_cache_db)
    connection.execute(
        """
        CREATE TABLE IF NOT EXISTS embedding_cache (
            cache_key TEXT PRIMARY KEY,
            dim INTEGER NOT NULL,
            vector_json TEXT NOT NULL
        )
        """
    )
    # NOTE(review): sqlite3 connections are single-thread by default
    # (check_same_thread=True) — confirm all callers share one thread.
    return connection
49
+
50
+
51
def _cache_key(text: str, dim: int) -> str:
    """Stable cache key: sha1 over the model name, dimension, and raw text."""
    payload = f"{SETTINGS.embedding_model}|{dim}|{text}"
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()
53
+
54
+
55
def _get_cached_embedding(text: str, dim: int) -> list[float] | None:
    """Return the cached vector for ``(text, dim)``, or None on a cache miss."""
    import json

    row = _embedding_cache_conn().execute(
        "SELECT vector_json FROM embedding_cache WHERE cache_key = ? AND dim = ?",
        (_cache_key(text, dim), dim),
    ).fetchone()
    if row is None:
        return None
    return [float(component) for component in json.loads(row[0])]
64
+
65
+
66
def _set_cached_embedding(text: str, dim: int, vec: list[float]) -> None:
    """Upsert the vector for ``(text, dim)`` into the sqlite cache."""
    import json

    connection = _embedding_cache_conn()
    connection.execute(
        "INSERT OR REPLACE INTO embedding_cache(cache_key, dim, vector_json) VALUES (?, ?, ?)",
        (_cache_key(text, dim), dim, json.dumps(vec)),
    )
    connection.commit()
76
+
77
+
78
def embed_text(text: str, dim: int | None = None) -> list[float]:
    """Embed *text*, consulting the sqlite cache first.

    Uses the sentence-transformers model when available, otherwise the
    deterministic hash fallback. Results are cached under (model, dim, text).
    ``dim`` defaults to ``SETTINGS.vector_dim``.
    """
    dim = dim or SETTINGS.vector_dim
    normalized = text or ""

    cached = _get_cached_embedding(normalized, dim)
    if cached is not None:
        return cached

    model = _load_model()
    if model is None:
        # Offline / optional-dependency fallback.
        vec = _hash_vector(text, dim)
    else:
        # NOTE(review): the model's output dimension comes from the model, not
        # ``dim`` — presumably they agree via SETTINGS; confirm.
        vec = [float(x) for x in model.encode([normalized], normalize_embeddings=True)[0]]
    _set_cached_embedding(normalized, dim, vec)
    return vec
93
+
94
+
95
def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Cosine similarity over the overlapping prefix of the two vectors.

    Mismatched lengths are truncated to the shorter; empty input or an
    all-zero vector yields 0.0 (the `or 1.0` guards division by zero).
    """
    if not vec_a or not vec_b:
        return 0.0
    pairs = list(zip(vec_a, vec_b))
    dot = sum(x * y for x, y in pairs)
    norm_a = math.sqrt(sum(x * x for x, _ in pairs)) or 1.0
    norm_b = math.sqrt(sum(y * y for _, y in pairs)) or 1.0
    return dot / (norm_a * norm_b)
103
+
104
+
105
def rank_semantic(query: str, docs: list[tuple[str, list[float] | None]]) -> list[tuple[str, float]]:
    """Rank ``(doc_id, embedding)`` pairs by cosine similarity to *query*.

    Docs without an embedding (None) are dropped. Best first.
    """
    query_vec = embed_text(query)
    scored = [
        (doc_id, cosine_similarity(query_vec, embedding))
        for doc_id, embedding in docs
        if embedding is not None
    ]
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored
@@ -0,0 +1 @@
1
+ """Watch mode layer."""
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ from codespine.analysis.community import detect_communities
6
+ from codespine.analysis.coupling import compute_coupling
7
+ from codespine.analysis.deadcode import detect_dead_code
8
+ from codespine.analysis.flow import trace_execution_flows
9
+ from codespine.config import SETTINGS
10
+ from codespine.indexer.engine import JavaIndexer
11
+
12
+
13
def run_watch_mode(store, path: str, global_interval: int = SETTINGS.default_global_interval_s) -> None:
    """Watch *path* and incrementally re-index whenever ``.java`` files change.

    Global analyses (communities, dead code, flows, coupling) refresh at most
    once per ``global_interval`` seconds. Blocks forever.

    Raises:
        RuntimeError: when the optional ``watchfiles`` dependency is missing.
    """
    try:
        from watchfiles import watch
    except Exception as exc:  # pragma: no cover
        raise RuntimeError("watchfiles is required for watch mode") from exc

    indexer = JavaIndexer(store)
    # Start at 0.0 so the first batch of changes also triggers the global pass.
    last_global = 0.0
    print(f"Watching {path} for changes...")

    for changes in watch(path):
        java_files = [changed_path for _, changed_path in changes if changed_path.endswith(".java")]
        if not java_files:
            continue

        started = time.time()
        result = indexer.index_project(path, full=False)
        elapsed = time.time() - started
        print(f"[{time.strftime('%H:%M:%S')}] {len(java_files)} file(s) modified -> re-indexed ({elapsed:.1f}s)")

        if time.time() - last_global >= global_interval:
            detect_communities(store)
            detect_dead_code(store, limit=200)
            trace_execution_flows(store)
            compute_coupling(store, path, result.project_id)
            last_global = time.time()