codespine 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codespine/db/store.py ADDED
@@ -0,0 +1,313 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ from contextlib import contextmanager
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import kuzu
11
+
12
+ from codespine.config import SETTINGS
13
+ from codespine.db.schema import ensure_schema
14
+
15
+ LOGGER = logging.getLogger(__name__)
16
+
17
+
18
@dataclass
class GraphStore:
    """Kuzu-backed store for the code graph.

    Persists Project/File/Class/Method/Symbol nodes plus call, reference,
    community, flow, and co-change relationships.

    Attributes:
        read_only: when True, skip schema creation on connect.
    """

    read_only: bool = False

    def __post_init__(self) -> None:
        # 1 GiB buffer pool; database path comes from global settings.
        self.db = kuzu.Database(SETTINGS.db_path, buffer_pool_size=1024**3)
        self.conn = kuzu.Connection(self.db)
        if not self.read_only:
            ensure_schema(self.conn)

    @staticmethod
    def stable_id(*parts: str) -> str:
        """Deterministic id: sha1 hex digest of the '::'-joined parts.

        sha1 is used for stable identity only, not for security.
        """
        raw = "::".join(parts)
        return hashlib.sha1(raw.encode("utf-8")).hexdigest()

    def execute(self, query: str, params: dict[str, Any] | None = None):
        """Run a Cypher query with optional parameters and return the raw result."""
        return self.conn.execute(query, params or {})

    @contextmanager
    def transaction(self):
        """Best-effort transaction context.

        Commits on success and rolls back on error. If BEGIN fails (e.g. the
        backend does not support explicit transactions), the body still runs
        in autocommit mode.
        """
        tx_started = True
        try:
            self.execute("BEGIN TRANSACTION")
        except Exception:
            # Fix: record why we fell back to autocommit instead of silently
            # swallowing the failure.
            LOGGER.debug("BEGIN TRANSACTION failed; running without a transaction", exc_info=True)
            tx_started = False
        try:
            yield
            if tx_started:
                self.execute("COMMIT")
        except Exception:
            if tx_started:
                self.execute("ROLLBACK")
            raise

    def clear_project(self, project_id: str) -> None:
        """Delete a project's graph artifacts, keeping the Project node.

        Deletes bottom-up (symbols, methods, classes, files) so the parent
        joins still resolve while children are removed.
        """
        # Keep project node and rebuild attached graph artifacts.
        self.execute(
            """
            MATCH (s:Symbol), (f:File)
            WHERE s.file_id = f.id AND f.project_id = $pid
            DETACH DELETE s
            """,
            {"pid": project_id},
        )
        self.execute(
            """
            MATCH (m:Method), (c:Class), (f:File)
            WHERE m.class_id = c.id AND c.file_id = f.id AND f.project_id = $pid
            DETACH DELETE m
            """,
            {"pid": project_id},
        )
        self.execute(
            """
            MATCH (c:Class), (f:File)
            WHERE c.file_id = f.id AND f.project_id = $pid
            DETACH DELETE c
            """,
            {"pid": project_id},
        )
        self.execute(
            """
            MATCH (f:File) WHERE f.project_id = $pid
            DETACH DELETE f
            """,
            {"pid": project_id},
        )

    def upsert_project(self, project_id: str, path: str) -> None:
        """Create or update a Project node (language is currently fixed to java)."""
        self.execute(
            "MERGE (p:Project {id: $id}) SET p.path = $path, p.language = 'java'",
            {"id": project_id, "path": path},
        )

    def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
        """Return {file_id: {"path": ..., "hash": ...}} for a project's files."""
        recs = self.query_records(
            """
            MATCH (f:File)
            WHERE f.project_id = $pid
            RETURN f.id as id, f.path as path, f.hash as hash
            """,
            {"pid": project_id},
        )
        return {r["id"]: {"path": r.get("path", ""), "hash": r.get("hash", "")} for r in recs}

    def clear_file(self, file_id: str) -> None:
        """Delete one file and everything declared in it, bottom-up."""
        self.execute(
            """
            MATCH (s:Symbol) WHERE s.file_id = $fid
            DETACH DELETE s
            """,
            {"fid": file_id},
        )
        self.execute(
            """
            MATCH (m:Method), (c:Class)
            WHERE m.class_id = c.id AND c.file_id = $fid
            DETACH DELETE m
            """,
            {"fid": file_id},
        )
        self.execute(
            """
            MATCH (c:Class) WHERE c.file_id = $fid
            DETACH DELETE c
            """,
            {"fid": file_id},
        )
        self.execute(
            """
            MATCH (f:File {id: $fid})
            DETACH DELETE f
            """,
            {"fid": file_id},
        )

    def list_methods(self) -> list[dict[str, Any]]:
        """Return id, name, signature and class FQCN for every stored method."""
        return self.query_records(
            """
            MATCH (m:Method), (c:Class)
            WHERE m.class_id = c.id
            RETURN m.id as method_id, m.name as name, m.signature as signature, c.fqcn as class_fqcn
            """
        )

    def upsert_file(self, file_id: str, path: str, project_id: str, is_test: bool, digest: str) -> None:
        """Create or update a File node; `digest` is the file content hash."""
        self.execute(
            """
            MERGE (f:File {id: $id})
            SET f.path = $path, f.project_id = $project_id, f.is_test = $is_test, f.hash = $hash
            """,
            {
                "id": file_id,
                "path": path,
                "project_id": project_id,
                "is_test": is_test,
                "hash": digest,
            },
        )

    def upsert_class(self, class_id: str, fqcn: str, name: str, package: str, file_id: str) -> None:
        """Create or update a Class node attached (by file_id property) to a File."""
        self.execute(
            """
            MERGE (c:Class {id: $id})
            SET c.fqcn = $fqcn, c.name = $name, c.package = $package, c.file_id = $file_id
            """,
            {
                "id": class_id,
                "fqcn": fqcn,
                "name": name,
                "package": package,
                "file_id": file_id,
            },
        )

    def upsert_method(
        self,
        method_id: str,
        class_id: str,
        name: str,
        signature: str,
        return_type: str,
        modifiers: list[str],
        is_constructor: bool,
        is_test: bool,
    ) -> None:
        """Create or update a Method node and its HAS_METHOD edge from the class."""
        self.execute(
            """
            MERGE (m:Method {id: $id})
            SET m.class_id = $class_id,
                m.name = $name,
                m.signature = $signature,
                m.return_type = $return_type,
                m.modifiers = $modifiers,
                m.is_constructor = $is_constructor,
                m.is_test = $is_test
            """,
            {
                "id": method_id,
                "class_id": class_id,
                "name": name,
                "signature": signature,
                "return_type": return_type,
                "modifiers": modifiers,
                "is_constructor": is_constructor,
                "is_test": is_test,
            },
        )
        self.execute(
            "MATCH (c:Class {id: $cid}), (m:Method {id: $mid}) MERGE (c)-[:HAS_METHOD]->(m)",
            {"cid": class_id, "mid": method_id},
        )

    def upsert_symbol(
        self,
        symbol_id: str,
        kind: str,
        name: str,
        fqname: str,
        file_id: str,
        line: int,
        col: int,
        embedding: list[float] | None,
    ) -> None:
        """Create or update a Symbol node (optionally with an embedding) and its DECLARES edge."""
        self.execute(
            """
            MERGE (s:Symbol {id: $id})
            SET s.kind = $kind,
                s.name = $name,
                s.fqname = $fqname,
                s.file_id = $file_id,
                s.line = $line,
                s.col = $col,
                s.embedding = $embedding
            """,
            {
                "id": symbol_id,
                "kind": kind,
                "name": name,
                "fqname": fqname,
                "file_id": file_id,
                "line": line,
                "col": col,
                "embedding": embedding,
            },
        )
        self.execute(
            "MATCH (f:File {id: $fid}), (s:Symbol {id: $sid}) MERGE (f)-[:DECLARES]->(s)",
            {"fid": file_id, "sid": symbol_id},
        )

    def add_call(self, source_id: str, target_id: str, confidence: float, reason: str) -> None:
        """Record a CALLS edge between two methods with resolution metadata."""
        self.execute(
            """
            MATCH (source:Method {id: $source_id}), (target:Method {id: $target_id})
            MERGE (source)-[:CALLS {confidence: $confidence, reason: $reason}]->(target)
            """,
            {
                "source_id": source_id,
                "target_id": target_id,
                "confidence": confidence,
                "reason": reason,
            },
        )

    def add_reference(self, rel: str, src_label: str, src_id: str, dst_label: str, dst_id: str, confidence: float) -> None:
        """Record a typed reference edge between two nodes.

        `rel` is interpolated into the query, so only whitelisted relationship
        names are accepted — the guard doubles as injection protection.
        """
        if rel not in {"REFERENCES_TYPE", "IMPLEMENTS", "OVERRIDES"}:
            return
        query = (
            f"MATCH (s:{src_label} {{id: $src_id}}), (d:{dst_label} {{id: $dst_id}}) "
            f"MERGE (s)-[:{rel} {{confidence: $confidence}}]->(d)"
        )
        self.execute(query, {"src_id": src_id, "dst_id": dst_id, "confidence": confidence})

    def set_community(self, community_id: str, label: str, cohesion: float, symbol_ids: list[str]) -> None:
        """Create or update a Community node and link its member symbols."""
        self.execute(
            "MERGE (c:Community {id: $id}) SET c.label = $label, c.cohesion = $cohesion",
            {"id": community_id, "label": label, "cohesion": cohesion},
        )
        for sid in symbol_ids:
            self.execute(
                "MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
                {"sid": sid, "cid": community_id},
            )

    def set_flow(self, flow_id: str, entry_symbol_id: str, kind: str, symbols_at_depth: list[tuple[str, int]]) -> None:
        """Create or update a Flow node and link symbols with their call depth."""
        self.execute(
            "MERGE (f:Flow {id: $id}) SET f.entry_symbol_id = $entry, f.kind = $kind",
            {"id": flow_id, "entry": entry_symbol_id, "kind": kind},
        )
        for sid, depth in symbols_at_depth:
            self.execute(
                "MATCH (s:Symbol {id: $sid}), (f:Flow {id: $fid}) MERGE (s)-[:IN_FLOW {depth: $depth}]->(f)",
                {"sid": sid, "fid": flow_id, "depth": int(depth)},
            )

    def upsert_coupling(self, file_a: str, file_b: str, strength: float, cochanges: int, months: int) -> None:
        """Record a CO_CHANGED_WITH edge carrying git co-change statistics."""
        self.execute(
            """
            MATCH (a:File {id: $a}), (b:File {id: $b})
            MERGE (a)-[:CO_CHANGED_WITH {strength: $strength, cochanges: $cochanges, months: $months}]->(b)
            """,
            {
                "a": file_a,
                "b": file_b,
                "strength": strength,
                "cochanges": int(cochanges),
                "months": int(months),
            },
        )

    def query_records(self, query: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]:
        """Run a query and return rows as plain dicts (empty result -> []).

        The pandas -> JSON -> dict round-trip normalizes backend types
        (e.g. NaN becomes None) so callers get JSON-safe values.
        """
        frame = self.execute(query, params or {}).get_as_df()
        if frame.empty:
            return []
        return json.loads(frame.to_json(orient="records"))
@@ -0,0 +1 @@
1
+ """Branch diff layer."""
@@ -0,0 +1,163 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import os
6
+ import re
7
+ import shutil
8
+ import subprocess
9
+ import tempfile
10
+
11
+ import tree_sitter_java as tsjava
12
+ from tree_sitter import Language, Parser, Query
13
+
14
+ from codespine.indexer.java_parser import parse_java_source
15
+
16
+ JAVA_LANGUAGE = Language(tsjava.language())
17
+ PARSER = Parser(JAVA_LANGUAGE)
18
+
19
+
20
+ def _text(node) -> str:
21
+ return node.text.decode("utf-8")
22
+
23
+
24
+ def _hash_text(text: str) -> str:
25
+ return hashlib.sha1(_normalize_java_snippet(text).encode("utf-8")).hexdigest()
26
+
27
+
28
+ def _normalize_java_snippet(text: str) -> str:
29
+ """Normalize formatting/comments so branch diff emphasizes semantic edits."""
30
+ text = re.sub(r"/\*.*?\*/", "", text, flags=re.DOTALL)
31
+ text = re.sub(r"//.*?$", "", text, flags=re.MULTILINE)
32
+ text = re.sub(r"\s+", " ", text).strip()
33
+ return text
34
+
35
+
36
def _method_hashes(source: bytes) -> dict[str, dict]:
    """Map "name(params)" signatures to normalized hashes and line spans.

    Covers both method and constructor declarations found in *source*.
    """
    root = PARSER.parse(source).root_node
    query = Query(
        JAVA_LANGUAGE,
        """
        [
        (method_declaration
            name: (identifier) @name
            parameters: (formal_parameters) @params) @decl
        (constructor_declaration
            name: (identifier) @name
            parameters: (formal_parameters) @params) @decl
        ]
        """,
    )
    # Group name/params captures under their owning declaration node.
    # NOTE(review): assumes Query.captures yields (node, capture_name) pairs
    # (pre-0.22 py-tree-sitter API) — confirm against the pinned version.
    by_decl: dict[object, dict[str, str]] = {}
    for captured, capture_name in query.captures(root):
        decl_node = captured if capture_name == "decl" else captured.parent
        by_decl.setdefault(decl_node, {})[capture_name] = _text(captured)

    result: dict[str, dict] = {}
    for decl_node, parts in by_decl.items():
        method_name = parts.get("name")
        if not method_name:
            continue
        param_list = parts.get("params", "()")
        result[f"{method_name}{param_list}"] = {
            "hash": _hash_text(_text(decl_node)),
            "line_start": decl_node.start_point[0] + 1,
            "line_end": decl_node.end_point[0] + 1,
        }
    return result
70
+
71
+
72
def _class_hashes(source: bytes) -> dict[str, str]:
    """Map simple class names to normalized body hashes for *source*."""
    root = PARSER.parse(source).root_node
    query = Query(
        JAVA_LANGUAGE,
        """
        (class_declaration
            name: (identifier) @name) @decl
        """,
    )
    # Group name captures under their owning class declaration node.
    # NOTE(review): assumes the pair-yielding Query.captures API, as elsewhere.
    by_decl: dict[object, dict[str, str]] = {}
    for captured, capture_name in query.captures(root):
        decl_node = captured if capture_name == "decl" else captured.parent
        by_decl.setdefault(decl_node, {})[capture_name] = _text(captured)

    hashes: dict[str, str] = {}
    for decl_node, parts in by_decl.items():
        class_name = parts.get("name")
        if class_name:
            hashes[class_name] = _hash_text(_text(decl_node))
    return hashes
92
+
93
+
94
_SKIP_DIR_NAMES = {".git", "target", "build", "out"}


def _symbol_manifest(repo_path: str) -> dict[str, dict]:
    """Build a manifest of class/method symbols under *repo_path*.

    Keys are "class:<fqcn>" and "method:<fqcn>#<signature>"; values carry the
    relative file path, normalized body hash, and line info used by the
    branch differ.
    """
    manifest: dict[str, dict] = {}
    for root, dirs, files in os.walk(repo_path):
        # Fix: test skip names against actual path components — the previous
        # substring check ("out" in root) also skipped unrelated paths such
        # as ".../routes" or ".../layout".
        if _SKIP_DIR_NAMES.intersection(root.split(os.sep)):
            continue
        # Prune skipped directories in place so os.walk never descends there.
        dirs[:] = [d for d in dirs if d not in _SKIP_DIR_NAMES]
        for f in files:
            if not f.endswith(".java"):
                continue
            path = os.path.join(root, f)
            rel = os.path.relpath(path, repo_path)
            with open(path, "rb") as fp:
                source = fp.read()
            parsed = parse_java_source(source)
            method_hashes = _method_hashes(source)
            class_hashes = _class_hashes(source)
            for cls in parsed.classes:
                cls_key = f"class:{cls.fqcn}"
                manifest[cls_key] = {
                    "kind": "Class",
                    "file": rel,
                    "name": cls.fqcn,
                    # Prefer the tree-sitter hash; fall back to the parser's.
                    "hash": class_hashes.get(cls.name, cls.body_hash),
                    "line_start": cls.line,
                }
                for m in cls.methods:
                    m_key = f"method:{cls.fqcn}#{m.signature}"
                    # Look up span info by "name(types)" first, then by the
                    # parser's own signature string.
                    mh = method_hashes.get(f"{m.name}({','.join(m.parameter_types)})") or method_hashes.get(m.signature) or {}
                    manifest[m_key] = {
                        "kind": "Method",
                        "file": rel,
                        "name": m.signature,
                        "class": cls.fqcn,
                        "hash": m.body_hash or mh.get("hash"),
                        "line_start": mh.get("line_start", m.line),
                        "line_end": mh.get("line_end", m.line),
                    }
    return manifest
131
+
132
+
133
+ def compare_branches(repo_path: str, base_ref: str, head_ref: str) -> dict:
134
+ temp_dir = tempfile.mkdtemp(prefix="codespine-diff-")
135
+ base_dir = os.path.join(temp_dir, "base")
136
+ head_dir = os.path.join(temp_dir, "head")
137
+
138
+ try:
139
+ subprocess.run(["git", "-C", repo_path, "worktree", "add", "--detach", base_dir, base_ref], check=True, capture_output=True)
140
+ subprocess.run(["git", "-C", repo_path, "worktree", "add", "--detach", head_dir, head_ref], check=True, capture_output=True)
141
+
142
+ base_manifest = _symbol_manifest(base_dir)
143
+ head_manifest = _symbol_manifest(head_dir)
144
+
145
+ added = sorted(set(head_manifest) - set(base_manifest))
146
+ removed = sorted(set(base_manifest) - set(head_manifest))
147
+
148
+ modified = []
149
+ for key in sorted(set(base_manifest) & set(head_manifest)):
150
+ if json.dumps(base_manifest[key], sort_keys=True) != json.dumps(head_manifest[key], sort_keys=True):
151
+ modified.append(key)
152
+
153
+ return {
154
+ "base": base_ref,
155
+ "head": head_ref,
156
+ "added": [head_manifest[k] for k in added],
157
+ "removed": [base_manifest[k] for k in removed],
158
+ "modified": [head_manifest[k] for k in modified],
159
+ }
160
+ finally:
161
+ subprocess.run(["git", "-C", repo_path, "worktree", "remove", "--force", base_dir], check=False, capture_output=True)
162
+ subprocess.run(["git", "-C", repo_path, "worktree", "remove", "--force", head_dir], check=False, capture_output=True)
163
+ shutil.rmtree(temp_dir, ignore_errors=True)
@@ -0,0 +1 @@
1
+ """Indexing layer."""
@@ -0,0 +1,137 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+
5
+ from codespine.noise.blocklist import NOISE_METHOD_NAMES
6
+
7
+
8
+ def _simple_type_name(type_name: str | None) -> str:
9
+ if not type_name:
10
+ return ""
11
+ base = type_name.strip().replace("[]", "")
12
+ return base.split(".")[-1]
13
+
14
+
15
+ def _resolve_type_candidates(type_name: str | None, context: dict, class_catalog: dict[str, list[str]]) -> list[str]:
16
+ """Best-effort type resolution using fqcn/simple-name, imports, and package."""
17
+ if not type_name:
18
+ return []
19
+ resolved: list[str] = []
20
+ raw = type_name.strip()
21
+ simple = _simple_type_name(raw)
22
+
23
+ # Direct FQCN hint.
24
+ if "." in raw:
25
+ resolved.append(raw)
26
+
27
+ # Imported types.
28
+ imports = context.get("imports", []) or []
29
+ for imp in imports:
30
+ if imp.endswith(f".{simple}"):
31
+ resolved.append(imp)
32
+
33
+ # Same package fallback.
34
+ pkg = context.get("package", "")
35
+ if pkg:
36
+ resolved.append(f"{pkg}.{simple}")
37
+
38
+ # Indexed type matches by simple class name.
39
+ resolved.extend(class_catalog.get(simple, []))
40
+
41
+ # Stable unique order.
42
+ uniq: list[str] = []
43
+ seen = set()
44
+ for item in resolved:
45
+ if item and item not in seen:
46
+ uniq.append(item)
47
+ seen.add(item)
48
+ return uniq
49
+
50
+
51
def resolve_calls(
    method_catalog: dict[str, dict],
    calls: dict[str, list],
    method_context: dict[str, dict],
    class_catalog: dict[str, list[str]],
) -> list[tuple[str, str, float, str]]:
    """Resolve call names to known method ids.

    Returns tuples: (source_method_id, target_method_id, confidence, reason)

    Resolution ladder (first rung that yields targets wins):
      1. receiver-typed lookup ("this", a local variable, a field, or the raw
         receiver treated as a type name, e.g. a static call);
      2. same-class lookup by (name, arity);
      3. (name, arity) lookup preferring same-package methods, else global.
    Confidence encodes the rung: 1.0 exact, 0.8 receiver-typed, 0.5 ambiguous.
    """
    # (name, arity) -> every method id with that shape, across all classes.
    name_arity_to_method_ids: dict[tuple[str, int], list[str]] = defaultdict(list)
    # class fqcn -> (name, arity) -> method ids declared in that class.
    class_method_index: dict[str, dict[tuple[str, int], list[str]]] = defaultdict(lambda: defaultdict(list))
    for method_id, meta in method_catalog.items():
        key = (meta["name"], int(meta["param_count"]))
        name_arity_to_method_ids[key].append(method_id)
        class_method_index[meta["class_fqcn"]][key].append(method_id)

    edges: list[tuple[str, str, float, str]] = []
    for source_id, call_sites in calls.items():
        src_meta = method_catalog.get(source_id, {})
        src_ctx = method_context.get(source_id, {})
        src_class = src_meta.get("class_fqcn", "")
        # `or {}` guards against contexts that store None for these maps.
        local_types = src_ctx.get("local_types", {}) or {}
        field_types = src_ctx.get("field_types", {}) or {}

        for call in call_sites:
            call_name = call.name
            # Skip ubiquitous names (getters, toString, ...) that would
            # produce low-value edges.
            if call_name in NOISE_METHOD_NAMES:
                continue

            key = (call_name, int(call.arg_count))
            targets: list[str] = []
            confidence = 0.5
            reason = "fuzzy_name_ambiguous"

            # Rung 1: resolve through the receiver's declared type, if any.
            receiver = (call.receiver or "").strip() if getattr(call, "receiver", None) else ""
            if receiver:
                receiver_type = None
                receiver_is_this = False
                if receiver == "this":
                    receiver_type = src_class
                    receiver_is_this = True
                elif receiver in local_types:
                    receiver_type = local_types[receiver]
                elif receiver in field_types:
                    receiver_type = field_types[receiver]
                else:
                    # Unknown identifier: treat the receiver itself as a type
                    # name (covers static calls like Foo.bar()).
                    receiver_type = receiver

                receiver_fqcn_candidates = _resolve_type_candidates(receiver_type, src_ctx, class_catalog)

                for fqcn in receiver_fqcn_candidates:
                    targets.extend(class_method_index.get(fqcn, {}).get(key, []))

                if targets:
                    confidence = 1.0 if receiver_is_this else 0.8
                    reason = "receiver_this_exact" if receiver_is_this else "receiver_method_match"

            # Rung 2: unqualified call resolved within the source class.
            if not targets:
                in_class = class_method_index.get(src_class, {}).get(key, [])
                if in_class:
                    targets = in_class
                    confidence = 1.0
                    reason = "intra_class_exact"

            # Rung 3: fall back to (name, arity) lookup.
            if not targets:
                # Prefer same-package candidates before global fallback.
                src_pkg = src_ctx.get("package", "")
                same_pkg = []
                for mid in name_arity_to_method_ids.get(key, []):
                    fqcn = method_catalog.get(mid, {}).get("class_fqcn", "")
                    if src_pkg and fqcn.startswith(f"{src_pkg}."):
                        same_pkg.append(mid)
                targets = same_pkg or name_arity_to_method_ids.get(key, [])
                if len(targets) == 1:
                    confidence = 1.0
                    reason = "exact_name_arity_unique"
                elif len(targets) > 1:
                    confidence = 0.5
                    reason = "fuzzy_name_arity_ambiguous"

            if not targets:
                continue
            # Fan out one edge per candidate; ambiguity is conveyed through
            # the shared confidence/reason rather than by dropping edges.
            for target_id in targets:
                edges.append((source_id, target_id, confidence, reason))

    return edges