coderay 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay/__init__.py +1 -0
- coderay/chunking/__init__.py +0 -0
- coderay/chunking/chunker.py +127 -0
- coderay/chunking/registry.py +190 -0
- coderay/cli/__init__.py +3 -0
- coderay/cli/commands.py +475 -0
- coderay/core/__init__.py +0 -0
- coderay/core/config.py +73 -0
- coderay/core/lock.py +36 -0
- coderay/core/models.py +71 -0
- coderay/core/timing.py +45 -0
- coderay/core/utils.py +35 -0
- coderay/embedding/__init__.py +0 -0
- coderay/embedding/base.py +60 -0
- coderay/embedding/local.py +68 -0
- coderay/embedding/openai.py +87 -0
- coderay/graph/__init__.py +19 -0
- coderay/graph/builder.py +128 -0
- coderay/graph/code_graph.py +311 -0
- coderay/graph/extractor.py +315 -0
- coderay/mcp_server/__init__.py +0 -0
- coderay/mcp_server/server.py +178 -0
- coderay/pipeline/__init__.py +0 -0
- coderay/pipeline/indexer.py +417 -0
- coderay/pipeline/watcher.py +318 -0
- coderay/retrieval/__init__.py +3 -0
- coderay/retrieval/boosting.py +80 -0
- coderay/retrieval/search.py +121 -0
- coderay/skeleton/__init__.py +0 -0
- coderay/skeleton/extractor.py +140 -0
- coderay/state/__init__.py +8 -0
- coderay/state/machine.py +242 -0
- coderay/state/version.py +47 -0
- coderay/storage/__init__.py +0 -0
- coderay/storage/lancedb.py +268 -0
- coderay/vcs/__init__.py +0 -0
- coderay/vcs/git.py +193 -0
- coderay-1.0.0.dist-info/METADATA +145 -0
- coderay-1.0.0.dist-info/RECORD +42 -0
- coderay-1.0.0.dist-info/WHEEL +5 -0
- coderay-1.0.0.dist-info/entry_points.txt +3 -0
- coderay-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import networkx as nx
|
|
8
|
+
|
|
9
|
+
from coderay.chunking.registry import (
|
|
10
|
+
get_init_filenames,
|
|
11
|
+
get_resolution_suffixes,
|
|
12
|
+
get_supported_extensions,
|
|
13
|
+
)
|
|
14
|
+
from coderay.core.models import EdgeKind, GraphEdge, GraphNode, NodeKind
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Registry-derived lookup sets. Both start out empty and are filled in on
# first use by ``_ensure_registry_cache``.
_KNOWN_EXTENSIONS: frozenset[str] = frozenset()
_KNOWN_INIT_FILENAMES: frozenset[str] = frozenset()


def _ensure_registry_cache() -> None:
    """Fill the module-level extension / init-filename caches if needed.

    The registry is queried only while ``_KNOWN_EXTENSIONS`` is still empty,
    so once a call has produced at least one extension every later call is
    a no-op.
    """
    global _KNOWN_EXTENSIONS, _KNOWN_INIT_FILENAMES  # noqa: PLW0603
    if _KNOWN_EXTENSIONS:
        return
    _KNOWN_EXTENSIONS = frozenset(get_supported_extensions())
    _KNOWN_INIT_FILENAMES = frozenset(get_init_filenames())
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _file_path_to_module_names(file_path: str) -> list[str]:
    """List every module name under which ``file_path`` may be imported.

    The path loses its registered file extension, a leading ``src``
    directory and a trailing init-style filename. Each trailing run of the
    remaining segments is then emitted in dotted form and, when it spans
    more than one segment, in slash-separated form as well. Assuming ``.py``
    is a registered extension, ``src/core/models.py`` yields
    ``["core.models", "core/models", "models"]``.

    Returns:
        Candidate names ordered from most to least qualified; an empty list
        when no path segment is left after stripping.
    """
    _ensure_registry_cache()

    # Try the longest extensions first so a compound extension is preferred
    # over a shorter extension that happens to be its tail.
    stem = file_path
    by_length = sorted(_KNOWN_EXTENSIONS, key=len, reverse=True)
    matched = next((ext for ext in by_length if stem.endswith(ext)), None)
    if matched is not None:
        stem = stem[: -len(matched)]

    segments = stem.replace("\\", "/").split("/")

    # Drop the conventional "src" layout directory
    if segments and segments[0] == "src":
        segments = segments[1:]

    # Drop init-style filenames (Python __init__, JS/TS index)
    if segments and segments[-1] in _KNOWN_INIT_FILENAMES:
        segments = segments[:-1]

    names: list[str] = []
    for start, _ in enumerate(segments):
        tail = segments[start:]
        dotted, slashed = ".".join(tail), "/".join(tail)
        names.append(dotted)
        if dotted != slashed:
            names.append(slashed)
    return names
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class CodeGraph:
    """In-memory directed graph of code relationships.

    Nodes and edges live in a ``networkx.DiGraph``. A *real* node carries its
    :class:`GraphNode` under the ``"data"`` node attribute. An edge endpoint
    that was never passed to :meth:`add_node` has no such attribute and is
    treated as a *phantom*, i.e. a reference that has not been resolved.
    """

    def __init__(self) -> None:
        """Create an empty graph with empty secondary indexes."""
        self._g: nx.DiGraph = nx.DiGraph()

        # short name -> {full node IDs} — enables resolving bare names like
        # "foo" to "src/utils.py::foo". Multiple IDs when the name is ambiguous
        # (e.g. two files both define a function called "helper").
        self._symbol_index: dict[str, set[str]] = defaultdict(set)

        # dotted module name -> node ID — maps Python-style import paths
        # (e.g. "core.models") to the MODULE node they refer to.
        self._module_index: dict[str, str] = {}

        # file path -> {node IDs} — all nodes belonging to a file, for O(k)
        # file removal instead of scanning the entire graph.
        self._file_index: dict[str, set[str]] = defaultdict(set)

    def _index_node(self, node: GraphNode) -> None:
        """Register a node in all secondary indexes."""
        self._symbol_index[node.name].add(node.id)
        self._file_index[node.file_path].add(node.id)
        if node.kind == NodeKind.MODULE:
            # Register all suffix variants so that "import models" and
            # "import core.models" both resolve to the same MODULE node.
            # The first module to claim a name keeps it.
            for mod_name in _file_path_to_module_names(node.file_path):
                if mod_name not in self._module_index:
                    self._module_index[mod_name] = node.id

    def _unindex_node(self, node: GraphNode) -> None:
        """Remove a node from all secondary indexes."""
        sym_entries = self._symbol_index.get(node.name)
        if sym_entries is not None:
            sym_entries.discard(node.id)
            if not sym_entries:
                # Drop emptied buckets so names that no longer exist do not
                # pile up in the index across re-indexing runs.
                del self._symbol_index[node.name]
        file_entries = self._file_index.get(node.file_path)
        if file_entries is not None:
            file_entries.discard(node.id)
        if node.kind == NodeKind.MODULE:
            for mod_name in _file_path_to_module_names(node.file_path):
                # Only release names this node actually owns
                if self._module_index.get(mod_name) == node.id:
                    del self._module_index[mod_name]

    def add_node(self, node: GraphNode) -> None:
        """Insert a node into the graph and update all secondary indexes."""
        self._g.add_node(node.id, data=node)
        self._index_node(node)

    def add_edge(self, edge: GraphEdge) -> None:
        """Insert a directed edge, storing its kind as the ``kind`` attribute."""
        self._g.add_edge(edge.source, edge.target, kind=edge.kind)

    def add_nodes_and_edges(
        self, nodes: list[GraphNode], edges: list[GraphEdge]
    ) -> None:
        """Bulk-insert nodes then edges (order matters — nodes first)."""
        for n in nodes:
            self.add_node(n)
        for e in edges:
            self.add_edge(e)

    def remove_file(self, file_path: str) -> int:
        """Remove all nodes belonging to a file and clean up every index.

        Returns:
            Number of nodes removed.
        """
        # Popping first detaches the bucket, so _unindex_node cannot mutate
        # the set while it is being iterated.
        to_remove = self._file_index.pop(file_path, set())
        for nid in to_remove:
            node: GraphNode | None = self._g.nodes[nid].get("data")
            if node:
                self._unindex_node(node)
            # NetworkX also removes all edges touching this node
            self._g.remove_node(nid)
        return len(to_remove)

    def resolve_symbol(self, name: str, caller_file: str | None = None) -> str | None:
        """Resolve a short/bare name to a fully-qualified node ID.

        Args:
            name: Either a full node ID or a bare symbol name.
            caller_file: File the reference originates from. Accepted for
                callers' convenience but not consulted by the current
                implementation.

        Returns:
            Full node ID, or None if the name cannot be uniquely resolved.
        """
        # Already a full node ID (e.g. "src/a.py::foo") — fast path
        if name in self._g and self._g.nodes[name].get("data") is not None:
            return name
        candidates = self._symbol_index.get(name, set())
        if len(candidates) == 1:
            return next(iter(candidates))
        return None

    def resolve_edges(self) -> int:
        """Rewire phantom edge targets to real node IDs.

        Only CALLS, INHERITS and IMPORTS edges are considered. IMPORTS edges
        additionally fall back to the module index and then to path-style
        matching when the symbol index cannot resolve the target.

        Returns:
            Number of edges successfully resolved.
        """
        resolvable = {EdgeKind.CALLS, EdgeKind.INHERITS, EdgeKind.IMPORTS}
        to_remove: list[tuple[str, str]] = []
        to_add: list[tuple[str, str, dict]] = []

        for u, v, data in self._g.edges(data=True):
            kind = data.get("kind")
            if kind not in resolvable:
                continue
            # Target is already a real node — nothing to resolve
            if v in self._g and self._g.nodes[v].get("data") is not None:
                continue

            # Extract the caller's file path from its node ID
            # ("src/a.py::MyClass.method" -> "src/a.py"); an ID without
            # "::" is returned unchanged by the split.
            caller_file = u.split("::")[0]
            resolved = self.resolve_symbol(v, caller_file=caller_file)
            if not resolved and kind == EdgeKind.IMPORTS:
                resolved = self._module_index.get(v) or self._resolve_path_target(v)
            if resolved and resolved != v:
                to_remove.append((u, v))
                to_add.append((u, resolved, dict(data)))

        # Apply changes in a second pass (safe to mutate now)
        for u, v in to_remove:
            if self._g.has_edge(u, v):
                self._g.remove_edge(u, v)
        for u, v, data in to_add:
            self._g.add_edge(u, v, **data)

        if to_add:
            logger.info("Resolved %d edges via symbol/module index", len(to_add))
        return len(to_add)

    def _resolve_path_target(self, target: str) -> str | None:
        """Try to match a path-style target to an existing MODULE node."""
        if "/" not in target:
            return None
        # First look for a real node whose ID is the target plus a known suffix
        for suffix in get_resolution_suffixes():
            candidate = target + suffix
            node_data = self._g.nodes.get(candidate, {})
            if node_data and node_data.get("data") is not None:
                return candidate
        # Otherwise fall back to the dotted module index, ignoring "src/"
        cleaned = target
        if cleaned.startswith("src/"):
            cleaned = cleaned[4:]
        dotted = cleaned.replace("/", ".")
        return self._module_index.get(dotted)

    def get_node(self, node_id: str) -> GraphNode | None:
        """Look up a node by its full ID. Returns None for phantoms or missing nodes."""
        data = self._g.nodes.get(node_id)
        if data:
            return data.get("data")
        return None

    # ------------------------------------------------------------------
    # Public query methods
    # ------------------------------------------------------------------

    def get_impact_radius(self, symbol: str, depth: int = 2) -> list[GraphNode]:
        """Find all nodes that could be affected if ``symbol`` changes.

        Args:
            symbol: Full node ID or bare name of the changed symbol.
            depth: Number of reverse-BFS hops. Higher values may return
                very large sets.

        Returns:
            The real nodes reachable within ``depth`` hops along incoming
            edges, in no particular order. Empty when ``symbol`` is not
            present in the graph at all.
        """
        resolved = self.resolve_symbol(symbol) or symbol
        # DiGraph.predecessors raises NetworkXError for an unknown node, so
        # a symbol the graph has never seen must be answered up front.
        if resolved not in self._g:
            return []

        visited: set[str] = set()
        frontier = {resolved}
        for _ in range(depth):
            next_frontier: set[str] = set()
            for nid in frontier:
                # predecessors = nodes that have an edge pointing TO nid
                for pred in self._g.predecessors(nid):
                    if pred not in visited:
                        visited.add(pred)
                        next_frontier.add(pred)
            if not next_frontier:
                break
            frontier = next_frontier

        # Phantom predecessors have no GraphNode and are left out
        found = (self.get_node(nid) for nid in visited)
        return [node for node in found if node is not None]

    @property
    def node_count(self) -> int:
        """Total nodes in the graph (including phantoms)."""
        return self._g.number_of_nodes()

    @property
    def edge_count(self) -> int:
        """Total edges in the graph."""
        return self._g.number_of_edges()

    # ------------------------------------------------------------------
    # Serialisation
    #
    # The graph is persisted as JSON (graph.json). Only real nodes are
    # serialised — phantom nodes are recreated implicitly when edges
    # referencing them are re-added. Secondary indexes are rebuilt by
    # ``add_node`` during ``from_dict``.
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Serialise the graph to a JSON-compatible dict."""
        nodes = []
        for nid, data in self._g.nodes(data=True):
            gn: GraphNode | None = data.get("data") if data else None
            if gn:
                nodes.append(
                    {
                        "id": gn.id,
                        "kind": gn.kind.value,
                        "file_path": gn.file_path,
                        "start_line": gn.start_line,
                        "end_line": gn.end_line,
                        "name": gn.name,
                        "qualified_name": gn.qualified_name,
                    }
                )
        edges_list = []
        for u, v, data in self._g.edges(data=True):
            kind = data.get("kind", "")
            # Enum members are stored by value; anything else is stringified
            kind_val = kind.value if hasattr(kind, "value") else str(kind)
            edges_list.append({"source": u, "target": v, "kind": kind_val})
        return {"nodes": nodes, "edges": edges_list}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CodeGraph:
        """Deserialise a graph from a dict produced by ``to_dict``."""
        graph = cls()
        for nd in data.get("nodes", []):
            graph.add_node(
                GraphNode(
                    id=nd["id"],
                    kind=NodeKind(nd["kind"]),
                    file_path=nd["file_path"],
                    start_line=nd["start_line"],
                    end_line=nd["end_line"],
                    name=nd["name"],
                    qualified_name=nd["qualified_name"],
                )
            )
        for ed in data.get("edges", []):
            graph.add_edge(
                GraphEdge(
                    source=ed["source"],
                    target=ed["target"],
                    kind=EdgeKind(ed["kind"]),
                )
            )
        return graph
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import builtins
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from coderay.chunking.registry import LanguageConfig, get_language_for_file
|
|
8
|
+
from coderay.core.models import EdgeKind, GraphEdge, GraphNode, NodeKind
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
# Every public name exposed by the ``builtins`` module (names starting with
# an underscore are left out).
_PYTHON_BUILTINS: frozenset[str] = frozenset(
    name for name in dir(builtins) if not name.startswith("_")
)


def _as_name_set(value: Any) -> set[str]:
    """Normalise a config value (None, one name, or several names) to a set."""
    if not value:
        return set()
    if isinstance(value, str):
        # set("log") would split the name into single characters
        return {value}
    return set(value)


def build_callee_filter(config: dict[str, Any] | None = None) -> frozenset[str]:
    """Build the callee exclusion set from builtins + user config.

    Args:
        config: Full application config dict. Only the ``graph`` section
            is read. If None, defaults are used. Within that section,
            ``exclude_callees`` adds names to the exclusion set and
            ``include_callees`` removes names from it; each may be a
            collection of names or a single name given as a string.

    Returns:
        Frozen set of callee names to exclude from CALLS edges.
    """
    graph_cfg = (config or {}).get("graph") or {}
    extra_excludes = _as_name_set(graph_cfg.get("exclude_callees"))
    force_includes = _as_name_set(graph_cfg.get("include_callees"))
    # Includes are subtracted last, so they win over both builtins and
    # user-supplied excludes.
    return frozenset((_PYTHON_BUILTINS | extra_excludes) - force_includes)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _resolve_relative_import(source_file: str, relative_target: str) -> str | None:
|
|
34
|
+
"""Resolve a Python relative import to a path-based target.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
source_file: Path of the file containing the import.
|
|
38
|
+
relative_target: Dotted import string starting with one or more dots.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
Slash-separated path (no extension), or None if dots exceed
|
|
42
|
+
the directory depth.
|
|
43
|
+
"""
|
|
44
|
+
dots = len(relative_target) - len(relative_target.lstrip("."))
|
|
45
|
+
rest = relative_target[dots:]
|
|
46
|
+
|
|
47
|
+
parts = source_file.replace("\\", "/").split("/")
|
|
48
|
+
dir_parts = parts[:-1]
|
|
49
|
+
|
|
50
|
+
levels_up = max(dots - 1, 0)
|
|
51
|
+
if levels_up > len(dir_parts):
|
|
52
|
+
return None
|
|
53
|
+
if levels_up > 0:
|
|
54
|
+
dir_parts = dir_parts[:-levels_up]
|
|
55
|
+
|
|
56
|
+
if rest:
|
|
57
|
+
dir_parts.extend(rest.split("."))
|
|
58
|
+
|
|
59
|
+
return "/".join(dir_parts) if dir_parts else None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _extract_callee_name(text: str) -> str:
|
|
63
|
+
"""Extract the final method/function name from a call expression."""
|
|
64
|
+
cleaned = text
|
|
65
|
+
if cleaned.startswith("self."):
|
|
66
|
+
cleaned = cleaned[5:]
|
|
67
|
+
elif cleaned.startswith("this."):
|
|
68
|
+
cleaned = cleaned[5:]
|
|
69
|
+
parts = cleaned.split(".")
|
|
70
|
+
return parts[-1] if parts else cleaned
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class GraphExtractor:
    """Extract graph nodes and edges from source files.

    Per-file state (source bytes, path, language config and the node/edge
    accumulators) is kept on the instance and reset at the start of every
    :meth:`extract_from_file` call, so one extractor can process many files
    in sequence.
    """

    # Child node types that hold a class's base classes
    _BASE_LIST_TYPES = ("argument_list", "superclass", "extends_clause")
    # Node types inside a base list that name a base class
    _BASE_NAME_TYPES = ("identifier", "dotted_name", "attribute", "type_identifier")
    # Node types accepted as the name of a definition
    _IDENTIFIER_TYPES = ("identifier", "type_identifier", "field_identifier")

    def __init__(self, config: dict[str, Any] | None = None) -> None:
        """Initialize the extractor with optional config overrides."""
        self._excluded_callees = build_callee_filter(config)
        self._source_bytes: bytes = b""
        self._file_path: str = ""
        self._module_id: str = ""
        self._lang_cfg: LanguageConfig | None = None
        self._nodes: list[GraphNode] = []
        self._edges: list[GraphEdge] = []

    def extract_from_file(
        self,
        file_path: str,
        content: str,
    ) -> tuple[list[GraphNode], list[GraphEdge]]:
        """Parse a source file and extract all graph nodes and edges.

        The result always starts with a MODULE node whose ID, name and
        qualified name are the file path itself.

        Returns:
            Tuple of (nodes, edges). Returns ``([], [])`` if the language
            is unsupported or its parser cannot be obtained.
        """
        lang_cfg = get_language_for_file(file_path)
        if lang_cfg is None:
            return [], []

        try:
            parser = lang_cfg.get_parser()
        except Exception:
            # Best effort: a file without a usable parser contributes
            # nothing, but leave a trace so the gap can be diagnosed.
            logger.debug(
                "No parser available for %s; skipping graph extraction",
                file_path,
                exc_info=True,
            )
            return [], []

        self._source_bytes = content.encode("utf-8")
        self._file_path = file_path
        self._module_id = file_path
        self._lang_cfg = lang_cfg
        # Fresh lists on every call, so results handed out earlier are
        # never mutated by a later extraction.
        self._nodes = []
        self._edges = []

        tree = parser.parse(self._source_bytes)

        module_node = GraphNode(
            id=self._module_id,
            kind=NodeKind.MODULE,
            file_path=file_path,
            start_line=1,
            end_line=tree.root_node.end_point[0] + 1,
            name=file_path,
            qualified_name=file_path,
        )
        self._nodes.append(module_node)

        self._visit(tree.root_node, scope_stack=[])
        return self._nodes, self._edges

    # ------------------------------------------------------------------
    # Tree traversal
    # ------------------------------------------------------------------

    def _visit(self, node, *, scope_stack: list[str]) -> None:
        """Recursively walk the syntax tree, dispatching to type-specific handlers."""
        ntype = node.type
        lang_cfg = self._lang_cfg

        if ntype in lang_cfg.import_types:
            self._handle_import(node)
        elif ntype in lang_cfg.function_scope_types:
            # Definition handlers walk their own children with the new scope
            self._handle_function_def(node, scope_stack=scope_stack)
            return
        elif ntype in lang_cfg.class_scope_types:
            self._handle_class_def(node, scope_stack=scope_stack)
            return
        elif ntype in lang_cfg.call_types:
            self._handle_call(node, scope_stack=scope_stack)

        for child in node.children:
            self._visit(child, scope_stack=scope_stack)

    # ------------------------------------------------------------------
    # Node-type handlers
    # ------------------------------------------------------------------

    def _handle_import(self, node) -> None:
        """Create IMPORTS edges for an import statement.

        In a ``from X import ...`` statement only the first module-like
        child (``X``) produces an edge; the imported names that follow are
        skipped. Relative imports are rewritten to path-style targets when
        they can be resolved.
        """
        is_from_import = node.type == "import_from_statement"
        found_module = False
        for child in node.children:
            ctype = child.type
            if ctype == "aliased_import":
                # In "from X import y as z" the alias wraps an imported
                # name, not a module, so it never yields an edge.
                if is_from_import:
                    continue
                # "import a.b as c": the module is the dotted_name wrapped
                # by the alias node.
                child = next(
                    (sub for sub in child.children if sub.type == "dotted_name"),
                    None,
                )
                if child is None:
                    continue
                ctype = "dotted_name"

            if ctype in ("dotted_name", "relative_import"):
                if is_from_import and found_module:
                    continue
                target = self._text(child)
                if target:
                    if ctype == "relative_import" and target.startswith("."):
                        # Keep the raw text when resolution is impossible
                        target = (
                            _resolve_relative_import(self._module_id, target)
                            or target
                        )
                    self._add_edge(self._module_id, target, EdgeKind.IMPORTS)
                    if is_from_import:
                        found_module = True
            elif ctype == "string":
                # Quoted module specifier; strip either quote style
                target = self._text(child).strip("'\"")
                if target:
                    self._add_edge(self._module_id, target, EdgeKind.IMPORTS)
            elif ctype == "interpreted_string_literal":
                # Double-quoted import path
                target = self._text(child).strip('"')
                if target:
                    self._add_edge(self._module_id, target, EdgeKind.IMPORTS)

    def _handle_function_def(self, node, *, scope_stack: list[str]) -> None:
        """Create a FUNCTION node and DEFINES edge, then recurse into the body."""
        definition = self._add_definition(node, NodeKind.FUNCTION, scope_stack)
        # A definition without an identifier gets no node of its own, but
        # its body is still walked so nested calls and definitions are
        # attributed to the enclosing scope instead of being dropped.
        inner_scope = scope_stack if definition is None else definition[1]
        for child in node.children:
            self._visit(child, scope_stack=inner_scope)

    def _handle_class_def(self, node, *, scope_stack: list[str]) -> None:
        """Create a CLASS node, DEFINES + INHERITS edges, then recurse."""
        definition = self._add_definition(node, NodeKind.CLASS, scope_stack)
        if definition is None:
            # Nameless class: no node or INHERITS edges, but keep walking
            for child in node.children:
                self._visit(child, scope_stack=scope_stack)
            return

        node_id, inner_scope = definition
        for child in node.children:
            if child.type not in self._BASE_LIST_TYPES:
                continue
            for arg in child.children:
                if arg.type in self._BASE_NAME_TYPES and (
                    base_name := self._text(arg)
                ):
                    self._add_edge(node_id, base_name, EdgeKind.INHERITS)

        for child in node.children:
            self._visit(child, scope_stack=inner_scope)

    def _handle_call(self, node, *, scope_stack: list[str]) -> None:
        """Create a CALLS edge from the enclosing scope to the callee."""
        caller_id = (
            f"{self._file_path}::{'.'.join(scope_stack)}"
            if scope_stack
            else self._module_id
        )
        if not node.children:
            return
        # The callee expression is taken to be the call node's first child
        raw_callee = self._text(node.children[0])
        if not raw_callee:
            return
        callee_name = _extract_callee_name(raw_callee)
        if callee_name and callee_name not in self._excluded_callees:
            self._add_edge(caller_id, callee_name, EdgeKind.CALLS)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _add_edge(self, source: str, target: str, kind: EdgeKind) -> None:
        """Append one edge to the current file's edge list."""
        self._edges.append(GraphEdge(source=source, target=target, kind=kind))

    def _add_definition(
        self, node, kind: NodeKind, scope_stack: list[str]
    ) -> tuple[str, list[str]] | None:
        """Record a definition node plus its DEFINES edge from the module.

        Returns:
            ``(node_id, inner_scope)`` for the new definition, or None when
            the syntax node has no identifier and nothing was recorded.
        """
        name = self._get_identifier(node)
        if not name:
            return None
        inner_scope = [*scope_stack, name]
        qualified = ".".join(inner_scope)
        node_id = f"{self._file_path}::{qualified}"
        self._nodes.append(
            GraphNode(
                id=node_id,
                kind=kind,
                file_path=self._file_path,
                # Parser points are zero-based; graph lines are one-based
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
                name=name,
                qualified_name=qualified,
            )
        )
        self._add_edge(self._module_id, node_id, EdgeKind.DEFINES)
        return node_id, inner_scope

    def _get_identifier(self, node) -> str:
        """Return the identifier name from a definition node.

        The first direct child with an identifier-like type wins; an empty
        string is returned when there is none.
        """
        for child in node.children:
            if child.type in self._IDENTIFIER_TYPES:
                return self._text(child)
        return ""

    def _text(self, node) -> str:
        """Decode the raw source text spanned by a syntax tree node."""
        return self._source_bytes[node.start_byte : node.end_byte].decode(
            "utf-8", errors="replace"
        )
|
|
File without changes
|