coderay 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,311 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from collections import defaultdict
5
+ from typing import Any
6
+
7
+ import networkx as nx
8
+
9
+ from coderay.chunking.registry import (
10
+ get_init_filenames,
11
+ get_resolution_suffixes,
12
+ get_supported_extensions,
13
+ )
14
+ from coderay.core.models import EdgeKind, GraphEdge, GraphNode, NodeKind
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
# Registry-derived lookup sets. Both start empty and are filled in by
# ``_ensure_registry_cache`` the first time they are needed.
_KNOWN_EXTENSIONS: frozenset[str] = frozenset()
_KNOWN_INIT_FILENAMES: frozenset[str] = frozenset()


def _ensure_registry_cache() -> None:
    """Fill the module-level extension / init-filename caches on first use.

    An empty ``_KNOWN_EXTENSIONS`` is treated as "not loaded yet"; once it
    is non-empty, later calls return immediately without touching the
    registry again.
    """
    global _KNOWN_EXTENSIONS, _KNOWN_INIT_FILENAMES  # noqa: PLW0603
    if _KNOWN_EXTENSIONS:
        return
    _KNOWN_EXTENSIONS = frozenset(get_supported_extensions())
    _KNOWN_INIT_FILENAMES = frozenset(get_init_filenames())
29
+
30
+
31
def _file_path_to_module_names(file_path: str) -> list[str]:
    """Derive possible module names from a file path.

    The known file extension (longest match first) is stripped, a leading
    ``src`` directory and a trailing init-style filename are dropped, and
    every suffix of the remaining path is emitted in dotted form and, when
    different, in slash-separated form.  For ``src/core/models.py`` (with
    ``.py`` registered as an extension) this yields
    ``["core.models", "core/models", "models"]``.

    Args:
        file_path: Path of a source file; ``/`` and ``\\`` separators are
            both accepted.

    Returns:
        Candidate names ordered from the longest suffix to the shortest,
        or an empty list when nothing remains after stripping.
    """
    _ensure_registry_cache()

    # Strip the file extension using the registry; longest first so that a
    # compound extension wins over a shorter one it ends with.
    stem = file_path
    for ext in sorted(_KNOWN_EXTENSIONS, key=len, reverse=True):
        # Guard against an empty extension: ``stem[:-0]`` would wipe the path.
        if ext and stem.endswith(ext):
            stem = stem[: -len(ext)]
            break

    # Drop empty and "." segments so absolute paths, "./x" and "a//b" do not
    # produce names with stray leading or doubled separators.
    parts = [
        segment
        for segment in stem.replace("\\", "/").split("/")
        if segment and segment != "."
    ]

    # Strip common layout prefixes
    if parts and parts[0] == "src":
        parts = parts[1:]

    # Strip init-style filenames (Python __init__, JS/TS index)
    if parts and parts[-1] in _KNOWN_INIT_FILENAMES:
        parts = parts[:-1]

    names: list[str] = []
    # Peel one leading segment per iteration: longest suffix first.
    while parts:
        dotted = ".".join(parts)
        names.append(dotted)
        slashed = "/".join(parts)
        if slashed != dotted:
            names.append(slashed)
        parts = parts[1:]
    return names
64
+
65
+
66
class CodeGraph:
    """In-memory directed graph of code relationships.

    Real nodes are stored under their full ID with the ``GraphNode`` kept
    in the ``data`` node attribute.  Edge endpoints that were never passed
    to ``add_node`` exist only as "phantom" nodes without a ``data``
    attribute; ``resolve_edges`` tries to rewire edges pointing at them to
    real nodes.
    """

    def __init__(self) -> None:
        self._g: nx.DiGraph = nx.DiGraph()

        # short name -> {full node IDs} — enables resolving bare names like
        # "foo" to "src/utils.py::foo". Multiple IDs when the name is ambiguous
        # (e.g. two files both define a function called "helper").
        self._symbol_index: dict[str, set[str]] = defaultdict(set)

        # dotted module name -> node ID — maps Python-style import paths
        # (e.g. "core.models") to the MODULE node they refer to.
        self._module_index: dict[str, str] = {}

        # file path -> {node IDs} — all nodes belonging to a file, for O(k)
        # file removal instead of scanning the entire graph.
        self._file_index: dict[str, set[str]] = defaultdict(set)

    def _index_node(self, node: GraphNode) -> None:
        """Register a node in all secondary indexes."""
        self._symbol_index[node.name].add(node.id)
        self._file_index[node.file_path].add(node.id)
        if node.kind == NodeKind.MODULE:
            # Register all suffix variants so that "import models" and
            # "import core.models" both resolve to the same MODULE node.
            # The first module to claim a name keeps it.
            for mod_name in _file_path_to_module_names(node.file_path):
                self._module_index.setdefault(mod_name, node.id)

    def _unindex_node(self, node: GraphNode) -> None:
        """Remove a node from all secondary indexes.

        Buckets that become empty are deleted so the indexes do not grow
        without bound as files are repeatedly removed and re-added.
        """
        for index, key in (
            (self._symbol_index, node.name),
            (self._file_index, node.file_path),
        ):
            bucket = index.get(key)
            if bucket is None:
                continue
            bucket.discard(node.id)
            if not bucket:
                del index[key]
        if node.kind == NodeKind.MODULE:
            for mod_name in _file_path_to_module_names(node.file_path):
                # Only drop names this node actually owns.
                if self._module_index.get(mod_name) == node.id:
                    del self._module_index[mod_name]

    def add_node(self, node: GraphNode) -> None:
        """Insert a node into the graph and update all secondary indexes."""
        self._g.add_node(node.id, data=node)
        self._index_node(node)

    def add_edge(self, edge: GraphEdge) -> None:
        """Insert a directed edge, storing its kind as the ``kind`` attribute."""
        self._g.add_edge(edge.source, edge.target, kind=edge.kind)

    def add_nodes_and_edges(
        self, nodes: list[GraphNode], edges: list[GraphEdge]
    ) -> None:
        """Bulk-insert nodes then edges (order matters — nodes first)."""
        for n in nodes:
            self.add_node(n)
        for e in edges:
            self.add_edge(e)

    def remove_file(self, file_path: str) -> int:
        """Remove all nodes belonging to a file and clean up every index.

        Returns:
            Number of nodes removed.
        """
        to_remove = self._file_index.pop(file_path, set())
        removed = 0
        for nid in to_remove:
            attrs = self._g.nodes.get(nid)
            if attrs is None:
                # Indexed ID no longer present in the graph; nothing to drop.
                continue
            node: GraphNode | None = attrs.get("data")
            if node:
                self._unindex_node(node)
            # NetworkX also removes all edges touching this node
            self._g.remove_node(nid)
            removed += 1
        return removed

    def resolve_symbol(self, name: str, caller_file: str | None = None) -> str | None:
        """Resolve a short/bare name to a fully-qualified node ID.

        Args:
            name: Full node ID or bare symbol name.
            caller_file: File the reference originates from.  When the bare
                name is defined in several places, a single definition
                inside this file is preferred (a heuristic).

        Returns:
            Full node ID, or None if the name cannot be uniquely resolved.
        """
        # Already a full node ID (e.g. "src/a.py::foo") — fast path
        if name in self._g and self._g.nodes[name].get("data") is not None:
            return name
        candidates = self._symbol_index.get(name)
        if not candidates:
            return None
        if len(candidates) == 1:
            return next(iter(candidates))
        if caller_file is not None:
            # Ambiguous globally: keep only definitions from the caller's file.
            same_file = candidates & self._file_index.get(caller_file, set())
            if len(same_file) == 1:
                return next(iter(same_file))
        return None

    def resolve_edges(self) -> int:
        """Rewire phantom edge targets to real node IDs.

        Only CALLS, INHERITS and IMPORTS edges are considered.  IMPORTS
        edges additionally fall back to the module index and then to
        path-style matching.

        Returns:
            Number of edges successfully resolved.
        """
        resolvable = {EdgeKind.CALLS, EdgeKind.INHERITS, EdgeKind.IMPORTS}
        to_remove: list[tuple[str, str]] = []
        to_add: list[tuple[str, str, dict[str, Any]]] = []

        for u, v, data in self._g.edges(data=True):
            kind = data.get("kind")
            if kind not in resolvable:
                continue
            # Target is already a real node — nothing to resolve
            if v in self._g and self._g.nodes[v].get("data") is not None:
                continue

            # Extract the caller's file path from its node ID
            # ("src/a.py::MyClass.method" -> "src/a.py")
            caller_file = u.split("::")[0] if "::" in u else u
            resolved = self.resolve_symbol(v, caller_file=caller_file)
            if not resolved and kind == EdgeKind.IMPORTS:
                resolved = self._module_index.get(v) or self._resolve_path_target(v)
            if resolved and resolved != v:
                to_remove.append((u, v))
                to_add.append((u, resolved, dict(data)))

        # Apply changes in a second pass (safe to mutate now)
        for u, v in to_remove:
            if self._g.has_edge(u, v):
                self._g.remove_edge(u, v)
        for u, v, data in to_add:
            self._g.add_edge(u, v, **data)

        if to_add:
            logger.info("Resolved %d edges via symbol/module index", len(to_add))
        return len(to_add)

    def _resolve_path_target(self, target: str) -> str | None:
        """Try to match a path-style target to an existing MODULE node.

        Each registry resolution suffix is appended to ``target`` and looked
        up as a node ID; failing that, the path (minus a leading ``src/``)
        is converted to dotted form and looked up in the module index.
        Targets without a ``/`` are not path-style and yield None.
        """
        if "/" not in target:
            return None
        for suffix in get_resolution_suffixes():
            candidate = target + suffix
            node_data = self._g.nodes.get(candidate, {})
            if node_data and node_data.get("data") is not None:
                return candidate
        cleaned = target
        if cleaned.startswith("src/"):
            cleaned = cleaned[4:]
        dotted = cleaned.replace("/", ".")
        return self._module_index.get(dotted)

    def get_node(self, node_id: str) -> GraphNode | None:
        """Look up a node by its full ID. Returns None for phantoms or missing nodes."""
        data = self._g.nodes.get(node_id)
        if data:
            return data.get("data")
        return None

    # ------------------------------------------------------------------
    # Public query methods
    # ------------------------------------------------------------------

    def get_impact_radius(self, symbol: str, depth: int = 2) -> list[GraphNode]:
        """Find all nodes that could be affected if ``symbol`` changes.

        Args:
            symbol: Full node ID or bare name.  A name that cannot be
                resolved is used as-is, which matches a phantom node of
                that name if one exists.
            depth: Number of reverse-BFS hops. Higher values may return
                very large sets.

        Returns:
            Real nodes reachable by following edges backwards, in discovery
            order.  Empty when the symbol is not present in the graph.
        """
        start = self.resolve_symbol(symbol) or symbol
        if start not in self._g:
            # ``predecessors`` raises for unknown nodes; report "no impact".
            return []

        # dict used as an insertion-ordered set for a deterministic result
        visited: dict[str, None] = {}
        frontier = [start]
        for _ in range(depth):
            next_frontier: list[str] = []
            for nid in frontier:
                # predecessors = nodes that have an edge pointing TO nid
                for pred in self._g.predecessors(nid):
                    if pred not in visited:
                        visited[pred] = None
                        next_frontier.append(pred)
            if not next_frontier:
                break
            frontier = next_frontier

        found = (self.get_node(nid) for nid in visited)
        return [node for node in found if node is not None]

    @property
    def node_count(self) -> int:
        """Total nodes in the graph (including phantoms)."""
        return self._g.number_of_nodes()

    @property
    def edge_count(self) -> int:
        """Total edges in the graph."""
        return self._g.number_of_edges()

    # ------------------------------------------------------------------
    # Serialisation
    #
    # The graph is persisted as JSON (graph.json). Only real nodes are
    # serialised — phantom nodes are recreated implicitly when edges
    # referencing them are re-added. Secondary indexes are rebuilt by
    # ``add_node`` during ``from_dict``.
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Serialise the graph to a JSON-compatible dict.

        Returns:
            ``{"nodes": [...], "edges": [...]}``; phantom nodes are omitted
            from ``nodes`` but edges pointing at them are kept.
        """
        nodes = []
        for _nid, attrs in self._g.nodes(data=True):
            gn: GraphNode | None = attrs.get("data") if attrs else None
            if gn:
                nodes.append(
                    {
                        "id": gn.id,
                        "kind": gn.kind.value,
                        "file_path": gn.file_path,
                        "start_line": gn.start_line,
                        "end_line": gn.end_line,
                        "name": gn.name,
                        "qualified_name": gn.qualified_name,
                    }
                )
        edges_list = []
        for u, v, attrs in self._g.edges(data=True):
            kind = attrs.get("kind", "")
            # Enum members serialise by value; anything else is stringified.
            kind_val = kind.value if hasattr(kind, "value") else str(kind)
            edges_list.append({"source": u, "target": v, "kind": kind_val})
        return {"nodes": nodes, "edges": edges_list}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CodeGraph:
        """Deserialise a graph from a dict produced by ``to_dict``."""
        graph = cls()
        for nd in data.get("nodes", []):
            graph.add_node(
                GraphNode(
                    id=nd["id"],
                    kind=NodeKind(nd["kind"]),
                    file_path=nd["file_path"],
                    start_line=nd["start_line"],
                    end_line=nd["end_line"],
                    name=nd["name"],
                    qualified_name=nd["qualified_name"],
                )
            )
        for ed in data.get("edges", []):
            graph.add_edge(
                GraphEdge(
                    source=ed["source"],
                    target=ed["target"],
                    kind=EdgeKind(ed["kind"]),
                )
            )
        return graph
@@ -0,0 +1,315 @@
1
+ from __future__ import annotations
2
+
3
+ import builtins
4
+ import logging
5
+ from typing import Any
6
+
7
+ from coderay.chunking.registry import LanguageConfig, get_language_for_file
8
+ from coderay.core.models import EdgeKind, GraphEdge, GraphNode, NodeKind
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Every attribute name of the ``builtins`` module that does not start with
# an underscore.
_PYTHON_BUILTINS: frozenset[str] = frozenset(
    attr for attr in dir(builtins) if not attr.startswith("_")
)


def build_callee_filter(config: dict[str, Any] | None = None) -> frozenset[str]:
    """Compute the set of callee names that must not produce CALLS edges.

    The result is the builtin names plus ``graph.exclude_callees`` from the
    config, minus ``graph.include_callees``.

    Args:
        config: Full application config dict; only its ``graph`` section is
            consulted.  ``None``, a missing section, or missing keys all
            fall back to the plain builtin set.

    Returns:
        Frozen set of callee names to exclude.
    """
    graph_section = config.get("graph") if config else None
    if not graph_section:
        graph_section = {}
    excluded = _PYTHON_BUILTINS.union(graph_section.get("exclude_callees") or [])
    # Forced includes win over both builtins and user-supplied excludes.
    return excluded.difference(graph_section.get("include_callees") or [])
31
+
32
+
33
+ def _resolve_relative_import(source_file: str, relative_target: str) -> str | None:
34
+ """Resolve a Python relative import to a path-based target.
35
+
36
+ Args:
37
+ source_file: Path of the file containing the import.
38
+ relative_target: Dotted import string starting with one or more dots.
39
+
40
+ Returns:
41
+ Slash-separated path (no extension), or None if dots exceed
42
+ the directory depth.
43
+ """
44
+ dots = len(relative_target) - len(relative_target.lstrip("."))
45
+ rest = relative_target[dots:]
46
+
47
+ parts = source_file.replace("\\", "/").split("/")
48
+ dir_parts = parts[:-1]
49
+
50
+ levels_up = max(dots - 1, 0)
51
+ if levels_up > len(dir_parts):
52
+ return None
53
+ if levels_up > 0:
54
+ dir_parts = dir_parts[:-levels_up]
55
+
56
+ if rest:
57
+ dir_parts.extend(rest.split("."))
58
+
59
+ return "/".join(dir_parts) if dir_parts else None
60
+
61
+
62
+ def _extract_callee_name(text: str) -> str:
63
+ """Extract the final method/function name from a call expression."""
64
+ cleaned = text
65
+ if cleaned.startswith("self."):
66
+ cleaned = cleaned[5:]
67
+ elif cleaned.startswith("this."):
68
+ cleaned = cleaned[5:]
69
+ parts = cleaned.split(".")
70
+ return parts[-1] if parts else cleaned
71
+
72
+
73
class GraphExtractor:
    """Extract graph nodes and edges from source files.

    Per-file state (source bytes, path, language config, collected nodes
    and edges) lives on the instance and is reset at the start of every
    successful ``extract_from_file`` call.
    """

    def __init__(self, config: dict[str, Any] | None = None) -> None:
        """Initialize the extractor with optional config overrides."""
        self._excluded_callees = build_callee_filter(config)
        self._source_bytes: bytes = b""
        self._file_path: str = ""
        self._module_id: str = ""
        self._lang_cfg: LanguageConfig | None = None
        self._nodes: list[GraphNode] = []
        self._edges: list[GraphEdge] = []

    def extract_from_file(
        self,
        file_path: str,
        content: str,
    ) -> tuple[list[GraphNode], list[GraphEdge]]:
        """Parse a source file and extract all graph nodes and edges.

        A MODULE node whose ID, name and qualified name are all
        ``file_path`` is always emitted first.

        Args:
            file_path: Path used to pick the language and to build node IDs.
            content: Full text of the file.

        Returns:
            Tuple of (nodes, edges). Returns ``([], [])`` if the language
            is unsupported or no parser can be obtained.
        """
        lang_cfg = get_language_for_file(file_path)
        if lang_cfg is None:
            return [], []

        try:
            parser = lang_cfg.get_parser()
        except Exception:
            # Best-effort: skip the file, but leave a trace for debugging.
            logger.debug(
                "No parser available for %s; skipping graph extraction",
                file_path,
                exc_info=True,
            )
            return [], []

        self._source_bytes = content.encode("utf-8")
        self._file_path = file_path
        self._module_id = file_path
        self._lang_cfg = lang_cfg
        self._nodes = []
        self._edges = []

        tree = parser.parse(self._source_bytes)

        self._nodes.append(
            GraphNode(
                id=self._module_id,
                kind=NodeKind.MODULE,
                file_path=file_path,
                start_line=1,
                # end_point rows are 0-based; node lines are 1-based
                end_line=tree.root_node.end_point[0] + 1,
                name=file_path,
                qualified_name=file_path,
            )
        )

        self._visit(tree.root_node, scope_stack=[])
        return self._nodes, self._edges

    # ------------------------------------------------------------------
    # Tree traversal
    # ------------------------------------------------------------------

    def _visit(self, node, *, scope_stack: list[str]) -> None:
        """Recursively walk the syntax tree, dispatching to type-specific handlers.

        Function and class handlers recurse into their own children, so
        this method returns right after delegating to them.
        """
        ntype = node.type
        lang_cfg = self._lang_cfg

        if ntype in lang_cfg.import_types:
            self._handle_import(node)
        elif ntype in lang_cfg.function_scope_types:
            self._handle_function_def(node, scope_stack=scope_stack)
            return
        elif ntype in lang_cfg.class_scope_types:
            self._handle_class_def(node, scope_stack=scope_stack)
            return
        elif ntype in lang_cfg.call_types:
            self._handle_call(node, scope_stack=scope_stack)

        for child in node.children:
            self._visit(child, scope_stack=scope_stack)

    # ------------------------------------------------------------------
    # Node-type handlers
    # ------------------------------------------------------------------

    def _add_import_edge(self, target: str) -> None:
        """Append an IMPORTS edge from the current module; empty targets are skipped."""
        if target:
            self._edges.append(
                GraphEdge(
                    source=self._module_id,
                    target=target,
                    kind=EdgeKind.IMPORTS,
                )
            )

    def _handle_import(self, node) -> None:
        """Create IMPORTS edges for an import statement.

        Handles ``dotted_name`` / ``relative_import`` children as well as
        quoted ``string`` and ``interpreted_string_literal`` children.  For
        an ``import_from_statement`` only the first name (the module) gets
        an edge; the names imported from it are ignored.
        """
        is_from_import = node.type == "import_from_statement"
        found_module = False
        for child in node.children:
            ctype = child.type
            if ctype in ("dotted_name", "relative_import"):
                if is_from_import and found_module:
                    continue
                target = self._text(child)
                if not target:
                    continue
                if ctype == "relative_import" and target.startswith("."):
                    # Fall back to the raw dotted text when it cannot be resolved.
                    target = _resolve_relative_import(self._module_id, target) or target
                self._add_import_edge(target)
                if is_from_import:
                    found_module = True
            elif ctype == "string":
                self._add_import_edge(self._text(child).strip("'\""))
            elif ctype == "interpreted_string_literal":
                self._add_import_edge(self._text(child).strip('"'))

    def _add_definition(
        self, node, kind: NodeKind, name: str, scope_stack: list[str]
    ) -> str:
        """Record a definition node plus its module-level DEFINES edge.

        Returns:
            The new node's ID, ``"<file_path>::<dotted qualified name>"``.
        """
        qualified = ".".join([*scope_stack, name])
        node_id = f"{self._file_path}::{qualified}"
        self._nodes.append(
            GraphNode(
                id=node_id,
                kind=kind,
                file_path=self._file_path,
                # tree-sitter rows are 0-based; node lines are 1-based
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
                name=name,
                qualified_name=qualified,
            )
        )
        self._edges.append(
            GraphEdge(
                source=self._module_id,
                target=node_id,
                kind=EdgeKind.DEFINES,
            )
        )
        return node_id

    def _handle_function_def(self, node, *, scope_stack: list[str]) -> None:
        """Create a FUNCTION node and DEFINES edge, then recurse into the body.

        A definition without an identifier child produces no node, but its
        children are still visited in the enclosing scope so that calls and
        nested definitions inside it are not lost.
        """
        name = self._get_identifier(node)
        if name:
            self._add_definition(node, NodeKind.FUNCTION, name, scope_stack)
            scope_stack = [*scope_stack, name]
        for child in node.children:
            self._visit(child, scope_stack=scope_stack)

    def _handle_class_def(self, node, *, scope_stack: list[str]) -> None:
        """Create a CLASS node, DEFINES + INHERITS edges, then recurse.

        Base classes are read from ``argument_list``, ``superclass`` and
        ``extends_clause`` children.  An unnamed class produces no node or
        INHERITS edges, but its children are still visited in the enclosing
        scope.
        """
        name = self._get_identifier(node)
        if name:
            node_id = self._add_definition(node, NodeKind.CLASS, name, scope_stack)
            base_types = ("identifier", "dotted_name", "attribute", "type_identifier")
            for child in node.children:
                if child.type not in ("argument_list", "superclass", "extends_clause"):
                    continue
                for arg in child.children:
                    if arg.type not in base_types:
                        continue
                    if base_name := self._text(arg):
                        self._edges.append(
                            GraphEdge(
                                source=node_id,
                                target=base_name,
                                kind=EdgeKind.INHERITS,
                            )
                        )
            scope_stack = [*scope_stack, name]
        for child in node.children:
            self._visit(child, scope_stack=scope_stack)

    def _handle_call(self, node, *, scope_stack: list[str]) -> None:
        """Create a CALLS edge from the enclosing scope to the callee.

        The callee is taken from the call node's first child and reduced to
        its last dotted segment; names in the exclusion set produce no edge.
        Calls outside any definition are attributed to the module.
        """
        if not node.children:
            return
        raw_callee = self._text(node.children[0])
        if not raw_callee:
            return
        callee_name = _extract_callee_name(raw_callee)
        if not callee_name or callee_name in self._excluded_callees:
            return
        caller_id = (
            f"{self._file_path}::{'.'.join(scope_stack)}"
            if scope_stack
            else self._module_id
        )
        self._edges.append(
            GraphEdge(
                source=caller_id,
                target=callee_name,
                kind=EdgeKind.CALLS,
            )
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _get_identifier(self, node) -> str:
        """Return the text of the first identifier-like direct child, or ""."""
        for child in node.children:
            if child.type in ("identifier", "type_identifier", "field_identifier"):
                return self._text(child)
        return ""

    def _text(self, node) -> str:
        """Decode the raw source text spanned by a syntax tree node."""
        return self._source_bytes[node.start_byte : node.end_byte].decode(
            "utf-8", errors="replace"
        )
File without changes