codeanalyzer-python 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,176 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """The output-agnostic intermediate between :func:`project` and the two writers
18
+ (cypher snapshot / bolt incremental). Pure data — no I/O, no driver. A
19
+ :class:`GraphRows` is a deterministic, deduped bag of nodes and edges that both
20
+ writers consume identically.
21
+
22
+ Property values are restricted to Neo4j-legal shapes: primitives and homogeneous
23
+ arrays of primitives. ``None`` values are pruned (in Neo4j a null property is
24
+ simply absence).
25
+ """
26
+ from __future__ import annotations
27
+
28
+ from dataclasses import dataclass, field
29
+ from typing import Dict, List, Optional, Union
30
+
31
+ # A property value: a primitive, or a homogeneous list of primitives.
32
+ Scalar = Union[str, int, float, bool]
33
+ Prop = Union[Scalar, List[str], List[int], List[float], List[bool]]
34
+ Props = Dict[str, Prop]
35
+
36
+
37
@dataclass(frozen=True)
class NodeRef:
    """Address of one edge endpoint.

    Carries the label and key property a writer MATCHes on, plus the key's
    value. Instances are frozen, so they compare by value and are hashable.
    """

    # Label carrying the uniqueness constraint (e.g. "PySymbol", "PyModule").
    label: str
    # Name of the identifying property: "signature" | "file_key" | "name" | "id".
    key_prop: str
    # Value of ``key_prop`` on the addressed node.
    value: str
45
+
46
+
47
@dataclass
class NodeRow:
    """One node to be written: its labels, identifying key and properties."""

    # labels[0] is the constrained MERGE label; the remaining entries are SET
    # as extra labels.
    labels: List[str]
    # Name of the property that identifies the node under labels[0].
    key_prop: str
    # Value of ``key_prop``.
    value: str
    # Remaining properties stored on the node.
    props: Props
53
+
54
+
55
@dataclass
class EdgeRow:
    """One relationship to be written between two addressed nodes."""

    # Relationship type.
    type: str
    # Source endpoint.
    from_ref: NodeRef
    # Target endpoint.
    to_ref: NodeRef
    # Properties stored on the relationship.
    props: Props
61
+
62
+
63
@dataclass
class GraphRows:
    """The node rows and edge rows handed to a writer.

    Both lists default to a fresh empty list per instance.
    """

    nodes: List[NodeRow] = field(default_factory=list)
    edges: List[EdgeRow] = field(default_factory=list)
67
+
68
+
69
def prune(p: Dict[str, Optional[Prop]]) -> Props:
    """Return a copy of ``p`` without the entries whose value is ``None``.

    Neo4j treats a null property as an absent one, so nulls are never stored.
    Only ``None`` is removed: empty lists and other falsy values are kept,
    since a present-but-empty array is legal.
    """
    kept: Props = {}
    for name, value in p.items():
        if value is None:
            continue
        kept[name] = value
    return kept
73
+
74
+
75
class RowBuilder:
    """Accumulates nodes/edges with ``MERGE`` semantics in memory.

    The same node touched many times (a hot external symbol, a canonical
    decorator) collapses to one row, and cross-reference edges to a target that
    never materialized are dropped (the "edge-only-when-resolved" rule).
    """

    def __init__(self) -> None:
        self._nodes: Dict[str, NodeRow] = {}  # key: f"{labels[0]} {value}"
        self._edges: List[EdgeRow] = []
        self._deferred: List[EdgeRow] = []  # edges gated against node existence at finish()
        self._keys: set = set()  # every node value seen, whatever its label (backs has_key)

    @staticmethod
    def _node_id(label: str, value: str) -> str:
        """Identity of a node inside the builder: constrained label + key value."""
        return f"{label} {value}"

    def node(self, labels: List[str], key_prop: str, value: str, props: Props) -> NodeRef:
        """Upsert a node and return the reference edges use to address it.

        Re-seeing the same ``(labels[0], value)`` merges props (last write wins)
        and unions labels — the in-memory analog of
        ``MERGE (n:Label {key}) SET n += props``. The caller's ``labels`` and
        ``props`` are copied on first insert, never aliased.
        """
        node_id = self._node_id(labels[0], value)
        existing = self._nodes.get(node_id)
        if existing is not None:
            existing.props.update(props)
            for label in labels:
                if label not in existing.labels:
                    existing.labels.append(label)
        else:
            self._nodes[node_id] = NodeRow(list(labels), key_prop, value, dict(props))
        self._keys.add(value)
        return NodeRef(labels[0], key_prop, value)

    def edge(self, type_: str, from_ref: NodeRef, to_ref: NodeRef, props: Optional[Props] = None) -> None:
        """Record an edge whose endpoints are known to exist (both ends emitted this run)."""
        self._edges.append(EdgeRow(type_, from_ref, to_ref, dict(props or {})))

    def edge_to_symbol(
        self, type_: str, from_ref: NodeRef, target_signature: str, props: Optional[Props] = None
    ) -> None:
        """Record an edge to a ``:PySymbol`` target that may not be in the graph.

        The target may be external/library code. The edge is deferred and kept
        by :meth:`finish` only if a ``PySymbol`` node with that signature was
        actually emitted — so PY_EXTENDS / PY_RESOLVES_TO never dangle (the
        string fallback lives on the source node's props).
        """
        self._deferred.append(
            EdgeRow(
                type_,
                from_ref,
                NodeRef("PySymbol", "signature", target_signature),
                dict(props or {}),
            )
        )

    def has_key(self, value: str) -> bool:
        """Return whether any node, under any label, was emitted with this key value."""
        return value in self._keys

    def finish(self) -> GraphRows:
        """Return the accumulated rows, sorted for deterministic output.

        Deferred edges are kept only when their target exists under the label
        they address. Matching on the bare value is not enough: a node of
        another label (for instance a decorator or package whose name equals a
        symbol signature) would let through an edge that no writer can MATCH.

        The builder's own state is left untouched, so calling ``finish`` again
        returns the same rows instead of duplicating the deferred edges.
        """
        resolved = [
            e
            for e in self._deferred
            if self._node_id(e.to_ref.label, e.to_ref.value) in self._nodes
        ]
        nodes = sorted(
            self._nodes.values(),
            key=lambda n: self._node_id(n.labels[0], n.value),
        )
        # sorted() is stable: edges with equal keys keep their insertion order,
        # direct edges ahead of deferred ones.
        edges = sorted(
            self._edges + resolved,
            key=lambda e: f"{e.type} {e.from_ref.value} {e.to_ref.value}",
        )
        return GraphRows(nodes, edges)
134
+
135
+
136
+ # ----------------------------------------------------------------------------------------------
137
+ # Cypher literal rendering (used by the snapshot writer; the bolt writer passes params instead).
138
+ # ----------------------------------------------------------------------------------------------
139
+
140
+
141
def cypher_value(v: Prop) -> str:
    """Render a property value as a Cypher literal.

    Booleans become ``true``/``false``, strings are quoted and escaped, ints
    and finite floats are printed as numbers, and lists are rendered element by
    element. NaN, infinities and any unsupported type come out as ``null``.
    """
    # bool is tested before int because it is an int subclass and would
    # otherwise be printed as 1/0.
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, str):
        return _cypher_string(v)
    if isinstance(v, float):
        # ``v != v`` is true only for NaN.
        non_finite = v != v or v == float("inf") or v == float("-inf")
        return "null" if non_finite else repr(v)
    if isinstance(v, int):
        return str(v)
    if isinstance(v, list):
        rendered = [cypher_value(item) for item in v]
        return "[" + ", ".join(rendered) + "]"
    return "null"
155
+
156
+
157
def cypher_map(props: Props) -> str:
    """Render a props map as a Cypher map literal: ``{key: value, ...}``.

    Keys are written verbatim, so they must already be valid Cypher
    identifiers; values go through :func:`cypher_value`. Entries keep the
    mapping's iteration order.
    """
    entries = []
    for key, value in props.items():
        entries.append(f"{key}: {cypher_value(value)}")
    return "{" + ", ".join(entries) + "}"
161
+
162
+
163
+ def _cypher_string(s: str) -> str:
164
+ escaped = (
165
+ s.replace("\\", "\\\\")
166
+ .replace("'", "\\'")
167
+ .replace("\n", "\\n")
168
+ .replace("\r", "\\r")
169
+ .replace("\t", "\\t")
170
+ )
171
+ return f"'{escaped}'"
172
+
173
+
174
def chunk(items: list, size: int) -> list:
    """Split a list into consecutive chunks of at most ``size`` (UNWIND batch sizing).

    ``items`` is sliced, never mutated; an empty list yields no chunks. Raises
    ``ValueError`` when ``size`` is smaller than 1.
    """
    if size < 1:
        # A negative step makes range() empty, which would silently drop every
        # item; reject it just as a zero step is already rejected by range().
        raise ValueError(f"chunk size must be >= 1, got {size}")
    return [items[start : start + size] for start in range(0, len(items), size)]
@@ -0,0 +1,39 @@
1
+ ################################################################################
2
+ # Copyright IBM Corporation 2025
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ ################################################################################
16
+
17
+ """The Cypher DDL — uniqueness constraints and indexes — shared by both writers.
18
+ Run BEFORE any load so MERGE uses an index seek (not a label scan) and the
19
+ identity invariant is enforced by the database. Every statement is idempotent
20
+ (``IF NOT EXISTS``).
21
+ """
22
+ from typing import List
23
+
24
# One uniqueness constraint per identity-bearing label:
# (constraint name, node variable, label, unique property).
_UNIQUE_KEYS = [
    ("py_symbol_sig", "s", "PySymbol", "signature"),
    ("py_app_name", "a", "PyApplication", "name"),
    ("py_module_key", "m", "PyModule", "file_key"),
    ("py_package_name", "p", "PyPackage", "name"),
    ("py_decorator_name", "d", "PyDecorator", "name"),
    ("py_callsite_id", "c", "PyCallSite", "id"),
    ("py_attribute_id", "a", "PyAttribute", "id"),
    ("py_variable_id", "v", "PyVariable", "id"),
]

CONSTRAINTS: List[str] = [
    f"CREATE CONSTRAINT {name} IF NOT EXISTS FOR ({var}:{label}) REQUIRE {var}.{prop} IS UNIQUE"
    for name, var, label, prop in _UNIQUE_KEYS
]
34
+
35
INDEXES: List[str] = [
    # Plain indexes on the ``name`` property of callables and classes.
    *(
        f"CREATE INDEX {index_name} IF NOT EXISTS FOR (c:{label}) ON (c.name)"
        for index_name, label in (
            ("py_callable_name", "PyCallable"),
            ("py_class_name", "PyClass"),
        )
    ),
    # Full-text index over callable source code and docstrings.
    "CREATE FULLTEXT INDEX py_code_fts IF NOT EXISTS FOR (c:PyCallable) ON EACH [c.code, c.docstring]",
]
@@ -1,3 +1,3 @@
1
- from .options import AnalysisOptions
1
+ from .options import AnalysisOptions, EmitTarget, OutputFormat
2
2
 
3
- __all__ = ["AnalysisOptions"]
3
+ __all__ = ["AnalysisOptions", "EmitTarget", "OutputFormat"]
@@ -9,11 +9,31 @@ class OutputFormat(str, Enum):
9
9
  MSGPACK = "msgpack"
10
10
 
11
11
 
12
class EmitTarget(str, Enum):
    """What the analyzer produces, as chosen with ``--emit``.

    Members are plain strings, so they compare equal to their CLI spelling.
    """

    # The canonical ``analysis.json`` (symbol table + call graph).
    JSON = "json"
    # The analysis projected into a labeled property graph: a ``graph.cypher``
    # snapshot, or a live Bolt push with ``--neo4j-uri``.
    NEO4J = "neo4j"
    # The machine-readable, version-stamped Neo4j schema contract.
    SCHEMA = "schema"
24
+
25
+
12
26
  @dataclass
13
27
  class AnalysisOptions:
14
28
  input: Path
15
29
  output: Optional[Path] = None
16
30
  format: OutputFormat = OutputFormat.JSON
31
+ emit: EmitTarget = EmitTarget.JSON
32
+ app_name: Optional[str] = None
33
+ neo4j_uri: Optional[str] = None
34
+ neo4j_user: str = "neo4j"
35
+ neo4j_password: str = "neo4j"
36
+ neo4j_database: Optional[str] = None
17
37
  using_codeql: bool = False
18
38
  using_ray: bool = False
19
39
  rebuild_analysis: bool = False
@@ -32,6 +32,67 @@ from codeanalyzer.semantic_analysis.codeql.codeql_query_runner import CodeQLQuer
32
32
  from codeanalyzer.utils import logger
33
33
 
34
34
 
35
+ class _CallableResolver:
36
+ """Maps a CodeQL endpoint ``(file, start_line, name, arity)`` to a Jedi
37
+ ``PyCallable``.
38
+
39
+ Resolution ladder:
40
+ 1. exact ``(abs_path, start_line)`` — the precise join;
41
+ 2. on miss, candidates sharing ``(abs_path, short_name)``: a single
42
+ candidate is taken directly; otherwise prefer those whose
43
+ parameter count equals the CodeQL positional arity, then the
44
+ nearest ``start_line``;
45
+ 3. no name match -> ``None`` (caller row skipped / callee becomes
46
+ a ghost node).
47
+
48
+ Step 2 recovers edges the ``(file, line)`` join silently drops when
49
+ CodeQL and Jedi disagree on a definition's start line (e.g. decorator
50
+ handling). Jedi's ``parameters`` counts every declared slot (incl.
51
+ ``*args``/``**kwargs``/keyword-only) whereas CodeQL's arity is
52
+ positional only, so the arity filter is exact for plain signatures
53
+ and otherwise yields to the nearest-line tiebreak.
54
+ """
55
+
56
+ def __init__(self) -> None:
57
+ self._by_loc: Dict[Tuple[str, int], Any] = {}
58
+ self._by_name: Dict[Tuple[str, str], List[Any]] = {}
59
+
60
+ @staticmethod
61
+ def _abs(path: str) -> str:
62
+ try:
63
+ return str(Path(path).resolve())
64
+ except (OSError, RuntimeError):
65
+ return path
66
+
67
+ @classmethod
68
+ def from_symbol_table(
69
+ cls, symbol_table: Dict[str, PyModule]
70
+ ) -> "_CallableResolver":
71
+ resolver = cls()
72
+ for c in iter_callables_in_symbol_table(symbol_table):
73
+ abs_path = cls._abs(c.path)
74
+ resolver._by_loc[(abs_path, c.start_line)] = c
75
+ resolver._by_name.setdefault((abs_path, c.name), []).append(c)
76
+ return resolver
77
+
78
+ def resolve(
79
+ self, file: str, start_line: int, name: str, arity: int
80
+ ) -> Any:
81
+ exact = self._by_loc.get((file, start_line))
82
+ if exact is not None:
83
+ return exact
84
+ if not name:
85
+ return None
86
+ candidates = self._by_name.get((file, name))
87
+ if not candidates:
88
+ return None
89
+ if len(candidates) == 1:
90
+ return candidates[0]
91
+ arity_matched = [c for c in candidates if len(c.parameters) == arity]
92
+ pool = arity_matched or candidates
93
+ return min(pool, key=lambda c: abs(c.start_line - start_line))
94
+
95
+
35
96
  class CodeQL:
36
97
  """A class for building the application view of a Python application using CodeQL.
37
98
 
@@ -99,9 +160,14 @@ class CodeQL:
99
160
  # codeql/python-all 7.x — it returns the ``CallNode`` (CFG)
100
161
  # whose target was resolved to that ``Value``. Cleaner than
101
162
  # poking at ``pointsTo`` directly.
102
- "from CallNode call, Function caller, FunctionValue calleeVal",
163
+ # ``callee`` is bound to the FunctionValue's scope so the
164
+ # endpoint emits the same Function-level facts (name, arity,
165
+ # location) the post-processor needs for the name+arity
166
+ # fallback when the (file, start_line) join misses.
167
+ "from CallNode call, Function caller, FunctionValue calleeVal, Function callee",
103
168
  "where",
104
169
  " call.getScope() = caller and",
170
+ " callee = calleeVal.getScope() and",
105
171
  " (",
106
172
  # Direct function / bound-method call: foo() or obj.foo()
107
173
  " call = calleeVal.getACall()",
@@ -115,15 +181,20 @@ class CodeQL:
115
181
  " )",
116
182
  " )",
117
183
  "select",
118
- # --- Caller endpoint --- (joins to PyCallable via file + start_line)
184
+ # --- Caller endpoint --- (joins to PyCallable: exact by
185
+ # (file, start_line), else by (file, name) + arity)
119
186
  " caller.getLocation().getFile().getAbsolutePath(),",
120
187
  " caller.getLocation().getStartLine(),",
121
188
  " caller.getQualifiedName(),",
189
+ " caller.getName(),",
190
+ " count(caller.getArg(_)),",
122
191
  # --- Callee endpoint --- (file/line may live in a library stub;
123
192
  # post-processor classifies as in-source or ghost)
124
- " calleeVal.getScope().getLocation().getFile().getAbsolutePath(),",
125
- " calleeVal.getScope().getLocation().getStartLine(),",
193
+ " callee.getLocation().getFile().getAbsolutePath(),",
194
+ " callee.getLocation().getStartLine(),",
126
195
  " calleeVal.getQualifiedName(),",
196
+ " callee.getName(),",
197
+ " count(callee.getArg(_)),",
127
198
  # --- Call-site location --- (for PyCallsite augmentation)
128
199
  " call.getLocation().getStartLine(),",
129
200
  " call.getLocation().getStartColumn(),",
@@ -149,9 +220,13 @@ class CodeQL:
149
220
  "caller_file",
150
221
  "caller_start_line",
151
222
  "caller_qname",
223
+ "caller_name",
224
+ "caller_arity",
152
225
  "callee_file",
153
226
  "callee_start_line",
154
227
  "callee_qname",
228
+ "callee_name",
229
+ "callee_arity",
155
230
  "call_start_line",
156
231
  "call_start_column",
157
232
  "call_end_line",
@@ -162,24 +237,15 @@ class CodeQL:
162
237
  return df
163
238
 
164
239
  @staticmethod
165
- def _build_callable_location_index(
240
+ def _build_callable_resolver(
166
241
  symbol_table: Dict[str, PyModule],
167
- ) -> Dict[Tuple[str, int], "PyCallable"]:
168
- """Build ``(absolute_file_path, start_line) -> PyCallable`` from Jedi.
242
+ ) -> _CallableResolver:
243
+ """Build the endpoint -> ``PyCallable`` resolver from Jedi.
169
244
 
170
245
  Paths are resolved so they match CodeQL's ``getAbsolutePath()``
171
246
  regardless of symlinks or the current working directory.
172
247
  """
173
- from codeanalyzer.schema.py_schema import PyCallable # local to avoid cycle
174
-
175
- index: Dict[Tuple[str, int], PyCallable] = {}
176
- for c in iter_callables_in_symbol_table(symbol_table):
177
- try:
178
- abs_path = str(Path(c.path).resolve())
179
- except (OSError, RuntimeError):
180
- abs_path = c.path
181
- index[(abs_path, c.start_line)] = c
182
- return index
248
+ return _CallableResolver.from_symbol_table(symbol_table)
183
249
 
184
250
  def _iter_resolved_rows(
185
251
  self, symbol_table: Dict[str, PyModule]
@@ -194,19 +260,27 @@ class CodeQL:
194
260
  df = self._query_call_edges()
195
261
  if df.empty:
196
262
  return
197
- location_index = self._build_callable_location_index(symbol_table)
263
+ resolver = self._build_callable_resolver(symbol_table)
198
264
 
199
265
  skipped_unknown_caller = 0
200
266
  ghost_callees = 0
201
267
  for row in df.itertuples(index=False):
202
- caller_key = (row.caller_file, int(row.caller_start_line))
203
- caller = location_index.get(caller_key)
268
+ caller = resolver.resolve(
269
+ row.caller_file,
270
+ int(row.caller_start_line),
271
+ row.caller_name,
272
+ int(row.caller_arity),
273
+ )
204
274
  if caller is None:
205
275
  skipped_unknown_caller += 1
206
276
  continue
207
277
 
208
- callee_key = (row.callee_file, int(row.callee_start_line))
209
- callee = location_index.get(callee_key)
278
+ callee = resolver.resolve(
279
+ row.callee_file,
280
+ int(row.callee_start_line),
281
+ row.callee_name,
282
+ int(row.callee_arity),
283
+ )
210
284
  if callee is not None:
211
285
  target_sig = callee.signature
212
286
  else:
@@ -267,20 +341,28 @@ class CodeQL:
267
341
  Returns:
268
342
  Number of ``PyCallsite`` entries augmented.
269
343
  """
270
- location_index = self._build_callable_location_index(symbol_table)
344
+ resolver = self._build_callable_resolver(symbol_table)
271
345
  df = self._query_call_edges()
272
346
  if df.empty:
273
347
  return 0
274
348
 
275
349
  augmented = 0
276
350
  for row in df.itertuples(index=False):
277
- caller_key = (row.caller_file, int(row.caller_start_line))
278
- caller = location_index.get(caller_key)
351
+ caller = resolver.resolve(
352
+ row.caller_file,
353
+ int(row.caller_start_line),
354
+ row.caller_name,
355
+ int(row.caller_arity),
356
+ )
279
357
  if caller is None:
280
358
  continue
281
359
 
282
- callee_key = (row.callee_file, int(row.callee_start_line))
283
- callee = location_index.get(callee_key)
360
+ callee = resolver.resolve(
361
+ row.callee_file,
362
+ int(row.callee_start_line),
363
+ row.callee_name,
364
+ int(row.callee_arity),
365
+ )
284
366
  resolved_sig = callee.signature if callee is not None else row.callee_qname
285
367
 
286
368
  call_start = int(row.call_start_line)