codeanalyzer-python 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeanalyzer/__main__.py +99 -11
- codeanalyzer/core.py +154 -19
- codeanalyzer/neo4j/__init__.py +46 -0
- codeanalyzer/neo4j/bolt.py +223 -0
- codeanalyzer/neo4j/catalog.py +245 -0
- codeanalyzer/neo4j/cypher.py +138 -0
- codeanalyzer/neo4j/emit.py +74 -0
- codeanalyzer/neo4j/project.py +322 -0
- codeanalyzer/neo4j/rows.py +176 -0
- codeanalyzer/neo4j/schema.py +39 -0
- codeanalyzer/options/__init__.py +2 -2
- codeanalyzer/options/options.py +20 -1
- codeanalyzer/schema/py_schema.py +20 -0
- codeanalyzer/semantic_analysis/call_graph.py +266 -0
- codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +318 -69
- codeanalyzer/semantic_analysis/codeql/codeql_loader.py +32 -4
- codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +51 -31
- codeanalyzer/syntactic_analysis/symbol_table_builder.py +87 -4
- codeanalyzer_python-0.2.0.dist-info/METADATA +393 -0
- codeanalyzer_python-0.2.0.dist-info/RECORD +39 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/WHEEL +1 -1
- codeanalyzer_python-0.2.0.dist-info/entry_points.txt +3 -0
- codeanalyzer/semantic_analysis/wala/__init__.py +0 -15
- codeanalyzer_python-0.1.13.dist-info/METADATA +0 -414
- codeanalyzer_python-0.1.13.dist-info/RECORD +0 -31
- codeanalyzer_python-0.1.13.dist-info/entry_points.txt +0 -2
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {codeanalyzer_python-0.1.13.dist-info → codeanalyzer_python-0.2.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -4,7 +4,7 @@ import tokenize
|
|
|
4
4
|
from ast import AST, ClassDef
|
|
5
5
|
from io import StringIO
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Dict, List, Optional, Union
|
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
9
|
import jedi
|
|
10
10
|
from jedi.api import Script
|
|
@@ -71,6 +71,32 @@ class SymbolTableBuilder:
|
|
|
71
71
|
pass
|
|
72
72
|
return None
|
|
73
73
|
|
|
74
|
+
@staticmethod
def _infer_callee(
    script: Script, line: int, column: int
) -> Tuple[Optional[str], bool]:
    """Resolve the callee at a call expression to ``(qualified_name, is_class)``.

    ``script.infer`` is queried at ``(line, column)`` and only the first
    definition it returns is considered.

    If that definition is a class (a constructor call such as ``A()``) and
    it has a full name, ``.__init__`` is appended to the name so the result
    matches the constructor's ``PyCallable`` entry in the symbol table;
    classes themselves are not ``PyCallable``s, so the bare class name
    would otherwise show up as a ghost node in the call graph.

    Returns:
        ``(None, False)`` when nothing is inferred or when inference raises;
        otherwise the (possibly rewritten) full name together with a flag
        telling whether the callee resolved to a class.
    """
    unresolved = (None, False)
    try:
        candidates = script.infer(line=line, column=column)
        if not candidates:
            return unresolved

        first = candidates[0]
        resolves_to_class = first.type == "class"
        qualified = first.full_name
        if resolves_to_class and qualified:
            # Constructor call: point at the class's __init__ callable.
            return f"{qualified}.__init__", True
        return qualified, resolves_to_class
    except Exception:
        # Best-effort inference: any failure is reported as "unresolved".
        return unresolved
|
|
99
|
+
|
|
74
100
|
def build_pymodule_from_file(self, py_file: Path) -> PyModule:
|
|
75
101
|
"""Builds a PyModule from a Python file.
|
|
76
102
|
|
|
@@ -485,6 +511,63 @@ class SymbolTableBuilder:
|
|
|
485
511
|
symbols.append(symbol)
|
|
486
512
|
return symbols
|
|
487
513
|
|
|
514
|
+
@staticmethod
def _iter_calls_in_scope(fn_node: ast.AST):
    """Yield the ``ast.Call`` nodes that belong to ``fn_node``'s own scope.

    Only the statements in ``fn_node.body`` are traversed. A plain
    ``ast.walk`` would also descend into the bodies of nested
    ``FunctionDef`` / ``AsyncFunctionDef`` / ``ClassDef`` nodes and
    attribute their calls to the outer function, so those bodies are
    skipped here.

    The parts of a nested definition that sit outside its body are still
    attributed to ``fn_node``:

    * nested functions: decorators, positional and keyword-only defaults,
      parameter annotations (including ``*args`` / ``**kwargs``) and the
      return annotation;
    * nested classes: decorators, base classes and class keyword
      arguments (e.g. ``metaclass=...``).

    Lambdas, comprehensions and conditional expressions are ordinary
    child nodes for this walker, so calls inside them are yielded too.

    ``fn_node``'s own decorators, defaults and annotations are NOT
    yielded: they are evaluated in the scope enclosing ``fn_node`` and so
    belong to its parent.

    Args:
        fn_node: The node whose ``body`` statements are scanned. A node
            without a ``body`` attribute yields nothing.

    Yields:
        ``ast.Call`` nodes in pre-order (an outer call before the calls
        nested in its arguments).
    """

    def signature_exprs(func):
        """Yield the expressions of a nested def that live outside its body."""
        yield from func.decorator_list
        spec = func.args
        yield from spec.defaults
        # kw_defaults holds None for keyword-only params without a default.
        yield from (d for d in spec.kw_defaults if d is not None)
        # Parameter annotations are handled like the return annotation.
        params = [*spec.posonlyargs, *spec.args]
        if spec.vararg is not None:
            params.append(spec.vararg)
        params.extend(spec.kwonlyargs)
        if spec.kwarg is not None:
            params.append(spec.kwarg)
        for param in params:
            if param.annotation is not None:
                yield param.annotation
        if func.returns is not None:
            yield func.returns

    def class_header_exprs(cls):
        """Yield the expressions of a nested class that live outside its body."""
        yield from cls.decorator_list
        yield from cls.bases
        for kw in cls.keywords:
            yield kw.value

    def walk(node: ast.AST):
        if isinstance(node, ast.Call):
            # No return here: the callee expression and the arguments may
            # themselves contain calls, reached by the generic descent below.
            yield node

        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            for expr in signature_exprs(node):
                yield from walk(expr)
            return  # the body belongs to the nested function

        if isinstance(node, ast.ClassDef):
            for expr in class_header_exprs(node):
                yield from walk(expr)
            return  # the body belongs to the nested class

        for child in ast.iter_child_nodes(node):
            yield from walk(child)

    for stmt in getattr(fn_node, "body", []):
        yield from walk(stmt)
|
|
570
|
+
|
|
488
571
|
def _call_sites(self, fn_node: ast.FunctionDef, script: Script) -> List[PyCallsite]:
|
|
489
572
|
"""
|
|
490
573
|
Finds all call sites made from within the function using Jedi for type inference.
|
|
@@ -498,14 +581,14 @@ class SymbolTableBuilder:
|
|
|
498
581
|
"""
|
|
499
582
|
call_sites: List[PyCallsite] = []
|
|
500
583
|
|
|
501
|
-
for node in
|
|
584
|
+
for node in self._iter_calls_in_scope(fn_node):
|
|
502
585
|
if not isinstance(node, ast.Call):
|
|
503
586
|
continue
|
|
504
587
|
|
|
505
588
|
func_expr = node.func
|
|
506
589
|
|
|
507
590
|
method_name = "<unknown>"
|
|
508
|
-
callee_signature = self.
|
|
591
|
+
callee_signature, is_constructor = self._infer_callee(
|
|
509
592
|
script, node.lineno, node.col_offset
|
|
510
593
|
)
|
|
511
594
|
return_type = self._infer_type(script, node.lineno, node.col_offset)
|
|
@@ -535,7 +618,7 @@ class SymbolTableBuilder:
|
|
|
535
618
|
.argument_types(argument_types)
|
|
536
619
|
.return_type(return_type)
|
|
537
620
|
.callee_signature(callee_signature)
|
|
538
|
-
.is_constructor_call(
|
|
621
|
+
.is_constructor_call(is_constructor)
|
|
539
622
|
.start_line(getattr(node, "lineno", -1))
|
|
540
623
|
.start_column(getattr(node, "col_offset", -1))
|
|
541
624
|
.end_line(getattr(node, "end_lineno", -1))
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codeanalyzer-python
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Static Analysis on Python source code using Jedi, CodeQL and Treesitter — emits analysis.json or a Neo4j property graph.
|
|
5
|
+
Author-email: Rahul Krishna <i.m.ralk@gmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
License-File: NOTICE
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Requires-Dist: jedi<0.20.0,>=0.18.0; python_version < '3.11'
|
|
10
|
+
Requires-Dist: jedi<=0.19.2; python_version >= '3.11'
|
|
11
|
+
Requires-Dist: msgpack<1.0.7,>=1.0.0; python_version < '3.11'
|
|
12
|
+
Requires-Dist: msgpack<2.0.0,>=1.0.7; python_version >= '3.11'
|
|
13
|
+
Requires-Dist: networkx<3.2.0,>=2.6.0; python_version < '3.11'
|
|
14
|
+
Requires-Dist: networkx<4.0.0,>=3.0.0; python_version >= '3.11'
|
|
15
|
+
Requires-Dist: numpy<1.24.0,>=1.21.0; python_version < '3.11'
|
|
16
|
+
Requires-Dist: numpy<2.0.0,>=1.24.0; python_version >= '3.11' and python_version < '3.12'
|
|
17
|
+
Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= '3.12'
|
|
18
|
+
Requires-Dist: packaging>=25.0
|
|
19
|
+
Requires-Dist: pandas<2.0.0,>=1.3.0; python_version < '3.11'
|
|
20
|
+
Requires-Dist: pandas<3.0.0,>=2.0.0; python_version >= '3.11'
|
|
21
|
+
Requires-Dist: pydantic<2.0.0,>=1.8.0; python_version < '3.11'
|
|
22
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0; python_version >= '3.11'
|
|
23
|
+
Requires-Dist: ray<3.0.0,>=2.10.0; python_version >= '3.11'
|
|
24
|
+
Requires-Dist: ray==2.0.0; python_version < '3.11'
|
|
25
|
+
Requires-Dist: requests<3.0.0,>=2.20.0; python_version >= '3.11'
|
|
26
|
+
Requires-Dist: rich<14.0.0,>=12.6.0; python_version < '3.11'
|
|
27
|
+
Requires-Dist: rich<15.0.0,>=14.0.0; python_version >= '3.11'
|
|
28
|
+
Requires-Dist: typer<1.0.0,>=0.9.0; python_version < '3.11'
|
|
29
|
+
Requires-Dist: typer<2.0.0,>=0.9.0; python_version >= '3.11'
|
|
30
|
+
Requires-Dist: typing-extensions<5.0.0,>=4.0.0; python_version < '3.11'
|
|
31
|
+
Requires-Dist: typing-extensions<6.0.0,>=4.5.0; python_version >= '3.11'
|
|
32
|
+
Provides-Extra: neo4j
|
|
33
|
+
Requires-Dist: neo4j<6.0.0,>=5.0.0; extra == 'neo4j'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
<div align="center">
|
|
37
|
+
|
|
38
|
+
<img src="https://github.com/codellm-devkit/codeanalyzer-python/blob/main/docs/assets/logo.png?raw=true" alt="CodeLLM-DevKit" />
|
|
39
|
+
|
|
40
|
+
# codeanalyzer-python (`canpy`)
|
|
41
|
+
|
|
42
|
+
**A Python static-analysis toolkit — the CLDK backend that emits a canonical symbol table and call graph, as `analysis.json` or a Neo4j property graph.**
|
|
43
|
+
|
|
44
|
+
[PyPI package](https://pypi.org/project/codeanalyzer-python/)
|
|
45
|
+
[](https://pypi.org/project/codeanalyzer-python/)
|
|
46
|
+
[Release workflow](https://github.com/codellm-devkit/codeanalyzer-python/actions/workflows/release.yml)
|
|
47
|
+
[License](./LICENSE)
|
|
48
|
+
|
|
49
|
+
</div>
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
`canpy` is a static analyzer for Python built on [Jedi](https://jedi.readthedocs.io/), with optional
|
|
54
|
+
[CodeQL](https://codeql.github.com/)-resolved call edges and
|
|
55
|
+
[Tree-sitter](https://tree-sitter.github.io/) parsing. It produces the canonical CodeLLM-DevKit
|
|
56
|
+
(CLDK) `analysis.json` — a symbol table plus a call graph — and can project that same analysis into a
|
|
57
|
+
**Neo4j property graph**. It is the Python backend behind
|
|
58
|
+
[CLDK](https://github.com/codellm-devkit/python-sdk), mirroring its
|
|
59
|
+
[TypeScript](https://github.com/codellm-devkit/codeanalyzer-typescript) (`cants`) and
|
|
60
|
+
[Java](https://github.com/codellm-devkit/codeanalyzer-java) siblings.
|
|
61
|
+
|
|
62
|
+
Every run produces a symbol table **and** a call graph. Edges come from Jedi's lexical resolution by
|
|
63
|
+
default; `--codeql` resolves additional edges (RPC / third-party / dynamically-dispatched targets)
|
|
64
|
+
and merges them with the Jedi-derived edges, also backfilling callees Jedi could not resolve.
|
|
65
|
+
|
|
66
|
+
## Table of Contents
|
|
67
|
+
|
|
68
|
+
- [Features](#features)
|
|
69
|
+
- [Installation](#installation)
|
|
70
|
+
- [Prerequisites](#prerequisites)
|
|
71
|
+
- [Install via pip (PyPI)](#install-via-pip-pypi)
|
|
72
|
+
- [Install via shell script](#install-via-shell-script)
|
|
73
|
+
- [Build from source](#build-from-source)
|
|
74
|
+
- [Usage](#usage)
|
|
75
|
+
- [Options](#options)
|
|
76
|
+
- [Examples](#examples)
|
|
77
|
+
- [Output targets](#output-targets)
|
|
78
|
+
- [`analysis.json` (default)](#analysisjson-default)
|
|
79
|
+
- [Neo4j graph](#neo4j-graph)
|
|
80
|
+
- [Schema contract](#schema-contract)
|
|
81
|
+
- [Development](#development)
|
|
82
|
+
- [License](#license)
|
|
83
|
+
|
|
84
|
+
## Features
|
|
85
|
+
|
|
86
|
+
- **Symbol table** — modules, classes, functions, methods, variables, decorators, imports, and
|
|
87
|
+
docstrings, with precise source spans.
|
|
88
|
+
- **Call graph** — Jedi's lexical resolver by default, with optional **CodeQL**-resolved edges
|
|
89
|
+
(`--codeql`) for RPC / third-party / dynamically-dispatched targets, merged with the Jedi edges;
|
|
90
|
+
CodeQL also backfills callees Jedi could not resolve.
|
|
91
|
+
- **Neo4j output** — project the analysis into a labeled property graph: a self-contained
|
|
92
|
+
`graph.cypher` snapshot, or an **incremental** push to a live database over Bolt.
|
|
93
|
+
- **Versioned schema** — a machine-readable, version-stamped Neo4j schema contract (`--emit schema`),
|
|
94
|
+
checked in as `schema.neo4j.json` and shipped with every release.
|
|
95
|
+
- **Incremental cache** — per-file results are cached under `.codeanalyzer`; `--lazy` (default)
|
|
96
|
+
reuses them, `--eager` forces a clean rebuild. `--ray` distributes the work across cores.
|
|
97
|
+
- **Compact output** — canonical `analysis.json`, or binary `analysis.msgpack` for smaller artifacts.
|
|
98
|
+
|
|
99
|
+
## Installation
|
|
100
|
+
|
|
101
|
+
### Prerequisites
|
|
102
|
+
|
|
103
|
+
- **Python 3.10 or newer.**
|
|
104
|
+
- A C toolchain, Python's `venv` module, and the Python development headers — the analyzer builds an isolated virtual
|
|
105
|
+
environment per project (via Python's `venv`) so Jedi can resolve types and imports:
|
|
106
|
+
|
|
107
|
+
```sh
|
|
108
|
+
# Ubuntu / Debian
|
|
109
|
+
sudo apt install python3-venv python3-dev build-essential
|
|
110
|
+
|
|
111
|
+
# Fedora / RHEL / CentOS
|
|
112
|
+
sudo dnf group install "Development Tools" && sudo dnf install python3-venv python3-devel
|
|
113
|
+
|
|
114
|
+
# macOS
|
|
115
|
+
xcode-select --install
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Install via pip (PyPI)
|
|
119
|
+
|
|
120
|
+
```sh
|
|
121
|
+
pip install codeanalyzer-python
|
|
122
|
+
canpy --help
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
For the optional **live Neo4j push** (`--emit neo4j --neo4j-uri …`), install the `neo4j` extra:
|
|
126
|
+
|
|
127
|
+
```sh
|
|
128
|
+
pip install 'codeanalyzer-python[neo4j]'
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Install via shell script
|
|
132
|
+
|
|
133
|
+
Install the CLI as an isolated tool with the one-line installer (provisions via uv / pipx / pip):
|
|
134
|
+
|
|
135
|
+
```sh
|
|
136
|
+
curl --proto '=https' --tlsv1.2 -LsSf https://github.com/codellm-devkit/codeanalyzer-python/releases/latest/download/canpy-installer.sh | sh
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Build from source
|
|
140
|
+
|
|
141
|
+
This project uses [uv](https://docs.astral.sh/uv/) for dependency management.
|
|
142
|
+
|
|
143
|
+
```sh
|
|
144
|
+
git clone https://github.com/codellm-devkit/codeanalyzer-python
|
|
145
|
+
cd codeanalyzer-python
|
|
146
|
+
uv sync --all-groups
|
|
147
|
+
uv run canpy --help
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Usage
|
|
151
|
+
|
|
152
|
+
```sh
|
|
153
|
+
canpy --input /path/to/python/project
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
With no `--output`, the analysis is printed to stdout as compact JSON; with `--output <dir>` it is
|
|
157
|
+
written to `analysis.json` (or `graph.cypher` for `--emit neo4j`, or `analysis.msgpack` with
|
|
158
|
+
`--format msgpack`) in that directory.
|
|
159
|
+
|
|
160
|
+
### Options
|
|
161
|
+
|
|
162
|
+
<!-- BEGIN canpy-help -->
|
|
163
|
+
|
|
164
|
+
```text
|
|
165
|
+
$ canpy --help
|
|
166
|
+
|
|
167
|
+
Usage: canpy [OPTIONS] COMMAND [ARGS]...
|
|
168
|
+
|
|
169
|
+
Static Analysis on Python source code using Jedi, CodeQL and Tree sitter.
|
|
170
|
+
|
|
171
|
+
╭─ Options ────────────────────────────────────────────────────────────────────╮
|
|
172
|
+
│ --input -i PATH Path to the │
|
|
173
|
+
│ project root │
|
|
174
|
+
│ directory (not │
|
|
175
|
+
│ required for │
|
|
176
|
+
│ --emit schema). │
|
|
177
|
+
│ --output -o PATH Output directory │
|
|
178
|
+
│ for artifacts. │
|
|
179
|
+
│ --format -f [json|msgpack] Output format for │
|
|
180
|
+
│ --emit json: json │
|
|
181
|
+
│ or msgpack. │
|
|
182
|
+
│ [default: json] │
|
|
183
|
+
│ --emit [json|neo4j|sche Output target: │
|
|
184
|
+
│ ma] json │
|
|
185
|
+
│ (analysis.json, │
|
|
186
|
+
│ default) | neo4j │
|
|
187
|
+
│ (graph.cypher or │
|
|
188
|
+
│ live Bolt push) | │
|
|
189
|
+
│ schema (the Neo4j │
|
|
190
|
+
│ schema.json │
|
|
191
|
+
│ contract). │
|
|
192
|
+
│ [default: json] │
|
|
193
|
+
│ --app-name TEXT Logical │
|
|
194
|
+
│ application name │
|
|
195
|
+
│ for the graph │
|
|
196
|
+
│ :PyApplication │
|
|
197
|
+
│ anchor (default: │
|
|
198
|
+
│ input dir name). │
|
|
199
|
+
│ --neo4j-uri TEXT Push the graph to │
|
|
200
|
+
│ a live Neo4j over │
|
|
201
|
+
│ Bolt │
|
|
202
|
+
│ (incremental); │
|
|
203
|
+
│ omit to write │
|
|
204
|
+
│ graph.cypher. │
|
|
205
|
+
│ [env var: │
|
|
206
|
+
│ NEO4J_URI] │
|
|
207
|
+
│ --neo4j-user TEXT Neo4j username. │
|
|
208
|
+
│ [env var: │
|
|
209
|
+
│ NEO4J_USERNAME] │
|
|
210
|
+
│ [default: neo4j] │
|
|
211
|
+
│ --neo4j-password TEXT Neo4j password. │
|
|
212
|
+
│ Prefer the env │
|
|
213
|
+
│ var over the flag │
|
|
214
|
+
│ (the flag is │
|
|
215
|
+
│ visible in shell │
|
|
216
|
+
│ history / process │
|
|
217
|
+
│ list). │
|
|
218
|
+
│ [env var: │
|
|
219
|
+
│ NEO4J_PASSWORD] │
|
|
220
|
+
│ [default: neo4j] │
|
|
221
|
+
│ --neo4j-database TEXT Neo4j database │
|
|
222
|
+
│ name (default: │
|
|
223
|
+
│ server default). │
|
|
224
|
+
│ [env var: │
|
|
225
|
+
│ NEO4J_DATABASE] │
|
|
226
|
+
│ --codeql --no-codeql Enable │
|
|
227
|
+
│ CodeQL-based │
|
|
228
|
+
│ analysis. │
|
|
229
|
+
│ [default: │
|
|
230
|
+
│ no-codeql] │
|
|
231
|
+
│ --ray --no-ray Enable Ray for │
|
|
232
|
+
│ distributed │
|
|
233
|
+
│ analysis. │
|
|
234
|
+
│ [default: no-ray] │
|
|
235
|
+
│ --eager --lazy Enable eager or │
|
|
236
|
+
│ lazy analysis. │
|
|
237
|
+
│ Defaults to lazy. │
|
|
238
|
+
│ [default: lazy] │
|
|
239
|
+
│ --skip-tests --include-tests Skip test files │
|
|
240
|
+
│ in analysis. │
|
|
241
|
+
│ [default: │
|
|
242
|
+
│ skip-tests] │
|
|
243
|
+
│ --file-name PATH Analyze only the │
|
|
244
|
+
│ specified file │
|
|
245
|
+
│ (relative to │
|
|
246
|
+
│ input directory). │
|
|
247
|
+
│ --cache-dir -c PATH Directory to │
|
|
248
|
+
│ store analysis │
|
|
249
|
+
│ cache. Defaults │
|
|
250
|
+
│ to │
|
|
251
|
+
│ '.codeanalyzer' │
|
|
252
|
+
│ in the input │
|
|
253
|
+
│ directory. │
|
|
254
|
+
│ --clear-cache --keep-cache Clear cache after │
|
|
255
|
+
│ analysis. By │
|
|
256
|
+
│ default, cache is │
|
|
257
|
+
│ retained. │
|
|
258
|
+
│ [default: │
|
|
259
|
+
│ keep-cache] │
|
|
260
|
+
│ -v INTEGER Increase │
|
|
261
|
+
│ verbosity: -v, │
|
|
262
|
+
│ -vv, -vvv │
|
|
263
|
+
│ [default: 0] │
|
|
264
|
+
│ --help Show this message │
|
|
265
|
+
│ and exit. │
|
|
266
|
+
╰──────────────────────────────────────────────────────────────────────────────╯
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
<!-- END canpy-help -->
|
|
270
|
+
|
|
271
|
+
### Examples
|
|
272
|
+
|
|
273
|
+
1. **Basic analysis to stdout, or to a file:**
|
|
274
|
+
```sh
|
|
275
|
+
canpy --input ./my-python-project # compact JSON on stdout
|
|
276
|
+
canpy --input ./my-python-project --output ./out # → ./out/analysis.json
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
2. **Binary output (msgpack):**
|
|
280
|
+
```sh
|
|
281
|
+
canpy --input ./my-python-project --output ./out --format msgpack # → ./out/analysis.msgpack
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
3. **Resolve extra call edges with CodeQL:**
|
|
285
|
+
```sh
|
|
286
|
+
canpy --input ./my-python-project --codeql
|
|
287
|
+
```
|
|
288
|
+
By default, edges come from Jedi's lexical analysis. Adding `--codeql` resolves additional edges
|
|
289
|
+
(including RPC / third-party / dynamically-dispatched targets) and merges them with the
|
|
290
|
+
Jedi-derived edges; CodeQL also backfills callees Jedi could not resolve. CodeQL
|
|
291
|
+
integration is experimental; the CLI is downloaded into `<cache_dir>/codeql/` on first use.
|
|
292
|
+
|
|
293
|
+
4. **Emit a Neo4j snapshot, or push to a live database:**
|
|
294
|
+
```sh
|
|
295
|
+
canpy --input ./my-python-project --emit neo4j --output ./out # → ./out/graph.cypher
|
|
296
|
+
canpy --input ./my-python-project --emit neo4j \
|
|
297
|
+
--neo4j-uri bolt://localhost:7687 --neo4j-user neo4j --neo4j-password secret
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
5. **Emit the Neo4j schema contract:**
|
|
301
|
+
```sh
|
|
302
|
+
canpy --emit schema # print schema.json to stdout (no project needed)
|
|
303
|
+
canpy --emit schema --output ./out # → ./out/schema.json
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
6. **Force a clean rebuild with a custom cache directory:**
|
|
307
|
+
```sh
|
|
308
|
+
canpy --input ./my-python-project --eager --cache-dir /path/to/custom-cache
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## Output targets
|
|
312
|
+
|
|
313
|
+
`canpy` builds one analysis in memory and can emit it three ways (`--emit`):
|
|
314
|
+
|
|
315
|
+
### `analysis.json` (default)
|
|
316
|
+
|
|
317
|
+
A `PyApplication` document — the canonical CLDK contract:
|
|
318
|
+
|
|
319
|
+
```jsonc
|
|
320
|
+
{
|
|
321
|
+
"symbol_table": { /* file path → module (classes, functions, variables, imports, …) */ },
|
|
322
|
+
"call_graph": [ /* CALL_DEP edges: { source, target, weight, provenance } keyed by callable signature */ ]
|
|
323
|
+
}
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
By default this is printed to stdout in JSON; with `--output` it is written to `analysis.json` (or
|
|
327
|
+
`analysis.msgpack` with `--format msgpack`, a more compact binary format).
|
|
328
|
+
|
|
329
|
+
### Neo4j graph
|
|
330
|
+
|
|
331
|
+
`--emit neo4j` projects the same analysis into a labeled property graph. Every node label is
|
|
332
|
+
`Py`-prefixed and every relationship type is `PY_`-prefixed (e.g. `:PyClass`, `PY_CALLS`) so multiple
|
|
333
|
+
language analyzers can share one database without label or relationship-type collisions. Declarations
|
|
334
|
+
are keyed by their signature under a shared `:PySymbol` label; calls, imports, inheritance,
|
|
335
|
+
decorators, and call sites are relationships:
|
|
336
|
+
|
|
337
|
+
- **Without `--neo4j-uri`** — writes a self-contained `graph.cypher` (constraints + indexes, a scoped
|
|
338
|
+
wipe, then batched `MERGE`s). Load it with `cypher-shell < graph.cypher`. Needs no extra
|
|
339
|
+
dependencies.
|
|
340
|
+
- **With `--neo4j-uri`** — pushes to a live Neo4j over Bolt **incrementally**: only modules whose
|
|
341
|
+
content hash changed are rewritten, and on a full run modules whose source file vanished are
|
|
342
|
+
pruned. Requires the `neo4j` extra. Every graph carries a `schema_version` on its `:PyApplication`
|
|
343
|
+
node.
|
|
344
|
+
|
|
345
|
+
Call-graph endpoints that aren't present in the symbol table (third-party / framework / RPC targets)
|
|
346
|
+
are materialized as `:PyExternal` ghost nodes, mirroring the analyzer's own ghost-node behaviour.
|
|
347
|
+
|
|
348
|
+
The connection options also read from the standard Neo4j environment variables — `NEO4J_URI`,
|
|
349
|
+
`NEO4J_USERNAME`, `NEO4J_PASSWORD`, `NEO4J_DATABASE` — when the corresponding flag is omitted (an
|
|
350
|
+
explicit flag wins). Prefer the env var for the password so it doesn't land in shell history or the
|
|
351
|
+
process list:
|
|
352
|
+
|
|
353
|
+
```sh
|
|
354
|
+
export NEO4J_URI=bolt://localhost:7687
|
|
355
|
+
export NEO4J_PASSWORD=secret
|
|
356
|
+
canpy -i ./my-project --emit neo4j # credentials picked up from the environment
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
### Schema contract
|
|
360
|
+
|
|
361
|
+
`--emit schema` writes the machine-readable, version-stamped Neo4j schema (`schema.json`: node labels,
|
|
362
|
+
relationships, properties, constraints, and indexes). The command needs no project; the schema is also checked into the repo
|
|
363
|
+
as `schema.neo4j.json` and bundled in every release as a GitHub Release asset, so a consumer can
|
|
364
|
+
validate producer/consumer compatibility without invoking the tool. The shape of the contract matches
|
|
365
|
+
the [`codeanalyzer-typescript`](https://github.com/codellm-devkit/codeanalyzer-typescript) backend.
|
|
366
|
+
|
|
367
|
+
A UML of the `analysis.json` schema (the `PyApplication` containment tree) is checked in as
|
|
368
|
+
[`schema-uml.drawio`](./schema-uml.drawio), and the property-graph schema as
|
|
369
|
+
[`neo4j-schema.drawio`](./neo4j-schema.drawio).
|
|
370
|
+
|
|
371
|
+
## Development
|
|
372
|
+
|
|
373
|
+
This project uses [uv](https://docs.astral.sh/uv/).
|
|
374
|
+
|
|
375
|
+
```sh
|
|
376
|
+
uv sync --all-groups
|
|
377
|
+
uv run canpy --input /path/to/project # run from source
|
|
378
|
+
uv run canpy --emit schema > schema.neo4j.json # regenerate the checked-in schema contract
|
|
379
|
+
uv run python scripts/update_readme.py # regenerate the canpy --help block above
|
|
380
|
+
uv run pytest # run the test suite
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
The Neo4j schema-conformance test always runs. The Neo4j **bolt** integration test spins up a real
|
|
384
|
+
Neo4j via [Testcontainers](https://testcontainers.com/) and is **opt-in** — it needs a container
|
|
385
|
+
runtime (Docker or Podman) and is enabled with an environment variable:
|
|
386
|
+
|
|
387
|
+
```sh
|
|
388
|
+
RUN_CONTAINER_TESTS=1 uv run pytest test/test_neo4j_bolt.py -s
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
## License
|
|
392
|
+
|
|
393
|
+
Apache 2.0 — see [LICENSE](./LICENSE).
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
codeanalyzer/__init__.py,sha256=BZ3Kuwl-F_F-8H8cepLnVJ4Ku4NNUjjqg0Y6ujPQSsI,108
|
|
2
|
+
codeanalyzer/__main__.py,sha256=xCTzVoSKE-TdFlUoe9qSEZVeeyac_GctVu5p-WiHnIU,8148
|
|
3
|
+
codeanalyzer/core.py,sha256=_kndexmk7S0DswwvinevARiF4_bB7oMz2BRJBnBCp-w,30605
|
|
4
|
+
codeanalyzer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
codeanalyzer/config/__init__.py,sha256=9XBxAn1oWGRuhg3bEBUuVGs3hFNXEAKrr-Ce7tq9a2k,61
|
|
6
|
+
codeanalyzer/config/config.py,sha256=ZiKzc5uEUCIvih58-6BDtLLI1hPij41wGQjBcj9KNQM,188
|
|
7
|
+
codeanalyzer/jedi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
codeanalyzer/jedi/jedi.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
codeanalyzer/neo4j/__init__.py,sha256=AcbFNAMuXwkMFWH4h_HCmla6PCKTF3Xe5yNqg8F_kYk,1575
|
|
10
|
+
codeanalyzer/neo4j/bolt.py,sha256=lhYG17rjHnD8vJW-oGk7xDat0DYF8Su_ysfvJj5hD0E,9049
|
|
11
|
+
codeanalyzer/neo4j/catalog.py,sha256=PtvAdhYwK3kehVqPDtVvStzdWQtqJwFKHrMU5YiZJ6I,7398
|
|
12
|
+
codeanalyzer/neo4j/cypher.py,sha256=2zIWXA1AADrwCMhSTeqKjEXRgBjbob6o3bme_cwLu0s,5024
|
|
13
|
+
codeanalyzer/neo4j/emit.py,sha256=WtCndN6mA6PIzfzdgv9Xc5S5WP4rHUXCtB_r3G16rkg,3101
|
|
14
|
+
codeanalyzer/neo4j/project.py,sha256=5Zk-4ACQCIB7xVkVWeOx8EcfLail5kt5LTQcPpQHtWI,11870
|
|
15
|
+
codeanalyzer/neo4j/rows.py,sha256=_-A0gAOH8bI6F535QGHNvZYMGLtmP_z7ci8ji2jaH6E,6906
|
|
16
|
+
codeanalyzer/neo4j/schema.py,sha256=5xz8cZVuL73GAF-vs9QtN2TuKxIty0rHEe68nwQmLTY,2136
|
|
17
|
+
codeanalyzer/options/__init__.py,sha256=FNBGxdnESayUb0wEs395MKIJxoWIX7bp-FaLDP6qYUw,123
|
|
18
|
+
codeanalyzer/options/options.py,sha256=Um4TKMwGVgmXUgL9yCVa26YeGAR5x-f58MfDcr4YbqY,1249
|
|
19
|
+
codeanalyzer/schema/__init__.py,sha256=HB7y4y-49dkEo-H9GREam1_9Cr1N-GF6MYwx9yoU878,1978
|
|
20
|
+
codeanalyzer/schema/py_schema.py,sha256=meEgkl7C-LCEAXuqs4pe8eGrEnxIzbEJCHiv8Hd1ObE,11578
|
|
21
|
+
codeanalyzer/semantic_analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
codeanalyzer/semantic_analysis/call_graph.py,sha256=3hgLA1sL1YFQa4fzUz_ifVbLs1I9V2QFe7ldwN18mcg,10323
|
|
23
|
+
codeanalyzer/semantic_analysis/codeql/__init__.py,sha256=ODMkdGvs3ebJdfIZle8T4VcHoCBhH_ZehWuWFpNh3NI,1022
|
|
24
|
+
codeanalyzer/semantic_analysis/codeql/codeql_analysis.py,sha256=Z5E7Pj__eYsYSWFteOG1B4RTlH4huhEaPAOmTquzCBQ,15434
|
|
25
|
+
codeanalyzer/semantic_analysis/codeql/codeql_exceptions.py,sha256=PnJOasW9rP68SEX158jSqQFdqjW_Q_Fx3vbH6vNiCQs,474
|
|
26
|
+
codeanalyzer/semantic_analysis/codeql/codeql_loader.py,sha256=XFjTx6ERRipzTguwHzSS5BTI6Nzf95fcdK3nLxqHARs,3599
|
|
27
|
+
codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py,sha256=Zr5NPIpQjm8W-up-js8f3Q7Y9YYfSfUalm6yxtdTC90,7768
|
|
28
|
+
codeanalyzer/syntactic_analysis/__init__.py,sha256=EUQkJEh6wHjWx2qTTKbTbUgwSbfKeNieKHNy7RknVXA,476
|
|
29
|
+
codeanalyzer/syntactic_analysis/exceptions.py,sha256=whs_n0vIu655Jkk1a7iOoXY6iIca4pZqJnU40V9Ejaw,537
|
|
30
|
+
codeanalyzer/syntactic_analysis/symbol_table_builder.py,sha256=zmHFt8pN50jG-Ex4fnisvbLmn1XaW05jwbV_xSG4qfU,38177
|
|
31
|
+
codeanalyzer/utils/__init__.py,sha256=hC6VWdR5rerSqBxzu9KQHTASWqwrrYJv-CMDwrTlzkc,137
|
|
32
|
+
codeanalyzer/utils/logging.py,sha256=0vTkGSl5EZN8yhhWa_5Mrn1n_twRCSW53rNwjzQ9RbI,601
|
|
33
|
+
codeanalyzer/utils/progress_bar.py,sha256=ZHJzGiCo5q4dyXq4CtsrJeq9Ip7sD84T3yZjNX7TBys,2443
|
|
34
|
+
codeanalyzer_python-0.2.0.dist-info/METADATA,sha256=Ha1np-fBOgPRoUHJSiEPZItnBlEsMaNML3YdyiKOvfY,20978
|
|
35
|
+
codeanalyzer_python-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
36
|
+
codeanalyzer_python-0.2.0.dist-info/entry_points.txt,sha256=v4Vux0Nnx7sOntVk_CH7W9RX6SkIkvR1FQYq73oVlCQ,105
|
|
37
|
+
codeanalyzer_python-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
38
|
+
codeanalyzer_python-0.2.0.dist-info/licenses/NOTICE,sha256=YU0Z9NDWqKY-2jfFcbxeZ6fbnzz0oZeKmnUcO8a-bcQ,901
|
|
39
|
+
codeanalyzer_python-0.2.0.dist-info/RECORD,,
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
################################################################################
|
|
2
|
-
# Copyright IBM Corporation 2025
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
################################################################################
|