codeanalyzer-python 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. codeanalyzer/__init__.py +0 -0
  2. codeanalyzer/__main__.py +84 -0
  3. codeanalyzer/core.py +321 -0
  4. codeanalyzer/jedi/__init__.py +0 -0
  5. codeanalyzer/jedi/jedi.py +0 -0
  6. codeanalyzer/py.typed +0 -0
  7. codeanalyzer/schema/__init__.py +23 -0
  8. codeanalyzer/schema/py_schema.py +360 -0
  9. codeanalyzer/semantic_analysis/__init__.py +0 -0
  10. codeanalyzer/semantic_analysis/codeql/__init__.py +26 -0
  11. codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +133 -0
  12. codeanalyzer/semantic_analysis/codeql/codeql_exceptions.py +12 -0
  13. codeanalyzer/semantic_analysis/codeql/codeql_loader.py +74 -0
  14. codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +164 -0
  15. codeanalyzer/semantic_analysis/wala/__init__.py +15 -0
  16. codeanalyzer/syntactic_analysis/__init__.py +0 -0
  17. codeanalyzer/syntactic_analysis/symbol_table_builder.py +903 -0
  18. codeanalyzer/utils/__init__.py +5 -0
  19. codeanalyzer/utils/logging.py +18 -0
  20. codeanalyzer/utils/progress_bar.py +69 -0
  21. {codeanalyzer_python-0.1.1.dist-info → codeanalyzer_python-0.1.2.dist-info}/METADATA +1 -1
  22. codeanalyzer_python-0.1.2.dist-info/RECORD +26 -0
  23. codeanalyzer_python-0.1.1.dist-info/RECORD +0 -6
  24. {codeanalyzer_python-0.1.1.dist-info → codeanalyzer_python-0.1.2.dist-info}/WHEEL +0 -0
  25. {codeanalyzer_python-0.1.1.dist-info → codeanalyzer_python-0.1.2.dist-info}/entry_points.txt +0 -0
  26. {codeanalyzer_python-0.1.1.dist-info → codeanalyzer_python-0.1.2.dist-info}/licenses/LICENSE +0 -0
  27. {codeanalyzer_python-0.1.1.dist-info → codeanalyzer_python-0.1.2.dist-info}/licenses/NOTICE +0 -0
File without changes
@@ -0,0 +1,84 @@
1
+ from contextlib import nullcontext
2
+ import sys
3
+ import typer
4
+ from typing import Optional, Annotated
5
+ from pathlib import Path
6
+ from codeanalyzer.utils import _set_log_level
7
+ from codeanalyzer.utils import logger
8
+ from codeanalyzer.core import AnalyzerCore
9
+
10
+
11
def main(
    # NOTE: `input` shadows the builtin, but the parameter name is part of the
    # CLI/callback interface and is therefore kept as-is.
    input: Annotated[
        Path, typer.Option("-i", "--input", help="Path to the project root directory.")
    ],
    output: Annotated[
        Optional[Path],
        typer.Option("-o", "--output", help="Output directory for artifacts."),
    ] = None,
    analysis_level: Annotated[
        int,
        typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph."),
    ] = 1,
    using_codeql: Annotated[
        bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.")
    ] = False,
    rebuild_analysis: Annotated[
        bool,
        typer.Option(
            "--eager/--lazy",
            help="Enable eager or lazy analysis. Defaults to lazy.",
        ),
    ] = False,
    cache_dir: Annotated[
        Optional[Path],
        typer.Option(
            "-c",
            "--cache-dir",
            help="Directory to store analysis cache.",
        ),
    ] = None,
    clear_cache: Annotated[
        bool,
        typer.Option("--clear-cache/--keep-cache", help="Clear cache after analysis."),
    ] = True,
    verbosity: Annotated[
        int, typer.Option("-v", count=True, help="Increase verbosity: -v, -vv, -vvv")
    ] = 0,
):
    """Static Analysis on Python source code using Jedi, Astroid, and Treesitter."""
    _set_log_level(verbosity)

    # Fail fast with a non-zero exit code when the project path is missing.
    if not input.exists():
        logger.error(f"Input path '{input}' does not exist.")
        raise typer.Exit(code=1)

    with AnalyzerCore(
        input, analysis_level, using_codeql, rebuild_analysis, cache_dir, clear_cache
    ) as analyzer:
        artifacts = analyzer.analyze()

        if output is None:
            # nullcontext hands back stdout without closing it on exit.
            stream_context = nullcontext(sys.stdout)
        else:
            output.mkdir(parents=True, exist_ok=True)
            # JSON is written as UTF-8 explicitly so that non-ASCII content
            # does not depend on (or fail under) the platform default encoding.
            stream_context = (output / "analysis.json").open("w", encoding="utf-8")

        with stream_context as f:
            print(artifacts.model_dump_json(indent=4), file=f)
70
+
71
+
72
# Single-command Typer application: `main` is registered as the callback and
# runs directly because no sub-command is required.
app = typer.Typer(
    # Identity and help text.
    name="codeanalyzer",
    help="Static Analysis on Python source code using Jedi, CodeQL and Tree sitter.",
    # Dispatch behaviour.
    callback=main,
    invoke_without_command=True,
    no_args_is_help=True,
    # Presentation options.
    add_completion=False,
    rich_markup_mode="rich",
    pretty_exceptions_show_locals=False,
)

# Entry point for `python -m codeanalyzer`.
if __name__ == "__main__":
    app()
codeanalyzer/core.py ADDED
@@ -0,0 +1,321 @@
1
+ import hashlib
2
+ import os
3
+ from pdb import set_trace
4
+ import shutil
5
+ import subprocess
6
+ from pathlib import Path
7
+ import sys
8
+ from typing import Any, Dict, Union, Optional
9
+ from codeanalyzer.utils import logger
10
+
11
+ from codeanalyzer.schema.py_schema import PyApplication, PyModule
12
+ from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
13
+ from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import (
14
+ CodeQLExceptions,
15
+ )
16
+ from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
17
+
18
+
19
class AnalyzerCore:
    """Context manager that prepares a project for analysis and builds its artifacts.

    Entering the context creates (or reuses) a per-project virtual environment
    inside the cache directory and, when ``using_codeql`` is set, a CodeQL
    database. Leaving the context optionally deletes the cache directory.

    Args:
        project_dir (Union[str, Path]): The root directory of the project to analyze.
        analysis_depth (int): Requested analysis depth. It is stored on the
            instance but not consulted by the code in this class.
        using_codeql (bool): Whether to build/reuse a CodeQL database on entry.
        rebuild_analysis (bool): Whether to force re-creation of the virtual
            environment and the CodeQL database.
        cache_dir (Optional[Union[str, Path]]): Parent directory of the
            ``.codeanalyzer`` cache folder. Defaults to ``project_dir``.
        clear_cache (bool): Whether to delete the cache directory on exit.
    """

    def __init__(
        self,
        project_dir: Union[str, Path],
        analysis_depth: int = 1,
        using_codeql: bool = False,
        rebuild_analysis: bool = False,
        cache_dir: Optional[Union[str, Path]] = None,
        clear_cache: bool = True,
    ) -> None:
        self.analysis_depth = analysis_depth
        self.project_dir = Path(project_dir).resolve()
        self.using_codeql = using_codeql
        self.rebuild_analysis = rebuild_analysis
        # The cache always lives in a ".codeanalyzer" folder, either below the
        # user-supplied directory or below the project root itself.
        cache_parent = (
            Path(cache_dir).resolve() if cache_dir is not None else self.project_dir
        )
        self.cache_dir = cache_parent / ".codeanalyzer"
        self.clear_cache = clear_cache
        # Populated by __enter__.
        self.db_path: Optional[Path] = None
        self.codeql_bin: Optional[Path] = None
        self.virtualenv: Optional[Path] = None

    @staticmethod
    def _cmd_exec_helper(
        cmd: list[str],
        cwd: Optional[Path] = None,
        capture_output: bool = True,
        check: bool = True,
        suppress_output: bool = False,
    ) -> subprocess.CompletedProcess:
        """
        Runs a subprocess, forwarding its output line by line to the logger.

        stderr is merged into stdout, so the returned ``stderr`` is always None.

        Args:
            cmd: Command as a list of arguments.
            cwd: Working directory to run the command in.
            capture_output: If True, retains and returns the output.
            check: If True, raises CalledProcessError on non-zero exit.
            suppress_output: If True, silences the per-line debug logging.

        Returns:
            subprocess.CompletedProcess

        Raises:
            subprocess.CalledProcessError: If ``check`` is True and the command
                exits with a non-zero status.
        """
        logger.info(f"Running: {' '.join(cmd)}")

        output_lines: list[str] = []
        # The context manager closes the stdout pipe and reaps the process even
        # if reading the output raises.
        with subprocess.Popen(
            cmd,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        ) as process:
            assert process.stdout is not None  # for type checking
            for raw_line in process.stdout:
                line = raw_line.rstrip()
                if not suppress_output:
                    logger.debug(line)
                if capture_output:
                    output_lines.append(line)
            returncode = process.wait()

        if check and returncode != 0:
            error_output = "\n".join(output_lines)
            logger.error(f"Command failed with exit code {returncode}: {' '.join(cmd)}")
            if error_output:
                logger.error(f"Command output:\n{error_output}")
            raise subprocess.CalledProcessError(returncode, cmd, output=error_output)

        return subprocess.CompletedProcess(
            args=cmd,
            returncode=returncode,
            stdout="\n".join(output_lines) if capture_output else None,
            stderr=None,
        )

    @staticmethod
    def _venv_python(venv_path: Path) -> Path:
        """Return the interpreter path inside a virtual environment for this OS."""
        if os.name == "nt":
            # Windows virtual environments use "Scripts\\python.exe".
            return venv_path / "Scripts" / "python.exe"
        return venv_path / "bin" / "python"

    @staticmethod
    def _get_base_interpreter() -> Path:
        """Get the base Python interpreter path.

        Finds an interpreter that can create virtual environments, even when
        the current process itself runs inside a virtual environment. The
        lookup order is: the current interpreter (when not in a venv), the
        ``SYSTEM_PYTHON`` environment variable, ``sys.base_executable`` when
        available, then candidates from PATH, pyenv, conda and asdf.

        Returns:
            Path: The base Python interpreter path.

        Raises:
            RuntimeError: If no suitable Python interpreter can be found.
        """
        # If we're not in a virtual environment, use the current interpreter
        if sys.prefix == sys.base_prefix:
            return Path(sys.executable)

        # We're inside a virtual environment; need to find the base interpreter

        # First, check if user explicitly set SYSTEM_PYTHON
        if system_python := os.getenv("SYSTEM_PYTHON"):
            system_python_path = Path(system_python)
            if system_python_path.is_file():
                return system_python_path

        # Use sys.base_executable when the running interpreter exposes it;
        # the attribute is not guaranteed to exist, hence the getattr guard.
        if base_executable := getattr(sys, "base_executable", None):
            base_exec = Path(base_executable)
            if base_exec.is_file():
                return base_exec

        python_candidates: list[Path] = []

        # Use shutil.which to find python3 and python in PATH
        for python_name in ("python3", "python"):
            if python_path := shutil.which(python_name):
                candidate = Path(python_path)
                # Skip the current virtual environment's python. The check is
                # component-wise so "/venv" does not match "/venv2/bin/python".
                if not candidate.is_relative_to(sys.prefix):
                    python_candidates.append(candidate)

        # Check pyenv installation
        if pyenv_root := os.getenv("PYENV_ROOT"):
            pyenv_python = Path(pyenv_root) / "shims" / "python"
            if pyenv_python.exists():
                python_candidates.append(pyenv_python)

        # Check default pyenv location
        home_pyenv = Path.home() / ".pyenv" / "shims" / "python"
        if home_pyenv.exists():
            python_candidates.append(home_pyenv)

        # Check conda base environment (original conda env before activation)
        if conda_prefix := os.getenv("CONDA_PREFIX_1"):
            conda_python = Path(conda_prefix) / "bin" / "python"
            if conda_python.exists():
                python_candidates.append(conda_python)

        # Check asdf
        if asdf_dir := os.getenv("ASDF_DIR"):
            asdf_python = Path(asdf_dir) / "shims" / "python"
            if asdf_python.exists():
                python_candidates.append(asdf_python)

        # Test candidates to find a working Python interpreter
        for candidate in python_candidates:
            try:
                # Test if the interpreter works and can create venv
                result = subprocess.run(
                    [str(candidate), "-c", "import venv; print('OK')"],
                    capture_output=True,
                    text=True,
                    timeout=5,
                )
            except (subprocess.TimeoutExpired, OSError):
                # Missing, non-executable or otherwise unusable candidate.
                continue
            if result.returncode == 0 and "OK" in result.stdout:
                return candidate

        # If nothing works, raise an informative error
        raise RuntimeError(
            f"Could not find a suitable base Python interpreter. "
            f"Current environment: {sys.executable} (prefix: {sys.prefix}). "
            f"Please set the SYSTEM_PYTHON environment variable to point to "
            f"a working Python interpreter that can create virtual environments."
        )

    def _ensure_virtualenv(self) -> Path:
        """Create the project's virtual environment if needed and return its path."""
        venv_path = self.cache_dir / self.project_dir.name / "virtualenv"
        # Ensure the cache directory exists for this project
        venv_path.parent.mkdir(parents=True, exist_ok=True)

        if not venv_path.exists() or self.rebuild_analysis:
            logger.info(f"(Re-)creating virtual environment at {venv_path}")
            self._cmd_exec_helper(
                [str(self._get_base_interpreter()), "-m", "venv", str(venv_path)],
                check=True,
            )
            # Install the project itself into the fresh environment.
            self._cmd_exec_helper(
                [
                    str(self._venv_python(venv_path)),
                    "-m",
                    "pip",
                    "install",
                    "-U",
                    f"{self.project_dir}",
                ],
                cwd=self.project_dir,
                check=True,
            )
        return venv_path

    def _ensure_codeql_database(self) -> None:
        """Build the CodeQL database, or reuse it when the stored checksum matches."""
        logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
        cache_root = self.cache_dir / "codeql"
        cache_root.mkdir(parents=True, exist_ok=True)
        db_path = cache_root / f"{self.project_dir.name}-db"
        db_path.mkdir(exist_ok=True)
        self.db_path = db_path

        checksum_file = db_path / ".checksum"
        current_checksum = self._compute_checksum(self.project_dir)

        def is_cache_valid() -> bool:
            if not (db_path / "db-python").exists():
                return False
            if not checksum_file.exists():
                return False
            return checksum_file.read_text().strip() == current_checksum

        if not self.rebuild_analysis and is_cache_valid():
            logger.info(f"Reusing cached CodeQL DB at {self.db_path}")
            return

        logger.info("Creating new CodeQL database...")

        # Prefer a CodeQL binary from PATH; otherwise download one into the cache.
        codeql_in_path = shutil.which("codeql")
        if codeql_in_path:
            self.codeql_bin = Path(codeql_in_path)
        else:
            self.codeql_bin = CodeQLLoader.download_and_extract_codeql(
                cache_root / "bin"
            )

        if not shutil.which(str(self.codeql_bin)):
            raise FileNotFoundError(f"CodeQL binary not executable: {self.codeql_bin}")

        cmd = [
            str(self.codeql_bin),
            "database",
            "create",
            str(db_path),
            f"--source-root={self.project_dir}",
            "--language=python",
            "--overwrite",
        ]
        result = subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            # Decode leniently so undecodable bytes cannot mask the real error.
            err_text = result.stderr.decode(errors="replace")
            raise CodeQLExceptions.CodeQLDatabaseBuildException(
                f"Error building CodeQL database:\n{err_text}"
            )

        # Only record the checksum once the database was built successfully.
        checksum_file.write_text(current_checksum)

    def __enter__(self) -> "AnalyzerCore":
        """Prepare the virtual environment and, if enabled, the CodeQL database."""
        self.virtualenv = self._ensure_virtualenv()
        if self.using_codeql:
            self._ensure_codeql_database()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Delete the cache directory when ``clear_cache`` is set."""
        if self.clear_cache and self.cache_dir.exists():
            logger.info(f"Clearing cache directory: {self.cache_dir}")
            shutil.rmtree(self.cache_dir)

    def analyze(self) -> "PyApplication":
        """Build and return the application model containing the project's symbol table."""
        return (
            PyApplication.builder()
            .with_symbol_table(self._build_symbol_table())
            .build()
        )

    def _compute_checksum(self, root: Path) -> str:
        """Compute a SHA256 checksum over the Python source files below ``root``.

        Both the relative path and the content of every ``*.py`` file are
        hashed, so edits as well as renames/moves change the checksum. Files
        inside this analyzer's own cache directory (which by default lives
        below the project root and holds the virtual environment) are skipped.

        Args:
            root (Path): Root directory of the project.

        Returns:
            str: Hex-encoded SHA256 digest.
        """
        root = Path(root)
        cache_dir = getattr(self, "cache_dir", None)
        sha256 = hashlib.sha256()
        for py_file in sorted(root.rglob("*.py")):
            # A directory may be named "something.py"; only hash real files.
            if not py_file.is_file():
                continue
            if cache_dir is not None and py_file.is_relative_to(cache_dir):
                continue
            sha256.update(py_file.relative_to(root).as_posix().encode("utf-8"))
            # Separator keeps path bytes from running into content bytes.
            sha256.update(b"\0")
            sha256.update(py_file.read_bytes())
        return sha256.hexdigest()

    def _build_symbol_table(self) -> Dict[str, "PyModule"]:
        """Retrieve a symbol table of the whole project."""
        return SymbolTableBuilder(self.project_dir, self.virtualenv).build()

    def _get_call_graph(self) -> Dict[str, Any]:
        """Placeholder for call graph extraction; currently returns an empty dict."""
        logger.warning("Call graph extraction not yet implemented.")
        return {}
File without changes
File without changes
codeanalyzer/py.typed ADDED
File without changes
@@ -0,0 +1,23 @@
1
+ from .py_schema import (
2
+ PyApplication,
3
+ PyImport,
4
+ PyComment,
5
+ PyModule,
6
+ PyClass,
7
+ PyVariableDeclaration,
8
+ PyCallable,
9
+ PyClassAttribute,
10
+ PyCallableParameter
11
+ )
12
+
13
# Public API of the schema package: the names re-exported from `.py_schema`.
__all__ = [
    "PyApplication",
    "PyImport",
    "PyComment",
    "PyModule",
    "PyClass",
    "PyVariableDeclaration",
    "PyCallable",
    "PyClassAttribute",
    "PyCallableParameter"
]
+ ]