codeanalyzer-python 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeanalyzer/__main__.py CHANGED
@@ -1,11 +1,16 @@
1
- from contextlib import nullcontext
2
- import sys
3
- import typer
4
- from typing import Optional, Annotated
5
1
  from pathlib import Path
6
- from codeanalyzer.utils import _set_log_level
7
- from codeanalyzer.utils import logger
2
+ from typing import Annotated, Optional
3
+ from enum import Enum
4
+
5
+ import typer
6
+
8
7
  from codeanalyzer.core import AnalyzerCore
8
+ from codeanalyzer.utils import _set_log_level, logger
9
+
10
+
11
+ class OutputFormat(str, Enum):
12
+ JSON = "json"
13
+ MSGPACK = "msgpack"
9
14
 
10
15
 
11
16
  def main(
@@ -16,6 +21,15 @@ def main(
16
21
  Optional[Path],
17
22
  typer.Option("-o", "--output", help="Output directory for artifacts."),
18
23
  ] = None,
24
+ format: Annotated[
25
+ OutputFormat,
26
+ typer.Option(
27
+ "-f",
28
+ "--format",
29
+ help="Output format: json or msgpack.",
30
+ case_sensitive=False,
31
+ ),
32
+ ] = OutputFormat.JSON,
19
33
  analysis_level: Annotated[
20
34
  int,
21
35
  typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph."),
@@ -57,16 +71,42 @@ def main(
57
71
  input, analysis_level, using_codeql, rebuild_analysis, cache_dir, clear_cache
58
72
  ) as analyzer:
59
73
  artifacts = analyzer.analyze()
60
- print_stream = sys.stdout
61
- stream_context = nullcontext(print_stream)
62
74
 
63
- if output is not None:
75
+ # Handle output based on format
76
+ if output is None:
77
+ # Output to stdout (only for JSON)
78
+ if format == OutputFormat.JSON:
79
+ print(artifacts.model_dump_json(separators=(",", ":")))
80
+ else:
81
+ logger.error(
82
+ f"Format '{format.value}' requires an output directory (use -o/--output)"
83
+ )
84
+ raise typer.Exit(code=1)
85
+ else:
86
+ # Output to file
64
87
  output.mkdir(parents=True, exist_ok=True)
65
- output_file = output / "analysis.json"
66
- stream_context = output_file.open("w")
88
+ _write_output(artifacts, output, format)
89
+
90
+
91
+ def _write_output(artifacts, output_dir: Path, format: OutputFormat):
92
+ """Write artifacts to file in the specified format."""
93
+ if format == OutputFormat.JSON:
94
+ output_file = output_dir / "analysis.json"
95
+ # Use Pydantic's json() with separators for compact output
96
+ json_str = artifacts.model_dump_json(indent=None)
97
+ with output_file.open("w") as f:
98
+ f.write(json_str)
99
+ logger.info(f"Analysis saved to {output_file}")
67
100
 
68
- with stream_context as f:
69
- print(artifacts.model_dump_json(indent=4), file=f)
101
+ elif format == OutputFormat.MSGPACK:
102
+ output_file = output_dir / "analysis.msgpack"
103
+ msgpack_data = artifacts.to_msgpack_bytes()
104
+ with output_file.open("wb") as f:
105
+ f.write(msgpack_data)
106
+ logger.info(f"Analysis saved to {output_file}")
107
+ logger.info(
108
+ f"Compression ratio: {artifacts.get_compression_ratio():.1%} of JSON size"
109
+ )
70
110
 
71
111
 
72
112
  app = typer.Typer(
codeanalyzer/core.py CHANGED
@@ -1,19 +1,16 @@
1
1
  import hashlib
2
2
  import os
3
- from pdb import set_trace
4
3
  import shutil
5
4
  import subprocess
6
- from pathlib import Path
7
5
  import sys
8
- from typing import Any, Dict, Union, Optional
9
- from codeanalyzer.utils import logger
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional, Union
10
8
 
11
9
  from codeanalyzer.schema.py_schema import PyApplication, PyModule
12
10
  from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
13
- from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import (
14
- CodeQLExceptions,
15
- )
11
+ from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import CodeQLExceptions
16
12
  from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
13
+ from codeanalyzer.utils import logger
17
14
 
18
15
 
19
16
  class AnalyzerCore:
@@ -290,11 +287,7 @@ class AnalyzerCore:
290
287
 
291
288
  def analyze(self) -> PyApplication:
292
289
  """Return the path to the CodeQL database."""
293
- return (
294
- PyApplication.builder()
295
- .with_symbol_table(self._build_symbol_table())
296
- .build()
297
- )
290
+ return PyApplication.builder().symbol_table(self._build_symbol_table()).build()
298
291
 
299
292
  def _compute_checksum(self, root: Path) -> str:
300
293
  """Compute SHA256 checksum of all Python source files in a project directory. If somethings changes, the
@@ -1,13 +1,13 @@
1
1
  from .py_schema import (
2
2
  PyApplication,
3
- PyImport,
3
+ PyCallable,
4
+ PyCallableParameter,
5
+ PyClass,
6
+ PyClassAttribute,
4
7
  PyComment,
8
+ PyImport,
5
9
  PyModule,
6
- PyClass,
7
10
  PyVariableDeclaration,
8
- PyCallable,
9
- PyClassAttribute,
10
- PyCallableParameter
11
11
  )
12
12
 
13
13
  __all__ = [
@@ -19,5 +19,5 @@ __all__ = [
19
19
  "PyVariableDeclaration",
20
20
  "PyCallable",
21
21
  "PyClassAttribute",
22
- "PyCallableParameter"
22
+ "PyCallableParameter",
23
23
  ]
@@ -20,12 +20,88 @@ This module defines the data models used to represent Python code structures
20
20
  for static analysis purposes.
21
21
  """
22
22
 
23
+ import inspect
23
24
  from pathlib import Path
24
25
  from typing import Any, Dict, List, Optional
25
- from typing_extensions import Literal
26
+ import gzip
27
+
26
28
  from pydantic import BaseModel
29
+ from typing_extensions import Literal
30
+ import msgpack
27
31
 
28
- import inspect
32
+
33
+ def msgpk(cls):
34
+ """
35
+ Decorator that adds MessagePack serialization methods to Pydantic models.
36
+
37
+ Adds methods:
38
+ - to_msgpack_bytes() -> bytes: Serialize to compact binary format
39
+ - from_msgpack_bytes(data: bytes) -> cls: Deserialize from binary format
40
+ - to_msgpack_dict() -> dict: Convert to msgpack-compatible dict
41
+ - from_msgpack_dict(data: dict) -> cls: Create instance from msgpack dict
42
+ """
43
+
44
+ def _prepare_for_serialization(obj: Any) -> Any:
45
+ """Convert objects to serialization-friendly format."""
46
+ if isinstance(obj, Path):
47
+ return str(obj)
48
+ elif isinstance(obj, dict):
49
+ return {
50
+ _prepare_for_serialization(k): _prepare_for_serialization(v)
51
+ for k, v in obj.items()
52
+ }
53
+ elif isinstance(obj, list):
54
+ return [_prepare_for_serialization(item) for item in obj]
55
+ elif isinstance(obj, tuple):
56
+ return tuple(_prepare_for_serialization(item) for item in obj)
57
+ elif isinstance(obj, set):
58
+ return [_prepare_for_serialization(item) for item in obj]
59
+ elif hasattr(obj, "model_dump"): # Pydantic model
60
+ return _prepare_for_serialization(obj.model_dump())
61
+ else:
62
+ return obj
63
+
64
+ def to_msgpack_bytes(self) -> bytes:
65
+ """Serialize the model to compact binary format using MessagePack + gzip."""
66
+ data = _prepare_for_serialization(self.model_dump())
67
+ msgpack_data = msgpack.packb(data, use_bin_type=True)
68
+ return gzip.compress(msgpack_data)
69
+
70
+ @classmethod
71
+ def from_msgpack_bytes(cls_obj, data: bytes):
72
+ """Deserialize from MessagePack + gzip binary format."""
73
+ decompressed_data = gzip.decompress(data)
74
+ obj_dict = msgpack.unpackb(decompressed_data, raw=False)
75
+ return cls_obj.model_validate(obj_dict)
76
+
77
+ def to_msgpack_dict(self) -> dict:
78
+ """Convert to msgpack-compatible dictionary format."""
79
+ return _prepare_for_serialization(self.model_dump())
80
+
81
+ @classmethod
82
+ def from_msgpack_dict(cls_obj, data: dict):
83
+ """Create instance from msgpack-compatible dictionary."""
84
+ return cls_obj.model_validate(data)
85
+
86
+ def get_msgpack_size(self) -> int:
87
+ """Get the size of the msgpack serialization in bytes."""
88
+ return len(self.to_msgpack_bytes())
89
+
90
+ def get_compression_ratio(self) -> float:
91
+ """Get compression ratio compared to JSON."""
92
+ json_size = len(self.model_dump_json().encode("utf-8"))
93
+ msgpack_gzip_size = self.get_msgpack_size()
94
+ return msgpack_gzip_size / json_size if json_size > 0 else 1.0
95
+
96
+ # Add methods to the class
97
+ cls.to_msgpack_bytes = to_msgpack_bytes
98
+ cls.from_msgpack_bytes = from_msgpack_bytes
99
+ cls.to_msgpack_dict = to_msgpack_dict
100
+ cls.from_msgpack_dict = from_msgpack_dict
101
+ cls.get_msgpack_size = get_msgpack_size
102
+ cls.get_compression_ratio = get_compression_ratio
103
+
104
+ return cls
29
105
 
30
106
 
31
107
  def builder(cls):
@@ -33,7 +109,7 @@ def builder(cls):
33
109
  Decorator that generates a builder class for a Pydantic models defined below.
34
110
 
35
111
  It creates methods like:
36
- - with_<fieldname>(value)
112
+ - <fieldname>(value)
37
113
  - build() to instantiate the model
38
114
 
39
115
  It supports nested builder patterns and is mypy-compatible.
@@ -70,12 +146,12 @@ def builder(cls):
70
146
  setattr(self, f"_{f}", value)
71
147
  return self
72
148
 
73
- method.__name__ = f"with_{f}"
149
+ method.__name__ = f"{f}"
74
150
  method.__annotations__ = {"value": t, "return": builder_name}
75
151
  method.__doc__ = f"Set {f} ({t.__name__})"
76
152
  return method
77
153
 
78
- namespace[f"with_{field}"] = make_method()
154
+ namespace[f"{field}"] = make_method()
79
155
 
80
156
  # Create a build method that constructs the model instance using the values set in the builder.
81
157
  def build(self):
@@ -92,26 +168,9 @@ def builder(cls):
92
168
 
93
169
 
94
170
  @builder
171
+ @msgpk
95
172
  class PyImport(BaseModel):
96
- """Represents a Python import statement.
97
-
98
- Attributes:
99
- module (str): The name of the module being imported.
100
- name (str): The name of the imported entity (e.g., function, class).
101
- alias (Optional[str]): An optional alias for the imported entity.
102
- start_line (int): The line number where the import statement starts.
103
- end_line (int): The line number where the import statement ends.
104
- start_column (int): The starting column of the import statement.
105
- end_column (int): The ending column of the import statement.
106
-
107
- Example:
108
- - import numpy as np will be represented as:
109
- PyImport(module="numpy", name="np", alias="np", start_line=1, end_line=1, start_column=0, end_column=16)
110
- - from math import sqrt will be represented as:
111
- PyImport(module="math", name="sqrt", alias=None, start_line=2, end_line=2, start_column=0, end_column=20
112
- - from os.path import join as path_join will be represented as:
113
- PyImport(module="os.path", name="path_join", alias="join", start_line=3, end_line=3, start_column=0, end_column=30)
114
- """
173
+ """Represents a Python import statement."""
115
174
 
116
175
  module: str
117
176
  name: str
@@ -123,18 +182,9 @@ class PyImport(BaseModel):
123
182
 
124
183
 
125
184
  @builder
185
+ @msgpk
126
186
  class PyComment(BaseModel):
127
- """
128
- Represents a Python comment.
129
-
130
- Attributes:
131
- content (str): The actual comment string (without the leading '#').
132
- start_line (int): The line number where the comment starts.
133
- end_line (int): The line number where the comment ends (same as start_line for single-line comments).
134
- start_column (int): The starting column of the comment.
135
- end_column (int): The ending column of the comment.
136
- is_docstring (bool): Whether this comment is actually a docstring (triple-quoted string).
137
- """
187
+ """Represents a Python comment."""
138
188
 
139
189
  content: str
140
190
  start_line: int = -1
@@ -145,20 +195,9 @@ class PyComment(BaseModel):
145
195
 
146
196
 
147
197
  @builder
198
+ @msgpk
148
199
  class PySymbol(BaseModel):
149
- """
150
- Represents a symbol used or declared in Python code.
151
-
152
- Attributes:
153
- name (str): The name of the symbol (e.g., 'x', 'self.x', 'os.path').
154
- scope (Literal['local', 'nonlocal', 'global', 'class', 'module']): The scope where the symbol is accessed.
155
- kind (Literal['variable', 'parameter', 'attribute', 'function', 'class', 'module']): The kind of symbol.
156
- type (Optional[str]): Inferred or annotated type, if available.
157
- qualified_name (Optional[str]): Fully qualified name (e.g., 'self.x', 'os.path.join').
158
- is_builtin (bool): Whether this is a Python builtin.
159
- lineno (int): Line number where the symbol is accessed or declared.
160
- col_offset (int): Column offset.
161
- """
200
+ """Represents a symbol used or declared in Python code."""
162
201
 
163
202
  name: str
164
203
  scope: Literal["local", "nonlocal", "global", "class", "module"]
@@ -171,11 +210,9 @@ class PySymbol(BaseModel):
171
210
 
172
211
 
173
212
  @builder
213
+ @msgpk
174
214
  class PyVariableDeclaration(BaseModel):
175
- """Represents a Python variable declaration.
176
-
177
- Attributes:
178
- """
215
+ """Represents a Python variable declaration."""
179
216
 
180
217
  name: str
181
218
  type: Optional[str]
@@ -189,18 +226,9 @@ class PyVariableDeclaration(BaseModel):
189
226
 
190
227
 
191
228
  @builder
229
+ @msgpk
192
230
  class PyCallableParameter(BaseModel):
193
- """Represents a parameter of a Python callable (function/method).
194
-
195
- Attributes:
196
- name (str): The name of the parameter.
197
- type (str): The type of the parameter.
198
- default_value (str): The default value of the parameter, if any.
199
- start_line (int): The line number where the parameter is defined.
200
- end_line (int): The line number where the parameter definition ends.
201
- start_column (int): The column number where the parameter starts.
202
- end_column (int): The column number where the parameter ends.
203
- """
231
+ """Represents a parameter of a Python callable (function/method)."""
204
232
 
205
233
  name: str
206
234
  type: Optional[str] = None
@@ -212,10 +240,9 @@ class PyCallableParameter(BaseModel):
212
240
 
213
241
 
214
242
  @builder
243
+ @msgpk
215
244
  class PyCallsite(BaseModel):
216
- """
217
- Represents a Python call site (function or method invocation) with contextual metadata.
218
- """
245
+ """Represents a Python call site (function or method invocation) with contextual metadata."""
219
246
 
220
247
  method_name: str
221
248
  receiver_expr: Optional[str] = None
@@ -231,26 +258,9 @@ class PyCallsite(BaseModel):
231
258
 
232
259
 
233
260
  @builder
261
+ @msgpk
234
262
  class PyCallable(BaseModel):
235
- """Represents a Python callable (function/method).
236
-
237
- Attributes:
238
- name (str): The name of the callable.
239
- signature (str): The fully qualified name of the callable (e.g., module.function_name).
240
- docstring (PyComment): The docstring of the callable.
241
- decorators (List[str]): List of decorators applied to the callable.
242
- parameters (List[PyCallableParameter]): List of parameters for the callable.
243
- return_type (Optional[str]): The type of the return value, if specified.
244
- code (str): The actual code of the callable.
245
- start_line (int): The line number where the callable is defined.
246
- end_line (int): The line number where the callable definition ends.
247
- code_start_line (int): The line number where the code block starts.
248
- accessed_symbols (List[str]): Symbols accessed within the callable.
249
- call_sites (List[str]): Call sites of this callable.
250
- is_entrypoint (bool): Whether this callable is an entry point.
251
- local_variables (List[PyVariableDeclaration]): Local variables within the callable.
252
- cyclomatic_complexity (int): Cyclomatic complexity of the callable.
253
- """
263
+ """Represents a Python callable (function/method)."""
254
264
 
255
265
  name: str
256
266
  path: str
@@ -274,16 +284,9 @@ class PyCallable(BaseModel):
274
284
 
275
285
 
276
286
  @builder
287
+ @msgpk
277
288
  class PyClassAttribute(BaseModel):
278
- """Represents a Python class attribute.
279
-
280
- Attributes:
281
- name (str): The name of the attribute.
282
- type (str): The type of the attribute.
283
- docstring (PyComment): The docstring of the attribute.
284
- start_line (int): The line number where the attribute is defined.
285
- end_line (int): The line number where the attribute definition ends.
286
- """
289
+ """Represents a Python class attribute."""
287
290
 
288
291
  name: str
289
292
  type: Optional[str] = None
@@ -293,20 +296,9 @@ class PyClassAttribute(BaseModel):
293
296
 
294
297
 
295
298
  @builder
299
+ @msgpk
296
300
  class PyClass(BaseModel):
297
- """Represents a Python class.
298
-
299
- Attributes:
300
- name (str): The name of the class.
301
- signature (str): The fully qualified name of the class (e.g., module.class_name).
302
- docstring (PyComment): The docstring of the class.
303
- base_classes (List[str]): List of base class names.
304
- methods (Dict[str, PyCallable]): Mapping of method names to their callable representations.
305
- attributes (Dict[str, PyClassAttribute]): Mapping of attribute names to their variable declarations.
306
- inner_classes (Dict[str, "PyClass"]): Mapping of inner class names to their class representations.
307
- start_line (int): The line number where the class definition starts.
308
- end_line (int): The line number where the class definition ends.
309
- """
301
+ """Represents a Python class."""
310
302
 
311
303
  name: str
312
304
  signature: str # e.g., module.class_name
@@ -325,18 +317,9 @@ class PyClass(BaseModel):
325
317
 
326
318
 
327
319
  @builder
320
+ @msgpk
328
321
  class PyModule(BaseModel):
329
- """Represents a Python module.
330
-
331
- Attributes:
332
- file_path (str): The file path of the module.
333
- module_name (str): The name of the module (e.g., module.submodule).
334
- imports (List[PyImport]): List of import statements in the module.
335
- comments (List[PyComment]): List of comments in the module.
336
- classes (Dict[str, PyClass]): Mapping of class names to their class representations.
337
- functions (Dict[str, PyCallable]): Mapping of function names to their callable representations.
338
- variables (List[PyVariableDeclaration]): List of variable declarations in the module.
339
- """
322
+ """Represents a Python module."""
340
323
 
341
324
  file_path: str
342
325
  module_name: str
@@ -348,13 +331,8 @@ class PyModule(BaseModel):
348
331
 
349
332
 
350
333
  @builder
334
+ @msgpk
351
335
  class PyApplication(BaseModel):
352
- """Represents a Python application.
353
-
354
- Attributes:
355
- name (str): The name of the application.
356
- version (str): The version of the application.
357
- description (str): A brief description of the application.
358
- """
336
+ """Represents a Python application."""
359
337
 
360
338
  symbol_table: dict[Path, PyModule]
@@ -19,8 +19,8 @@ CodeQL package
19
19
  """
20
20
 
21
21
  from .codeql_analysis import CodeQL
22
- from .codeql_query_runner import CodeQLQueryRunner
23
- from .codeql_loader import CodeQLLoader
24
22
  from .codeql_exceptions import CodeQLExceptions
23
+ from .codeql_loader import CodeQLLoader
24
+ from .codeql_query_runner import CodeQLQueryRunner
25
25
 
26
26
  __all__ = ["CodeQL", "CodeQLQueryRunner", "CodeQLLoader", "CodeQLExceptions"]
@@ -21,9 +21,10 @@ for Python projects and execute queries against them.
21
21
  """
22
22
 
23
23
  from pathlib import Path
24
+ from typing import Union
25
+
24
26
  from networkx import DiGraph
25
27
  from pandas import DataFrame
26
- from typing import Union
27
28
 
28
29
  from codeanalyzer.semantic_analysis.codeql.codeql_query_runner import CodeQLQueryRunner
29
30
 
@@ -130,4 +131,3 @@ class CodeQL:
130
131
  Returns:
131
132
  DiGraph: A directed graph representing the call graph of the application.
132
133
  """
133
- pass
@@ -1,9 +1,9 @@
1
1
  import platform
2
- import requests
3
2
  import zipfile
4
3
  from pathlib import Path
4
+
5
+ import requests
5
6
  from codeanalyzer.utils import logger
6
- from tqdm import tqdm
7
7
 
8
8
 
9
9
  class CodeQLLoader:
@@ -43,22 +43,11 @@ class CodeQLLoader:
43
43
  logger.info(f"Downloading CodeQL CLI from {download_url}")
44
44
  with requests.get(download_url, stream=True) as r:
45
45
  r.raise_for_status()
46
- total_size = int(r.headers.get("content-length", 0))
47
46
  block_size = 8192 # 8KB
48
47
 
49
- with (
50
- open(archive_path, "wb") as f,
51
- tqdm(
52
- total=total_size,
53
- unit="B",
54
- unit_scale=True,
55
- unit_divisor=1024,
56
- desc="Downloading CodeQL",
57
- ) as bar,
58
- ):
48
+ with open(archive_path, "wb") as f:
59
49
  for chunk in r.iter_content(chunk_size=block_size):
60
50
  f.write(chunk)
61
- bar.update(len(chunk))
62
51
 
63
52
  extract_dir = temp_dir / filename.replace(".zip", "")
64
53
  extract_dir.mkdir(exist_ok=True)
@@ -20,11 +20,12 @@ This module provides functionality to run CodeQL queries against CodeQL database
20
20
  and process the results.
21
21
  """
22
22
 
23
+ import shlex
23
24
  import subprocess
24
25
  import tempfile
25
26
  from pathlib import Path
26
- import shlex
27
27
  from typing import List
28
+
28
29
  import pandas as pd
29
30
  from pandas import DataFrame
30
31