codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- codeclone/__init__.py +1 -1
- codeclone/baseline.py +44 -14
- codeclone/blockhash.py +1 -1
- codeclone/blocks.py +4 -3
- codeclone/cache.py +154 -17
- codeclone/cfg.py +128 -38
- codeclone/cfg_model.py +47 -0
- codeclone/cli.py +524 -100
- codeclone/errors.py +27 -0
- codeclone/extractor.py +101 -24
- codeclone/html_report.py +230 -691
- codeclone/normalize.py +43 -13
- codeclone/py.typed +0 -0
- codeclone/report.py +23 -12
- codeclone/scanner.py +66 -3
- codeclone/templates.py +1262 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/METADATA +62 -34
- codeclone-1.2.1.dist-info/RECORD +23 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/WHEEL +1 -1
- codeclone-1.1.0.dist-info/RECORD +0 -19
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/entry_points.txt +0 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/top_level.txt +0 -0
codeclone/errors.py
ADDED
@@ -0,0 +1,27 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
+
+
+class CodeCloneError(Exception):
+    """Base exception for CodeClone."""
+
+
+class FileProcessingError(CodeCloneError):
+    """Error processing a source file."""
+
+
+class ParseError(FileProcessingError):
+    """AST parsing failed."""
+
+
+class ValidationError(CodeCloneError):
+    """Input validation failed."""
+
+
+class CacheError(CodeCloneError):
+    """Cache operation failed."""
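The new module gives downstream code a single base class to catch. A minimal sketch of the hierarchy in use (assumes codeclone 1.2.1 is installed; the raise inside the try block is illustrative, not a real CodeClone call):

# Illustrative only: ParseError subclasses FileProcessingError, which
# subclasses CodeCloneError, so any of the three handlers below would match.
from codeclone.errors import CodeCloneError, FileProcessingError, ParseError

try:
    raise ParseError("Failed to parse example.py: invalid syntax")
except ParseError as exc:
    print(f"parse failure: {exc}")
except FileProcessingError as exc:
    print(f"file-level failure: {exc}")
except CodeCloneError as exc:
    print(f"any CodeClone failure: {exc}")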
codeclone/extractor.py
CHANGED
@@ -9,21 +9,24 @@ Licensed under the MIT License.
 from __future__ import annotations

 import ast
+import os
+import signal
+from collections.abc import Iterator
+from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Sequence

-from .blocks import
+from .blocks import BlockUnit, extract_blocks
 from .cfg import CFGBuilder
-from .
+from .errors import ParseError
+from .fingerprint import bucket_loc, sha1
 from .normalize import NormalizationConfig, normalized_ast_dump_from_list

-
 # =========================
 # Data structures
 # =========================


-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class Unit:
     qualname: str
     filepath: str
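For readers unfamiliar with the `slots=True` addition on `Unit` above, a generic sketch of what a frozen, slotted dataclass enforces (plain Python, not CodeClone code; `slots=True` requires Python 3.10+):

# Plain-Python illustration (not CodeClone code).
from dataclasses import dataclass

@dataclass(frozen=True, slots=True)
class Point:
    x: int
    y: int

p = Point(1, 2)
# p.x = 3         -> raises dataclasses.FrozenInstanceError (frozen=True)
# p.label = "hub" -> raises AttributeError: no per-instance __dict__ (slots=True)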
@@ -39,6 +42,67 @@ class Unit:
 # Helpers
 # =========================

+PARSE_TIMEOUT_SECONDS = 5
+
+
+class _ParseTimeoutError(Exception):
+    pass
+
+
+@contextmanager
+def _parse_limits(timeout_s: int) -> Iterator[None]:
+    if os.name != "posix" or timeout_s <= 0:
+        yield
+        return
+
+    old_handler = signal.getsignal(signal.SIGALRM)
+
+    def _timeout_handler(_signum: int, _frame: object) -> None:
+        raise _ParseTimeoutError("AST parsing timeout")
+
+    old_limits: tuple[int, int] | None = None
+    try:
+        signal.signal(signal.SIGALRM, _timeout_handler)
+        signal.setitimer(signal.ITIMER_REAL, timeout_s)
+
+        try:
+            import resource
+
+            old_limits = resource.getrlimit(resource.RLIMIT_CPU)
+            soft, hard = old_limits
+            new_soft = (
+                min(timeout_s, soft) if soft != resource.RLIM_INFINITY else timeout_s
+            )
+            new_hard = (
+                min(timeout_s + 1, hard)
+                if hard != resource.RLIM_INFINITY
+                else timeout_s + 1
+            )
+            resource.setrlimit(resource.RLIMIT_CPU, (new_soft, new_hard))
+        except Exception:
+            # If resource is unavailable or cannot be set, rely on alarm only.
+            pass
+
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+        signal.signal(signal.SIGALRM, old_handler)
+        if old_limits is not None:
+            try:
+                import resource
+
+                resource.setrlimit(resource.RLIMIT_CPU, old_limits)
+            except Exception:
+                pass
+
+
+def _parse_with_limits(source: str, timeout_s: int) -> ast.AST:
+    try:
+        with _parse_limits(timeout_s):
+            return ast.parse(source)
+    except _ParseTimeoutError as e:
+        raise ParseError(str(e)) from e
+

 def _stmt_count(node: ast.AST) -> int:
     body = getattr(node, "body", None)
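A usage sketch for the new helper (assumes codeclone 1.2.1 on a POSIX system; `_parse_with_limits` is a private helper, so the direct import is for illustration only):

# Illustration only: _parse_with_limits is private API and may change.
import ast
from codeclone.errors import ParseError
from codeclone.extractor import PARSE_TIMEOUT_SECONDS, _parse_with_limits

try:
    tree = _parse_with_limits("def f():\n    return 1\n", PARSE_TIMEOUT_SECONDS)
    print(isinstance(tree, ast.Module))  # True
except ParseError as exc:
    # Raised only if the SIGALRM timer fires before ast.parse() finishes.
    print(f"parse timed out: {exc}")

Note that the guard only arms itself on POSIX and in the main thread (where SIGALRM handlers can be installed); elsewhere `_parse_limits` yields immediately and parsing runs unbounded.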
@@ -46,6 +110,8 @@ def _stmt_count(node: ast.AST) -> int:


 class _QualnameBuilder(ast.NodeVisitor):
+    __slots__ = ("stack", "units")
+
     def __init__(self) -> None:
         self.stack: list[str] = []
         self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
@@ -56,11 +122,11 @@ class _QualnameBuilder(ast.NodeVisitor):
         self.stack.pop()

     def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
-        name = ".".join(self.stack
+        name = ".".join([*self.stack, node.name]) if self.stack else node.name
         self.units.append((name, node))

     def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
-        name = ".".join(self.stack
+        name = ".".join([*self.stack, node.name]) if self.stack else node.name
         self.units.append((name, node))


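A hypothetical illustration of the dotted-name construction introduced above (the scope-stack values are made up):

# Hypothetical values; mirrors the expression used in visit_FunctionDef above.
stack = ["Outer", "method"]  # enclosing class/function names on the visitor stack
node_name = "inner"          # name of the FunctionDef being visited
qualname = ".".join([*stack, node_name]) if stack else node_name
print(qualname)  # Outer.method.inner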
@@ -75,28 +141,39 @@ def get_cfg_fingerprint(
     qualname: str,
 ) -> str:
     """
-
+    Generate a structural fingerprint for a function using CFG analysis.
+
+    The fingerprint is computed by:
+    1. Building a Control Flow Graph (CFG) from the function
+    2. Normalizing each CFG block's statements (variable names, constants, etc.)
+    3. Creating a canonical representation of the CFG structure
+    4. Hashing the representation with SHA-1
+
+    Functions with identical control flow and normalized statements will
+    produce the same fingerprint, even if they differ in variable names,
+    constants, or type annotations.
+
+    Args:
+        node: Function AST node to fingerprint
+        cfg: Normalization configuration (what to ignore)
+        qualname: Qualified name for logging/debugging
+
+    Returns:
+        40-character hex SHA-1 hash of the normalized CFG
     """
     builder = CFGBuilder()
     graph = builder.build(qualname, node)

+    # Use generator to avoid building large list of strings
     parts: list[str] = []
-
-    # Stable order for deterministic hash
     for block in sorted(graph.blocks, key=lambda b: b.id):
-
-
-
-        stmts_as_ast: Sequence[ast.AST] = block.statements
-        normalized_stmts = normalized_ast_dump_from_list(stmts_as_ast, cfg)
-
-        successor_ids = sorted(succ.id for succ in block.successors)
-
+        succ_ids = ",".join(
+            str(s.id) for s in sorted(block.successors, key=lambda s: s.id)
+        )
         parts.append(
-            f"BLOCK[{block.id}]:{
-            f"|SUCCESSORS:{
+            f"BLOCK[{block.id}]:{normalized_ast_dump_from_list(block.statements, cfg)}"
+            f"|SUCCESSORS:{succ_ids}"
         )
-
     return sha1("|".join(parts))


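A sketch of the behaviour the new docstring promises: two functions that differ only in variable names should hash to the same fingerprint. This assumes `NormalizationConfig()` can be constructed with defaults and that `get_cfg_fingerprint` takes its arguments in the order listed in the Args section; neither is confirmed by this hunk.

# Sketch only: argument order and NormalizationConfig defaults are assumptions.
import ast

from codeclone.extractor import get_cfg_fingerprint
from codeclone.normalize import NormalizationConfig

src_a = "def pay(amount):\n    if amount > 0:\n        return amount * 2\n    return 0\n"
src_b = "def pay(total):\n    if total > 0:\n        return total * 2\n    return 0\n"

cfg = NormalizationConfig()  # assumption: default construction is supported
fp_a = get_cfg_fingerprint(ast.parse(src_a).body[0], cfg, "pay")
fp_b = get_cfg_fingerprint(ast.parse(src_b).body[0], cfg, "pay")

# Per the docstring, identical control flow + normalized statements -> same 40-char hash.
print(fp_a == fp_b)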
@@ -114,9 +191,9 @@ def extract_units_from_source(
     min_stmt: int,
 ) -> tuple[list[Unit], list[BlockUnit]]:
     try:
-        tree =
-    except SyntaxError:
-
+        tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS)
+    except SyntaxError as e:
+        raise ParseError(f"Failed to parse {filepath}: {e}") from e

     qb = _QualnameBuilder()
     qb.visit(tree)