codeclone 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeclone/__init__.py +1 -1
- codeclone/baseline.py +33 -7
- codeclone/blockhash.py +1 -1
- codeclone/blocks.py +4 -3
- codeclone/cache.py +151 -20
- codeclone/cfg.py +53 -128
- codeclone/cfg_model.py +47 -0
- codeclone/cli.py +308 -114
- codeclone/errors.py +27 -0
- codeclone/extractor.py +101 -24
- codeclone/html_report.py +196 -640
- codeclone/normalize.py +21 -14
- codeclone/py.typed +0 -0
- codeclone/report.py +23 -12
- codeclone/scanner.py +66 -3
- codeclone/templates.py +1262 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/METADATA +53 -35
- codeclone-1.2.1.dist-info/RECORD +23 -0
- codeclone-1.2.0.dist-info/RECORD +0 -19
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/WHEEL +0 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/entry_points.txt +0 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/top_level.txt +0 -0
codeclone/__init__.py
CHANGED
codeclone/baseline.py
CHANGED
|
@@ -9,14 +9,19 @@ Licensed under the MIT License.
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import json
|
|
12
|
+
from collections.abc import Mapping
|
|
12
13
|
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class Baseline:
|
|
18
|
+
__slots__ = ("blocks", "functions", "path", "python_version")
|
|
19
|
+
|
|
16
20
|
def __init__(self, path: str | Path):
|
|
17
21
|
self.path = Path(path)
|
|
18
22
|
self.functions: set[str] = set()
|
|
19
23
|
self.blocks: set[str] = set()
|
|
24
|
+
self.python_version: str | None = None
|
|
20
25
|
|
|
21
26
|
def load(self) -> None:
|
|
22
27
|
if not self.path.exists():
|
|
@@ -26,6 +31,10 @@ class Baseline:
|
|
|
26
31
|
data = json.loads(self.path.read_text("utf-8"))
|
|
27
32
|
self.functions = set(data.get("functions", []))
|
|
28
33
|
self.blocks = set(data.get("blocks", []))
|
|
34
|
+
python_version = data.get("python_version")
|
|
35
|
+
self.python_version = (
|
|
36
|
+
python_version if isinstance(python_version, str) else None
|
|
37
|
+
)
|
|
29
38
|
except json.JSONDecodeError as e:
|
|
30
39
|
raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
|
|
31
40
|
|
|
@@ -33,10 +42,7 @@ class Baseline:
|
|
|
33
42
|
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
34
43
|
self.path.write_text(
|
|
35
44
|
json.dumps(
|
|
36
|
-
|
|
37
|
-
"functions": sorted(self.functions),
|
|
38
|
-
"blocks": sorted(self.blocks),
|
|
39
|
-
},
|
|
45
|
+
_baseline_payload(self.functions, self.blocks, self.python_version),
|
|
40
46
|
indent=2,
|
|
41
47
|
ensure_ascii=False,
|
|
42
48
|
),
|
|
@@ -45,14 +51,34 @@ class Baseline:
|
|
|
45
51
|
|
|
46
52
|
@staticmethod
|
|
47
53
|
def from_groups(
|
|
48
|
-
func_groups:
|
|
49
|
-
|
|
54
|
+
func_groups: Mapping[str, object],
|
|
55
|
+
block_groups: Mapping[str, object],
|
|
56
|
+
path: str | Path = "",
|
|
57
|
+
python_version: str | None = None,
|
|
58
|
+
) -> Baseline:
|
|
50
59
|
bl = Baseline(path)
|
|
51
60
|
bl.functions = set(func_groups.keys())
|
|
52
61
|
bl.blocks = set(block_groups.keys())
|
|
62
|
+
bl.python_version = python_version
|
|
53
63
|
return bl
|
|
54
64
|
|
|
55
|
-
def diff(
|
|
65
|
+
def diff(
|
|
66
|
+
self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
|
|
67
|
+
) -> tuple[set[str], set[str]]:
|
|
56
68
|
new_funcs = set(func_groups.keys()) - self.functions
|
|
57
69
|
new_blocks = set(block_groups.keys()) - self.blocks
|
|
58
70
|
return new_funcs, new_blocks
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _baseline_payload(
|
|
74
|
+
functions: set[str],
|
|
75
|
+
blocks: set[str],
|
|
76
|
+
python_version: str | None,
|
|
77
|
+
) -> dict[str, Any]:
|
|
78
|
+
payload: dict[str, Any] = {
|
|
79
|
+
"functions": sorted(functions),
|
|
80
|
+
"blocks": sorted(blocks),
|
|
81
|
+
}
|
|
82
|
+
if python_version:
|
|
83
|
+
payload["python_version"] = python_version
|
|
84
|
+
return payload
|
codeclone/blockhash.py
CHANGED
|
@@ -11,7 +11,7 @@ from __future__ import annotations
|
|
|
11
11
|
import ast
|
|
12
12
|
import hashlib
|
|
13
13
|
|
|
14
|
-
from .normalize import
|
|
14
|
+
from .normalize import AstNormalizer, NormalizationConfig
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
|
codeclone/blocks.py
CHANGED
|
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
|
|
|
15
15
|
from .normalize import NormalizationConfig
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
@dataclass(frozen=True)
|
|
18
|
+
@dataclass(frozen=True, slots=True)
|
|
19
19
|
class BlockUnit:
|
|
20
20
|
block_hash: str
|
|
21
21
|
filepath: str
|
|
@@ -42,7 +42,8 @@ def extract_blocks(
|
|
|
42
42
|
|
|
43
43
|
blocks: list[BlockUnit] = []
|
|
44
44
|
last_start: int | None = None
|
|
45
|
-
|
|
45
|
+
# Allow some overlap (50%), but at least 3 lines apart
|
|
46
|
+
min_line_distance = max(block_size // 2, 3)
|
|
46
47
|
|
|
47
48
|
for i in range(len(stmt_hashes) - block_size + 1):
|
|
48
49
|
start = getattr(body[i], "lineno", None)
|
|
@@ -50,7 +51,7 @@ def extract_blocks(
|
|
|
50
51
|
if not start or not end:
|
|
51
52
|
continue
|
|
52
53
|
|
|
53
|
-
if last_start is not None and start - last_start <
|
|
54
|
+
if last_start is not None and start - last_start < min_line_distance:
|
|
54
55
|
continue
|
|
55
56
|
|
|
56
57
|
bh = "|".join(stmt_hashes[i : i + block_size])
|
codeclone/cache.py
CHANGED
|
@@ -8,47 +8,178 @@ Licensed under the MIT License.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import hashlib
|
|
12
|
+
import hmac
|
|
11
13
|
import json
|
|
12
14
|
import os
|
|
15
|
+
import secrets
|
|
16
|
+
from collections.abc import Mapping
|
|
13
17
|
from dataclasses import asdict
|
|
14
18
|
from pathlib import Path
|
|
15
|
-
from typing import Any,
|
|
19
|
+
from typing import TYPE_CHECKING, Any, TypedDict, cast
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from .blocks import BlockUnit
|
|
23
|
+
from .extractor import Unit
|
|
24
|
+
|
|
25
|
+
from .errors import CacheError
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FileStat(TypedDict):
|
|
29
|
+
mtime_ns: int
|
|
30
|
+
size: int
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class UnitDict(TypedDict):
|
|
34
|
+
qualname: str
|
|
35
|
+
filepath: str
|
|
36
|
+
start_line: int
|
|
37
|
+
end_line: int
|
|
38
|
+
loc: int
|
|
39
|
+
stmt_count: int
|
|
40
|
+
fingerprint: str
|
|
41
|
+
loc_bucket: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class BlockDict(TypedDict):
|
|
45
|
+
block_hash: str
|
|
46
|
+
filepath: str
|
|
47
|
+
qualname: str
|
|
48
|
+
start_line: int
|
|
49
|
+
end_line: int
|
|
50
|
+
size: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class CacheEntry(TypedDict):
|
|
54
|
+
stat: FileStat
|
|
55
|
+
units: list[UnitDict]
|
|
56
|
+
blocks: list[BlockDict]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class CacheData(TypedDict):
|
|
60
|
+
version: str
|
|
61
|
+
files: dict[str, CacheEntry]
|
|
16
62
|
|
|
17
63
|
|
|
18
64
|
class Cache:
|
|
65
|
+
__slots__ = ("data", "load_warning", "path", "secret")
|
|
66
|
+
CACHE_VERSION = "1.0"
|
|
67
|
+
|
|
19
68
|
def __init__(self, path: str | Path):
|
|
20
69
|
self.path = Path(path)
|
|
21
|
-
self.data:
|
|
70
|
+
self.data: CacheData = {"version": self.CACHE_VERSION, "files": {}}
|
|
71
|
+
self.secret = self._load_secret()
|
|
72
|
+
self.load_warning: str | None = None
|
|
22
73
|
|
|
23
|
-
def
|
|
24
|
-
|
|
74
|
+
def _load_secret(self) -> bytes:
|
|
75
|
+
"""Load or create cache signing secret."""
|
|
76
|
+
# Store secret in the same directory as the cache file, named .cache_secret
|
|
77
|
+
# If cache is at ~/.cache/codeclone/cache.json, secret is
|
|
78
|
+
# ~/.cache/codeclone/.cache_secret
|
|
79
|
+
secret_path = self.path.parent / ".cache_secret"
|
|
80
|
+
if secret_path.exists():
|
|
81
|
+
return secret_path.read_bytes()
|
|
82
|
+
else:
|
|
83
|
+
secret = secrets.token_bytes(32)
|
|
25
84
|
try:
|
|
26
|
-
self.
|
|
27
|
-
|
|
28
|
-
#
|
|
29
|
-
|
|
85
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
86
|
+
secret_path.write_bytes(secret)
|
|
87
|
+
# Set restrictive permissions on secret file (Unix only)
|
|
88
|
+
if os.name == "posix":
|
|
89
|
+
secret_path.chmod(0o600)
|
|
90
|
+
except OSError:
|
|
91
|
+
pass
|
|
92
|
+
return secret
|
|
93
|
+
|
|
94
|
+
def _sign_data(self, data: Mapping[str, Any]) -> str:
|
|
95
|
+
"""Create HMAC signature of cache data."""
|
|
96
|
+
# Sort keys for deterministic JSON serialization
|
|
97
|
+
data_str = json.dumps(data, sort_keys=True)
|
|
98
|
+
return hmac.new(self.secret, data_str.encode(), hashlib.sha256).hexdigest()
|
|
99
|
+
|
|
100
|
+
def load(self) -> None:
|
|
101
|
+
if not self.path.exists():
|
|
102
|
+
return
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
raw = json.loads(self.path.read_text("utf-8"))
|
|
106
|
+
stored_sig = raw.get("_signature")
|
|
107
|
+
|
|
108
|
+
# Extract data without signature for verification
|
|
109
|
+
data = {k: v for k, v in raw.items() if k != "_signature"}
|
|
110
|
+
|
|
111
|
+
# Verify signature
|
|
112
|
+
expected_sig = self._sign_data(data)
|
|
113
|
+
if stored_sig != expected_sig:
|
|
114
|
+
self.load_warning = "Cache signature mismatch; ignoring cache."
|
|
115
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
if data.get("version") != self.CACHE_VERSION:
|
|
119
|
+
self.load_warning = (
|
|
120
|
+
"Cache version mismatch "
|
|
121
|
+
f"(found {data.get('version')}); ignoring cache."
|
|
122
|
+
)
|
|
123
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
124
|
+
return
|
|
125
|
+
|
|
126
|
+
# Basic structure check
|
|
127
|
+
if not isinstance(data.get("files"), dict):
|
|
128
|
+
self.load_warning = "Cache format invalid; ignoring cache."
|
|
129
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
130
|
+
return
|
|
131
|
+
|
|
132
|
+
self.data = cast(CacheData, data)
|
|
133
|
+
self.load_warning = None
|
|
134
|
+
|
|
135
|
+
except (json.JSONDecodeError, ValueError):
|
|
136
|
+
self.load_warning = "Cache corrupted; ignoring cache."
|
|
137
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
30
138
|
|
|
31
139
|
def save(self) -> None:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
140
|
+
try:
|
|
141
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
142
|
+
|
|
143
|
+
# Add signature
|
|
144
|
+
data_with_sig = {**self.data, "_signature": self._sign_data(self.data)}
|
|
145
|
+
|
|
146
|
+
self.path.write_text(
|
|
147
|
+
json.dumps(data_with_sig, ensure_ascii=False, indent=2),
|
|
148
|
+
"utf-8",
|
|
149
|
+
)
|
|
150
|
+
except OSError as e:
|
|
151
|
+
raise CacheError(f"Failed to save cache: {e}") from e
|
|
152
|
+
|
|
153
|
+
def get_file_entry(self, filepath: str) -> CacheEntry | None:
|
|
154
|
+
entry = self.data["files"].get(filepath)
|
|
155
|
+
|
|
156
|
+
if entry is None:
|
|
157
|
+
return None
|
|
158
|
+
|
|
159
|
+
if not isinstance(entry, dict):
|
|
160
|
+
return None
|
|
161
|
+
|
|
162
|
+
required = {"stat", "units", "blocks"}
|
|
163
|
+
if not required.issubset(entry.keys()):
|
|
164
|
+
return None
|
|
37
165
|
|
|
38
|
-
|
|
39
|
-
return self.data.get("files", {}).get(filepath)
|
|
166
|
+
return entry
|
|
40
167
|
|
|
41
168
|
def put_file_entry(
|
|
42
|
-
self,
|
|
169
|
+
self,
|
|
170
|
+
filepath: str,
|
|
171
|
+
stat_sig: FileStat,
|
|
172
|
+
units: list[Unit],
|
|
173
|
+
blocks: list[BlockUnit],
|
|
43
174
|
) -> None:
|
|
44
|
-
self.data
|
|
175
|
+
self.data["files"][filepath] = {
|
|
45
176
|
"stat": stat_sig,
|
|
46
|
-
"units": [asdict(u) for u in units],
|
|
47
|
-
"blocks": [asdict(b) for b in blocks],
|
|
177
|
+
"units": cast(list[UnitDict], cast(object, [asdict(u) for u in units])),
|
|
178
|
+
"blocks": cast(list[BlockDict], cast(object, [asdict(b) for b in blocks])),
|
|
48
179
|
}
|
|
49
180
|
|
|
50
181
|
|
|
51
|
-
def file_stat_signature(path: str) ->
|
|
182
|
+
def file_stat_signature(path: str) -> FileStat:
|
|
52
183
|
st = os.stat(path)
|
|
53
184
|
return {
|
|
54
185
|
"mtime_ns": st.st_mtime_ns,
|
codeclone/cfg.py
CHANGED
|
@@ -9,48 +9,21 @@ Licensed under the MIT License.
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import ast
|
|
12
|
-
from
|
|
13
|
-
from typing import
|
|
12
|
+
from collections.abc import Iterable
|
|
13
|
+
from typing import Protocol, cast
|
|
14
14
|
|
|
15
|
+
from .cfg_model import CFG, Block
|
|
15
16
|
|
|
16
|
-
|
|
17
|
-
# Core CFG structures
|
|
18
|
-
# =========================
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@dataclass(eq=False)
|
|
22
|
-
class Block:
|
|
23
|
-
id: int
|
|
24
|
-
statements: list[ast.stmt] = field(default_factory=list)
|
|
25
|
-
successors: set["Block"] = field(default_factory=set)
|
|
26
|
-
is_terminated: bool = False
|
|
27
|
-
|
|
28
|
-
def add_successor(self, block: Block) -> None:
|
|
29
|
-
self.successors.add(block)
|
|
30
|
-
|
|
31
|
-
def __hash__(self) -> int:
|
|
32
|
-
return hash(self.id)
|
|
17
|
+
__all__ = ["CFG", "CFGBuilder"]
|
|
33
18
|
|
|
34
|
-
|
|
35
|
-
return isinstance(other, Block) and self.id == other.id
|
|
19
|
+
TryStar = getattr(ast, "TryStar", ast.Try)
|
|
36
20
|
|
|
37
21
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
entry: Block = field(init=False)
|
|
44
|
-
exit: Block = field(init=False)
|
|
45
|
-
|
|
46
|
-
def __post_init__(self) -> None:
|
|
47
|
-
self.entry = self.create_block()
|
|
48
|
-
self.exit = self.create_block()
|
|
49
|
-
|
|
50
|
-
def create_block(self) -> Block:
|
|
51
|
-
block = Block(id=len(self.blocks))
|
|
52
|
-
self.blocks.append(block)
|
|
53
|
-
return block
|
|
22
|
+
class _TryLike(Protocol):
|
|
23
|
+
body: list[ast.stmt]
|
|
24
|
+
handlers: list[ast.ExceptHandler]
|
|
25
|
+
orelse: list[ast.stmt]
|
|
26
|
+
finalbody: list[ast.stmt]
|
|
54
27
|
|
|
55
28
|
|
|
56
29
|
# =========================
|
|
@@ -59,6 +32,8 @@ class CFG:
|
|
|
59
32
|
|
|
60
33
|
|
|
61
34
|
class CFGBuilder:
|
|
35
|
+
__slots__ = ("cfg", "current")
|
|
36
|
+
|
|
62
37
|
def __init__(self) -> None:
|
|
63
38
|
self.cfg: CFG
|
|
64
39
|
self.current: Block
|
|
@@ -110,8 +85,10 @@ class CFGBuilder:
|
|
|
110
85
|
case ast.AsyncFor():
|
|
111
86
|
self._visit_for(stmt) # Structure is identical to For
|
|
112
87
|
|
|
113
|
-
case ast.Try()
|
|
114
|
-
self._visit_try(stmt)
|
|
88
|
+
case ast.Try():
|
|
89
|
+
self._visit_try(cast(_TryLike, stmt))
|
|
90
|
+
case _ if TryStar is not None and isinstance(stmt, TryStar):
|
|
91
|
+
self._visit_try(cast(_TryLike, stmt))
|
|
115
92
|
|
|
116
93
|
case ast.With() | ast.AsyncWith():
|
|
117
94
|
self._visit_with(stmt)
|
|
@@ -185,7 +162,8 @@ class CFGBuilder:
|
|
|
185
162
|
self.current = after_block
|
|
186
163
|
|
|
187
164
|
def _visit_with(self, stmt: ast.With | ast.AsyncWith) -> None:
|
|
188
|
-
# Treat WITH as linear flow (enter -> body -> exit), but preserve
|
|
165
|
+
# Treat WITH as linear flow (enter -> body -> exit), but preserve
|
|
166
|
+
# block structure
|
|
189
167
|
# We record the context manager expression in the current block
|
|
190
168
|
# Then we enter a new block for the body (to separate it structurally)
|
|
191
169
|
# Then we enter a new block for 'after' (exit)
|
|
@@ -210,126 +188,73 @@ class CFGBuilder:
|
|
|
210
188
|
|
|
211
189
|
self.current = after_block
|
|
212
190
|
|
|
213
|
-
def _visit_try(self, stmt:
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
try_block = self.cfg.create_block()
|
|
219
|
-
self.current.add_successor(try_block)
|
|
220
|
-
|
|
221
|
-
# We don't know WHERE in the try block exception happens, so we assume
|
|
222
|
-
# any point in try block *could* jump to handlers.
|
|
223
|
-
# But for structural hashing, we just process the body.
|
|
224
|
-
# Ideally, we should link the try_block (or its end) to handlers?
|
|
225
|
-
# A simple approximation:
|
|
226
|
-
# 1. Process body.
|
|
227
|
-
# 2. Link entry (or end of body) to handlers?
|
|
228
|
-
# Let's do: Entry -> BodyBlock.
|
|
229
|
-
# Entry -> HandlerBlocks (to represent potential jump).
|
|
230
|
-
|
|
231
|
-
# Actually, let's keep it linear but branched.
|
|
232
|
-
# Current -> TryBody
|
|
233
|
-
# Current -> Handlers (Abstractly representing the jump)
|
|
191
|
+
def _visit_try(self, stmt: _TryLike) -> None:
|
|
192
|
+
try_entry = self.cfg.create_block()
|
|
193
|
+
self.current.add_successor(try_entry)
|
|
194
|
+
self.current = try_entry
|
|
234
195
|
|
|
235
196
|
handlers_blocks = [self.cfg.create_block() for _ in stmt.handlers]
|
|
236
197
|
else_block = self.cfg.create_block() if stmt.orelse else None
|
|
237
|
-
final_block = self.cfg.create_block()
|
|
198
|
+
final_block = self.cfg.create_block()
|
|
238
199
|
|
|
239
|
-
#
|
|
240
|
-
|
|
241
|
-
|
|
200
|
+
# Process each statement in try body
|
|
201
|
+
# Link each to exception handlers
|
|
202
|
+
for stmt_node in stmt.body:
|
|
203
|
+
if self.current.is_terminated:
|
|
204
|
+
break
|
|
205
|
+
|
|
206
|
+
# Current statement could raise exception
|
|
207
|
+
for h_block in handlers_blocks:
|
|
208
|
+
self.current.add_successor(h_block)
|
|
209
|
+
|
|
210
|
+
self._visit(stmt_node)
|
|
242
211
|
|
|
243
|
-
#
|
|
212
|
+
# Normal exit from try
|
|
244
213
|
if not self.current.is_terminated:
|
|
245
214
|
if else_block:
|
|
246
215
|
self.current.add_successor(else_block)
|
|
247
216
|
else:
|
|
248
217
|
self.current.add_successor(final_block)
|
|
249
218
|
|
|
250
|
-
#
|
|
251
|
-
|
|
252
|
-
self.current = else_block
|
|
253
|
-
self._visit_statements(stmt.orelse)
|
|
254
|
-
if not self.current.is_terminated:
|
|
255
|
-
self.current.add_successor(final_block)
|
|
256
|
-
|
|
257
|
-
# Handle Handlers
|
|
258
|
-
# We assume control flow *could* jump from start of Try to any handler
|
|
259
|
-
# (Technically from inside try, but we model structural containment)
|
|
260
|
-
# To make fingerprints stable, we just need to ensure handlers are visited
|
|
261
|
-
# and linked.
|
|
262
|
-
|
|
263
|
-
# We link the *original* predecessor (before try) or the try_block start to handlers?
|
|
264
|
-
# Let's link the `try_block` (as a container concept) to handlers.
|
|
265
|
-
# But `try_block` was mutated by `_visit_statements`.
|
|
266
|
-
# Let's use the `try_block` (start of try) to link to handlers.
|
|
267
|
-
for h_block in handlers_blocks:
|
|
268
|
-
try_block.add_successor(h_block)
|
|
269
|
-
|
|
270
|
-
for handler, h_block in zip(stmt.handlers, handlers_blocks):
|
|
219
|
+
# Process handlers
|
|
220
|
+
for handler, h_block in zip(stmt.handlers, handlers_blocks, strict=True):
|
|
271
221
|
self.current = h_block
|
|
272
|
-
# Record exception type
|
|
273
222
|
if handler.type:
|
|
274
223
|
self.current.statements.append(ast.Expr(value=handler.type))
|
|
224
|
+
|
|
275
225
|
self._visit_statements(handler.body)
|
|
276
226
|
if not self.current.is_terminated:
|
|
277
227
|
self.current.add_successor(final_block)
|
|
278
228
|
|
|
279
|
-
#
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
229
|
+
# Process else
|
|
230
|
+
if else_block:
|
|
231
|
+
self.current = else_block
|
|
232
|
+
self._visit_statements(stmt.orelse)
|
|
233
|
+
if not self.current.is_terminated:
|
|
234
|
+
self.current.add_successor(final_block)
|
|
283
235
|
|
|
236
|
+
# Process finally
|
|
237
|
+
self.current = final_block
|
|
284
238
|
if stmt.finalbody:
|
|
285
|
-
self.current = final_block
|
|
286
239
|
self._visit_statements(stmt.finalbody)
|
|
287
|
-
# And then continue to next code?
|
|
288
|
-
# Yes, finally flows to next statement.
|
|
289
|
-
# Unless terminated.
|
|
290
|
-
|
|
291
|
-
# If no finally, `final_block` is just the merge point (after).
|
|
292
|
-
self.current = final_block
|
|
293
240
|
|
|
294
241
|
def _visit_match(self, stmt: ast.Match) -> None:
|
|
295
|
-
# Match subject -> Cases -> After
|
|
296
|
-
|
|
297
242
|
self.current.statements.append(ast.Expr(value=stmt.subject))
|
|
298
243
|
|
|
299
|
-
after_block = self.cfg.create_block()
|
|
300
|
-
|
|
301
|
-
for case_ in stmt.cases:
|
|
302
|
-
case_block = self.cfg.create_block()
|
|
303
|
-
self.current.add_successor(case_block)
|
|
304
|
-
|
|
305
|
-
# Save current context to restore for next case branching?
|
|
306
|
-
# No, 'current' is the match subject block. It branches to ALL cases.
|
|
307
|
-
|
|
308
|
-
# Visit Case
|
|
309
|
-
# We must set self.current to case_block for visiting body
|
|
310
|
-
# But we lose reference to 'match subject block' to link next case!
|
|
311
|
-
# So we need a variable `subject_block`.
|
|
312
|
-
pass
|
|
313
|
-
|
|
314
|
-
# Re-implementing loop correctly
|
|
315
244
|
subject_block = self.current
|
|
245
|
+
after_block = self.cfg.create_block()
|
|
316
246
|
|
|
317
247
|
for case_ in stmt.cases:
|
|
318
248
|
case_block = self.cfg.create_block()
|
|
319
249
|
subject_block.add_successor(case_block)
|
|
320
250
|
|
|
321
251
|
self.current = case_block
|
|
322
|
-
|
|
323
|
-
#
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
# Ideally: `self.current.statements.append(case_.pattern)` but pattern is not stmt.
|
|
329
|
-
# We can wrap in Expr? `ast.Expr(value=case_.pattern)`?
|
|
330
|
-
# Pattern is NOT an Expr subclass in 3.10. It's `ast.pattern`.
|
|
331
|
-
# So we cannot append it to `statements` list which expects `ast.stmt`.
|
|
332
|
-
# We will ignore pattern structure for now (it's structural flow we care about).
|
|
252
|
+
|
|
253
|
+
# Record pattern structure
|
|
254
|
+
pattern_repr = ast.dump(case_.pattern, annotate_fields=False)
|
|
255
|
+
self.current.statements.append(
|
|
256
|
+
ast.Expr(value=ast.Constant(value=f"PATTERN:{pattern_repr}"))
|
|
257
|
+
)
|
|
333
258
|
|
|
334
259
|
self._visit_statements(case_.body)
|
|
335
260
|
if not self.current.is_terminated:
|
codeclone/cfg_model.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(eq=False, slots=True)
|
|
16
|
+
class Block:
|
|
17
|
+
id: int
|
|
18
|
+
statements: list[ast.stmt] = field(default_factory=list)
|
|
19
|
+
successors: set[Block] = field(default_factory=set)
|
|
20
|
+
is_terminated: bool = False
|
|
21
|
+
|
|
22
|
+
def add_successor(self, block: Block) -> None:
|
|
23
|
+
self.successors.add(block)
|
|
24
|
+
|
|
25
|
+
def __hash__(self) -> int:
|
|
26
|
+
return hash(self.id)
|
|
27
|
+
|
|
28
|
+
def __eq__(self, other: object) -> bool:
|
|
29
|
+
return isinstance(other, Block) and self.id == other.id
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(slots=True)
|
|
33
|
+
class CFG:
|
|
34
|
+
qualname: str
|
|
35
|
+
blocks: list[Block] = field(default_factory=list)
|
|
36
|
+
|
|
37
|
+
entry: Block = field(init=False)
|
|
38
|
+
exit: Block = field(init=False)
|
|
39
|
+
|
|
40
|
+
def __post_init__(self) -> None:
|
|
41
|
+
self.entry = self.create_block()
|
|
42
|
+
self.exit = self.create_block()
|
|
43
|
+
|
|
44
|
+
def create_block(self) -> Block:
|
|
45
|
+
block = Block(id=len(self.blocks))
|
|
46
|
+
self.blocks.append(block)
|
|
47
|
+
return block
|