codeclone 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeclone/__init__.py +16 -0
- codeclone/baseline.py +8 -0
- codeclone/blockhash.py +10 -1
- codeclone/blocks.py +26 -16
- codeclone/cache.py +8 -0
- codeclone/cfg.py +173 -0
- codeclone/cli.py +92 -58
- codeclone/extractor.py +92 -32
- codeclone/fingerprint.py +11 -1
- codeclone/html_report.py +953 -0
- codeclone/normalize.py +50 -26
- codeclone/report.py +25 -9
- codeclone/scanner.py +24 -4
- codeclone-1.1.0.dist-info/METADATA +254 -0
- codeclone-1.1.0.dist-info/RECORD +19 -0
- codeclone-1.0.0.dist-info/METADATA +0 -211
- codeclone-1.0.0.dist-info/RECORD +0 -17
- {codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/WHEEL +0 -0
- {codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/entry_points.txt +0 -0
- {codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/top_level.txt +0 -0
codeclone/__init__.py
CHANGED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from importlib.metadata import version, PackageNotFoundError
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
__version__ = version("codeclone")
|
|
13
|
+
except PackageNotFoundError:
|
|
14
|
+
__version__ = "dev"
|
|
15
|
+
|
|
16
|
+
__all__ = ["__version__"]
|
codeclone/baseline.py
CHANGED
codeclone/blockhash.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from __future__ import annotations
|
|
2
10
|
|
|
3
11
|
import ast
|
|
@@ -5,8 +13,9 @@ import hashlib
|
|
|
5
13
|
|
|
6
14
|
from .normalize import NormalizationConfig, AstNormalizer
|
|
7
15
|
|
|
16
|
+
|
|
8
17
|
def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
|
|
9
18
|
normalizer = AstNormalizer(cfg)
|
|
10
19
|
stmt = ast.fix_missing_locations(normalizer.visit(stmt))
|
|
11
20
|
dump = ast.dump(stmt, annotate_fields=True, include_attributes=False)
|
|
12
|
-
return hashlib.sha1(dump.encode("utf-8")).hexdigest()
|
|
21
|
+
return hashlib.sha1(dump.encode("utf-8")).hexdigest()
|
codeclone/blocks.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from __future__ import annotations
|
|
2
10
|
|
|
3
11
|
import ast
|
|
@@ -18,13 +26,13 @@ class BlockUnit:
|
|
|
18
26
|
|
|
19
27
|
|
|
20
28
|
def extract_blocks(
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
29
|
+
func_node: ast.AST,
|
|
30
|
+
*,
|
|
31
|
+
filepath: str,
|
|
32
|
+
qualname: str,
|
|
33
|
+
cfg: NormalizationConfig,
|
|
34
|
+
block_size: int,
|
|
35
|
+
max_blocks: int,
|
|
28
36
|
) -> list[BlockUnit]:
|
|
29
37
|
body = getattr(func_node, "body", None)
|
|
30
38
|
if not isinstance(body, list) or len(body) < block_size:
|
|
@@ -45,16 +53,18 @@ def extract_blocks(
|
|
|
45
53
|
if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
|
|
46
54
|
continue
|
|
47
55
|
|
|
48
|
-
bh = "|".join(stmt_hashes[i:i + block_size])
|
|
56
|
+
bh = "|".join(stmt_hashes[i : i + block_size])
|
|
49
57
|
|
|
50
|
-
blocks.append(
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
+
blocks.append(
|
|
59
|
+
BlockUnit(
|
|
60
|
+
block_hash=bh,
|
|
61
|
+
filepath=filepath,
|
|
62
|
+
qualname=qualname,
|
|
63
|
+
start_line=start,
|
|
64
|
+
end_line=end,
|
|
65
|
+
size=block_size,
|
|
66
|
+
)
|
|
67
|
+
)
|
|
58
68
|
|
|
59
69
|
last_start = start
|
|
60
70
|
if len(blocks) >= max_blocks:
|
codeclone/cache.py
CHANGED
codeclone/cfg.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Iterable
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# =========================
|
|
17
|
+
# Core CFG structures
|
|
18
|
+
# =========================
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(eq=False)
|
|
22
|
+
class Block:
|
|
23
|
+
id: int
|
|
24
|
+
statements: list[ast.stmt] = field(default_factory=list)
|
|
25
|
+
successors: set["Block"] = field(default_factory=set)
|
|
26
|
+
is_terminated: bool = False
|
|
27
|
+
|
|
28
|
+
def add_successor(self, block: Block) -> None:
|
|
29
|
+
self.successors.add(block)
|
|
30
|
+
|
|
31
|
+
def __hash__(self) -> int:
|
|
32
|
+
return hash(self.id)
|
|
33
|
+
|
|
34
|
+
def __eq__(self, other: object) -> bool:
|
|
35
|
+
return isinstance(other, Block) and self.id == other.id
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
|
|
39
|
+
class CFG:
|
|
40
|
+
qualname: str
|
|
41
|
+
blocks: list[Block] = field(default_factory=list)
|
|
42
|
+
|
|
43
|
+
entry: Block = field(init=False)
|
|
44
|
+
exit: Block = field(init=False)
|
|
45
|
+
|
|
46
|
+
def __post_init__(self) -> None:
|
|
47
|
+
self.entry = self.create_block()
|
|
48
|
+
self.exit = self.create_block()
|
|
49
|
+
|
|
50
|
+
def create_block(self) -> Block:
|
|
51
|
+
block = Block(id=len(self.blocks))
|
|
52
|
+
self.blocks.append(block)
|
|
53
|
+
return block
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# =========================
|
|
57
|
+
# CFG Builder
|
|
58
|
+
# =========================
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class CFGBuilder:
|
|
62
|
+
def __init__(self) -> None:
|
|
63
|
+
self.cfg: CFG
|
|
64
|
+
self.current: Block
|
|
65
|
+
|
|
66
|
+
def build(
|
|
67
|
+
self,
|
|
68
|
+
qualname: str,
|
|
69
|
+
node: ast.FunctionDef | ast.AsyncFunctionDef,
|
|
70
|
+
) -> CFG:
|
|
71
|
+
self.cfg = CFG(qualname)
|
|
72
|
+
self.current = self.cfg.entry
|
|
73
|
+
|
|
74
|
+
self._visit_statements(node.body)
|
|
75
|
+
|
|
76
|
+
if not self.current.is_terminated:
|
|
77
|
+
self.current.add_successor(self.cfg.exit)
|
|
78
|
+
|
|
79
|
+
return self.cfg
|
|
80
|
+
|
|
81
|
+
# ---------- Internals ----------
|
|
82
|
+
|
|
83
|
+
def _visit_statements(self, stmts: Iterable[ast.stmt]) -> None:
|
|
84
|
+
for stmt in stmts:
|
|
85
|
+
if self.current.is_terminated:
|
|
86
|
+
break
|
|
87
|
+
self._visit(stmt)
|
|
88
|
+
|
|
89
|
+
def _visit(self, stmt: ast.stmt) -> None:
|
|
90
|
+
match stmt:
|
|
91
|
+
case ast.Return():
|
|
92
|
+
self.current.statements.append(stmt)
|
|
93
|
+
self.current.is_terminated = True
|
|
94
|
+
self.current.add_successor(self.cfg.exit)
|
|
95
|
+
|
|
96
|
+
case ast.Raise():
|
|
97
|
+
self.current.statements.append(stmt)
|
|
98
|
+
self.current.is_terminated = True
|
|
99
|
+
self.current.add_successor(self.cfg.exit)
|
|
100
|
+
|
|
101
|
+
case ast.If():
|
|
102
|
+
self._visit_if(stmt)
|
|
103
|
+
|
|
104
|
+
case ast.While():
|
|
105
|
+
self._visit_while(stmt)
|
|
106
|
+
|
|
107
|
+
case ast.For():
|
|
108
|
+
self._visit_for(stmt)
|
|
109
|
+
|
|
110
|
+
case _:
|
|
111
|
+
self.current.statements.append(stmt)
|
|
112
|
+
|
|
113
|
+
# ---------- Control Flow ----------
|
|
114
|
+
|
|
115
|
+
def _visit_if(self, stmt: ast.If) -> None:
|
|
116
|
+
self.current.statements.append(ast.Expr(value=stmt.test))
|
|
117
|
+
|
|
118
|
+
then_block = self.cfg.create_block()
|
|
119
|
+
else_block = self.cfg.create_block()
|
|
120
|
+
after_block = self.cfg.create_block()
|
|
121
|
+
|
|
122
|
+
self.current.add_successor(then_block)
|
|
123
|
+
self.current.add_successor(else_block)
|
|
124
|
+
|
|
125
|
+
self.current = then_block
|
|
126
|
+
self._visit_statements(stmt.body)
|
|
127
|
+
if not self.current.is_terminated:
|
|
128
|
+
self.current.add_successor(after_block)
|
|
129
|
+
|
|
130
|
+
self.current = else_block
|
|
131
|
+
self._visit_statements(stmt.orelse)
|
|
132
|
+
if not self.current.is_terminated:
|
|
133
|
+
self.current.add_successor(after_block)
|
|
134
|
+
|
|
135
|
+
self.current = after_block
|
|
136
|
+
|
|
137
|
+
def _visit_while(self, stmt: ast.While) -> None:
|
|
138
|
+
cond_block = self.cfg.create_block()
|
|
139
|
+
body_block = self.cfg.create_block()
|
|
140
|
+
after_block = self.cfg.create_block()
|
|
141
|
+
|
|
142
|
+
self.current.add_successor(cond_block)
|
|
143
|
+
|
|
144
|
+
self.current = cond_block
|
|
145
|
+
self.current.statements.append(ast.Expr(value=stmt.test))
|
|
146
|
+
self.current.add_successor(body_block)
|
|
147
|
+
self.current.add_successor(after_block)
|
|
148
|
+
|
|
149
|
+
self.current = body_block
|
|
150
|
+
self._visit_statements(stmt.body)
|
|
151
|
+
if not self.current.is_terminated:
|
|
152
|
+
self.current.add_successor(cond_block)
|
|
153
|
+
|
|
154
|
+
self.current = after_block
|
|
155
|
+
|
|
156
|
+
def _visit_for(self, stmt: ast.For) -> None:
|
|
157
|
+
iter_block = self.cfg.create_block()
|
|
158
|
+
body_block = self.cfg.create_block()
|
|
159
|
+
after_block = self.cfg.create_block()
|
|
160
|
+
|
|
161
|
+
self.current.add_successor(iter_block)
|
|
162
|
+
|
|
163
|
+
self.current = iter_block
|
|
164
|
+
self.current.statements.append(ast.Expr(value=stmt.iter))
|
|
165
|
+
self.current.add_successor(body_block)
|
|
166
|
+
self.current.add_successor(after_block)
|
|
167
|
+
|
|
168
|
+
self.current = body_block
|
|
169
|
+
self._visit_statements(stmt.body)
|
|
170
|
+
if not self.current.is_terminated:
|
|
171
|
+
self.current.add_successor(iter_block)
|
|
172
|
+
|
|
173
|
+
self.current = after_block
|
codeclone/cli.py
CHANGED
|
@@ -1,39 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from __future__ import annotations
|
|
2
10
|
|
|
3
11
|
import argparse
|
|
12
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
4
13
|
from pathlib import Path
|
|
5
14
|
|
|
6
15
|
from .baseline import Baseline
|
|
7
16
|
from .cache import Cache, file_stat_signature
|
|
8
17
|
from .extractor import extract_units_from_source
|
|
18
|
+
from .html_report import build_html_report
|
|
9
19
|
from .normalize import NormalizationConfig
|
|
10
20
|
from .report import build_groups, build_block_groups, to_json, to_text
|
|
11
21
|
from .scanner import iter_py_files, module_name_from_path
|
|
12
22
|
|
|
13
23
|
|
|
14
|
-
def
|
|
24
|
+
def process_file(
|
|
25
|
+
filepath: str,
|
|
26
|
+
root: str,
|
|
27
|
+
cfg: NormalizationConfig,
|
|
28
|
+
min_loc: int,
|
|
29
|
+
min_stmt: int,
|
|
30
|
+
) -> tuple[str, dict, list, list] | None:
|
|
31
|
+
try:
|
|
32
|
+
source = Path(filepath).read_text("utf-8")
|
|
33
|
+
except UnicodeDecodeError:
|
|
34
|
+
return None
|
|
35
|
+
|
|
36
|
+
stat = file_stat_signature(filepath)
|
|
37
|
+
module_name = module_name_from_path(root, filepath)
|
|
38
|
+
|
|
39
|
+
units, blocks = extract_units_from_source(
|
|
40
|
+
source=source,
|
|
41
|
+
filepath=filepath,
|
|
42
|
+
module_name=module_name,
|
|
43
|
+
cfg=cfg,
|
|
44
|
+
min_loc=min_loc,
|
|
45
|
+
min_stmt=min_stmt,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
return filepath, stat, units, blocks
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main() -> None:
|
|
15
52
|
ap = argparse.ArgumentParser("codeclone")
|
|
16
53
|
ap.add_argument("root", help="Project root")
|
|
54
|
+
ap.add_argument("--processes", type=int, default=4)
|
|
17
55
|
ap.add_argument("--cache", default="~/.cache/codeclone/")
|
|
18
56
|
ap.add_argument("--min-loc", type=int, default=15)
|
|
19
57
|
ap.add_argument("--min-stmt", type=int, default=6)
|
|
20
58
|
ap.add_argument("--json-out", default="")
|
|
21
59
|
ap.add_argument("--text-out", default="")
|
|
60
|
+
ap.add_argument("--html-out", default="")
|
|
22
61
|
ap.add_argument("--fail-if-groups", type=int, default=-1)
|
|
23
62
|
ap.add_argument("--baseline", default="~/.config/codeclone/baseline.json")
|
|
24
|
-
ap.add_argument("--update-baseline", action="store_true"
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
help="Fail if new clones appear vs baseline")
|
|
63
|
+
ap.add_argument("--update-baseline", action="store_true")
|
|
64
|
+
ap.add_argument("--fail-on-new", action="store_true")
|
|
65
|
+
|
|
28
66
|
args = ap.parse_args()
|
|
29
67
|
|
|
30
|
-
cfg = NormalizationConfig(
|
|
31
|
-
ignore_docstrings=True,
|
|
32
|
-
ignore_type_annotations=True,
|
|
33
|
-
normalize_attributes=True,
|
|
34
|
-
normalize_constants=True,
|
|
35
|
-
normalize_names=True,
|
|
36
|
-
)
|
|
68
|
+
cfg = NormalizationConfig()
|
|
37
69
|
|
|
38
70
|
cache = Cache(args.cache)
|
|
39
71
|
cache.load()
|
|
@@ -42,6 +74,8 @@ def main():
|
|
|
42
74
|
all_blocks: list[dict] = []
|
|
43
75
|
changed = 0
|
|
44
76
|
|
|
77
|
+
files_to_process: list[str] = []
|
|
78
|
+
|
|
45
79
|
for fp in iter_py_files(args.root):
|
|
46
80
|
stat = file_stat_signature(fp)
|
|
47
81
|
cached = cache.get_file_entry(fp)
|
|
@@ -49,32 +83,52 @@ def main():
|
|
|
49
83
|
if cached and cached.get("stat") == stat:
|
|
50
84
|
all_units.extend(cached.get("units", []))
|
|
51
85
|
all_blocks.extend(cached.get("blocks", []))
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
86
|
+
else:
|
|
87
|
+
files_to_process.append(fp)
|
|
88
|
+
|
|
89
|
+
with ProcessPoolExecutor(max_workers=args.processes) as executor:
|
|
90
|
+
futures = [
|
|
91
|
+
executor.submit(
|
|
92
|
+
process_file,
|
|
93
|
+
fp,
|
|
94
|
+
args.root,
|
|
95
|
+
cfg,
|
|
96
|
+
args.min_loc,
|
|
97
|
+
args.min_stmt,
|
|
98
|
+
)
|
|
99
|
+
for fp in files_to_process
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
for future in futures:
|
|
103
|
+
result = future.result()
|
|
104
|
+
if result is None:
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
fp, stat, units, blocks = result
|
|
108
|
+
|
|
109
|
+
cache.put_file_entry(fp, stat, units, blocks)
|
|
110
|
+
changed += 1
|
|
111
|
+
|
|
112
|
+
all_units.extend([u.__dict__ for u in units])
|
|
113
|
+
all_blocks.extend([b.__dict__ for b in blocks])
|
|
74
114
|
|
|
75
115
|
func_groups = build_groups(all_units)
|
|
76
116
|
block_groups = build_block_groups(all_blocks)
|
|
77
117
|
|
|
118
|
+
if args.html_out:
|
|
119
|
+
out = Path(args.html_out)
|
|
120
|
+
out.parent.mkdir(parents=True, exist_ok=True)
|
|
121
|
+
out.write_text(
|
|
122
|
+
build_html_report(
|
|
123
|
+
func_groups=func_groups,
|
|
124
|
+
block_groups=block_groups,
|
|
125
|
+
title="CodeClone Report",
|
|
126
|
+
context_lines=3,
|
|
127
|
+
max_snippet_lines=220,
|
|
128
|
+
),
|
|
129
|
+
"utf-8",
|
|
130
|
+
)
|
|
131
|
+
|
|
78
132
|
baseline = Baseline(args.baseline)
|
|
79
133
|
baseline.load()
|
|
80
134
|
|
|
@@ -91,10 +145,7 @@ def main():
|
|
|
91
145
|
out = Path(args.json_out)
|
|
92
146
|
out.parent.mkdir(parents=True, exist_ok=True)
|
|
93
147
|
out.write_text(
|
|
94
|
-
to_json({
|
|
95
|
-
"functions": func_groups,
|
|
96
|
-
"blocks": block_groups,
|
|
97
|
-
}),
|
|
148
|
+
to_json({"functions": func_groups, "blocks": block_groups}),
|
|
98
149
|
"utf-8",
|
|
99
150
|
)
|
|
100
151
|
|
|
@@ -114,26 +165,9 @@ def main():
|
|
|
114
165
|
print(f"Function clone groups: {len(func_groups)}")
|
|
115
166
|
print(f"Block clone groups: {len(block_groups)}")
|
|
116
167
|
|
|
117
|
-
if args.fail_on_new:
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if new_func:
|
|
122
|
-
print(f"New FUNCTION clone groups: {len(new_func)}")
|
|
123
|
-
for k in sorted(new_func):
|
|
124
|
-
print(f" - {k}")
|
|
125
|
-
|
|
126
|
-
if new_block:
|
|
127
|
-
print(f"New BLOCK clone groups: {len(new_block)}")
|
|
128
|
-
for k in sorted(new_block):
|
|
129
|
-
print(f" - {k}")
|
|
130
|
-
|
|
131
|
-
raise SystemExit(3)
|
|
132
|
-
|
|
133
|
-
print(f"Baseline function clones: {len(baseline.functions)}")
|
|
134
|
-
print(f"Baseline block clones: {len(baseline.blocks)}")
|
|
135
|
-
print(f"New function clones: {len(new_func)}")
|
|
136
|
-
print(f"New block clones: {len(new_block)}")
|
|
168
|
+
if args.fail_on_new and (new_func or new_block):
|
|
169
|
+
print("\n❌ New code clones detected\n")
|
|
170
|
+
raise SystemExit(3)
|
|
137
171
|
|
|
138
172
|
cache.save()
|
|
139
173
|
|
codeclone/extractor.py
CHANGED
|
@@ -1,11 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from __future__ import annotations
|
|
2
10
|
|
|
3
11
|
import ast
|
|
4
12
|
from dataclasses import dataclass
|
|
13
|
+
from typing import Sequence
|
|
5
14
|
|
|
6
15
|
from .blocks import extract_blocks, BlockUnit
|
|
16
|
+
from .cfg import CFGBuilder
|
|
7
17
|
from .fingerprint import sha1, bucket_loc
|
|
8
|
-
from .normalize import NormalizationConfig,
|
|
18
|
+
from .normalize import NormalizationConfig, normalized_ast_dump_from_list
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# =========================
|
|
22
|
+
# Data structures
|
|
23
|
+
# =========================
|
|
9
24
|
|
|
10
25
|
|
|
11
26
|
@dataclass(frozen=True)
|
|
@@ -20,37 +35,83 @@ class Unit:
|
|
|
20
35
|
loc_bucket: str
|
|
21
36
|
|
|
22
37
|
|
|
38
|
+
# =========================
|
|
39
|
+
# Helpers
|
|
40
|
+
# =========================
|
|
41
|
+
|
|
42
|
+
|
|
23
43
|
def _stmt_count(node: ast.AST) -> int:
|
|
24
44
|
body = getattr(node, "body", None)
|
|
25
45
|
return len(body) if isinstance(body, list) else 0
|
|
26
46
|
|
|
27
47
|
|
|
28
48
|
class _QualnameBuilder(ast.NodeVisitor):
|
|
29
|
-
def __init__(self):
|
|
49
|
+
def __init__(self) -> None:
|
|
30
50
|
self.stack: list[str] = []
|
|
31
|
-
self.units: list[tuple[str, ast.
|
|
51
|
+
self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
|
|
32
52
|
|
|
33
|
-
def visit_ClassDef(self, node: ast.ClassDef):
|
|
53
|
+
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
|
34
54
|
self.stack.append(node.name)
|
|
35
55
|
self.generic_visit(node)
|
|
36
56
|
self.stack.pop()
|
|
37
57
|
|
|
38
|
-
def visit_FunctionDef(self, node: ast.FunctionDef):
|
|
58
|
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
|
39
59
|
name = ".".join(self.stack + [node.name]) if self.stack else node.name
|
|
40
60
|
self.units.append((name, node))
|
|
41
61
|
|
|
42
|
-
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
|
|
62
|
+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
|
43
63
|
name = ".".join(self.stack + [node.name]) if self.stack else node.name
|
|
44
64
|
self.units.append((name, node))
|
|
45
65
|
|
|
46
66
|
|
|
67
|
+
# =========================
|
|
68
|
+
# CFG fingerprinting
|
|
69
|
+
# =========================
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_cfg_fingerprint(
|
|
73
|
+
node: ast.FunctionDef | ast.AsyncFunctionDef,
|
|
74
|
+
cfg: NormalizationConfig,
|
|
75
|
+
qualname: str,
|
|
76
|
+
) -> str:
|
|
77
|
+
"""
|
|
78
|
+
Build CFG, normalize it into a canonical form, and hash it.
|
|
79
|
+
"""
|
|
80
|
+
builder = CFGBuilder()
|
|
81
|
+
graph = builder.build(qualname, node)
|
|
82
|
+
|
|
83
|
+
parts: list[str] = []
|
|
84
|
+
|
|
85
|
+
# Stable order for deterministic hash
|
|
86
|
+
for block in sorted(graph.blocks, key=lambda b: b.id):
|
|
87
|
+
# NOTE: normalized_ast_dump_from_list must accept Sequence[ast.AST] (covariant),
|
|
88
|
+
# but even if it still accepts list[ast.AST], passing list[ast.stmt] will fail
|
|
89
|
+
# due to invariance. We pass as Sequence[ast.AST] via a typed view.
|
|
90
|
+
stmts_as_ast: Sequence[ast.AST] = block.statements
|
|
91
|
+
normalized_stmts = normalized_ast_dump_from_list(stmts_as_ast, cfg)
|
|
92
|
+
|
|
93
|
+
successor_ids = sorted(succ.id for succ in block.successors)
|
|
94
|
+
|
|
95
|
+
parts.append(
|
|
96
|
+
f"BLOCK[{block.id}]:{normalized_stmts}"
|
|
97
|
+
f"|SUCCESSORS:{','.join(map(str, successor_ids))}"
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
return sha1("|".join(parts))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# =========================
|
|
104
|
+
# Public API
|
|
105
|
+
# =========================
|
|
106
|
+
|
|
107
|
+
|
|
47
108
|
def extract_units_from_source(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
109
|
+
source: str,
|
|
110
|
+
filepath: str,
|
|
111
|
+
module_name: str,
|
|
112
|
+
cfg: NormalizationConfig,
|
|
113
|
+
min_loc: int,
|
|
114
|
+
min_stmt: int,
|
|
54
115
|
) -> tuple[list[Unit], list[BlockUnit]]:
|
|
55
116
|
try:
|
|
56
117
|
tree = ast.parse(source)
|
|
@@ -66,6 +127,7 @@ def extract_units_from_source(
|
|
|
66
127
|
for local_name, node in qb.units:
|
|
67
128
|
start = getattr(node, "lineno", None)
|
|
68
129
|
end = getattr(node, "end_lineno", None)
|
|
130
|
+
|
|
69
131
|
if not start or not end or end < start:
|
|
70
132
|
continue
|
|
71
133
|
|
|
@@ -76,26 +138,24 @@ def extract_units_from_source(
|
|
|
76
138
|
continue
|
|
77
139
|
|
|
78
140
|
qualname = f"{module_name}:{local_name}"
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
and stmt_count >= 10
|
|
98
|
-
):
|
|
141
|
+
fingerprint = get_cfg_fingerprint(node, cfg, qualname)
|
|
142
|
+
|
|
143
|
+
# Function-level unit (including __init__)
|
|
144
|
+
units.append(
|
|
145
|
+
Unit(
|
|
146
|
+
qualname=qualname,
|
|
147
|
+
filepath=filepath,
|
|
148
|
+
start_line=start,
|
|
149
|
+
end_line=end,
|
|
150
|
+
loc=loc,
|
|
151
|
+
stmt_count=stmt_count,
|
|
152
|
+
fingerprint=fingerprint,
|
|
153
|
+
loc_bucket=bucket_loc(loc),
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Block-level units (exclude __init__)
|
|
158
|
+
if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
|
|
99
159
|
blocks = extract_blocks(
|
|
100
160
|
node,
|
|
101
161
|
filepath=filepath,
|