codeclone 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeclone/__init__.py +16 -0
- codeclone/baseline.py +21 -9
- codeclone/blockhash.py +10 -1
- codeclone/blocks.py +26 -16
- codeclone/cache.py +20 -6
- codeclone/cfg.py +338 -0
- codeclone/cli.py +357 -93
- codeclone/extractor.py +92 -32
- codeclone/fingerprint.py +11 -1
- codeclone/html_report.py +936 -0
- codeclone/normalize.py +73 -26
- codeclone/report.py +29 -13
- codeclone/scanner.py +24 -4
- codeclone-1.2.0.dist-info/METADATA +264 -0
- codeclone-1.2.0.dist-info/RECORD +19 -0
- {codeclone-1.0.0.dist-info → codeclone-1.2.0.dist-info}/WHEEL +1 -1
- codeclone-1.0.0.dist-info/METADATA +0 -211
- codeclone-1.0.0.dist-info/RECORD +0 -17
- {codeclone-1.0.0.dist-info → codeclone-1.2.0.dist-info}/entry_points.txt +0 -0
- {codeclone-1.0.0.dist-info → codeclone-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {codeclone-1.0.0.dist-info → codeclone-1.2.0.dist-info}/top_level.txt +0 -0
codeclone/cli.py
CHANGED
|
@@ -1,105 +1,377 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from __future__ import annotations
|
|
2
10
|
|
|
3
11
|
import argparse
|
|
12
|
+
import sys
|
|
13
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
4
14
|
from pathlib import Path
|
|
5
15
|
|
|
16
|
+
from rich.console import Console
|
|
17
|
+
from rich.panel import Panel
|
|
18
|
+
from rich.progress import (
|
|
19
|
+
Progress,
|
|
20
|
+
SpinnerColumn,
|
|
21
|
+
TextColumn,
|
|
22
|
+
BarColumn,
|
|
23
|
+
TimeElapsedColumn,
|
|
24
|
+
)
|
|
25
|
+
from rich.table import Table
|
|
26
|
+
from rich.theme import Theme
|
|
27
|
+
|
|
6
28
|
from .baseline import Baseline
|
|
7
29
|
from .cache import Cache, file_stat_signature
|
|
8
30
|
from .extractor import extract_units_from_source
|
|
31
|
+
from .html_report import build_html_report
|
|
9
32
|
from .normalize import NormalizationConfig
|
|
10
33
|
from .report import build_groups, build_block_groups, to_json, to_text
|
|
11
34
|
from .scanner import iter_py_files, module_name_from_path
|
|
12
35
|
|
|
36
|
+
# Custom theme for Rich
|
|
37
|
+
custom_theme = Theme(
|
|
38
|
+
{
|
|
39
|
+
"info": "cyan",
|
|
40
|
+
"warning": "yellow",
|
|
41
|
+
"error": "bold red",
|
|
42
|
+
"success": "bold green",
|
|
43
|
+
"dim": "dim",
|
|
44
|
+
}
|
|
45
|
+
)
|
|
46
|
+
console = Console(theme=custom_theme, width=200)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def expand_path(p: str) -> Path:
    """Turn a user-supplied path string into an absolute ``Path``.

    A leading ``~`` is expanded to the user's home directory first; the
    result is then passed through ``Path.resolve()``.
    """
    with_home = Path(p).expanduser()
    return with_home.resolve()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def process_file(
    filepath: str,
    root: str,
    cfg: NormalizationConfig,
    min_loc: int,
    min_stmt: int,
) -> tuple[str, dict, list, list] | None:
    """Read one Python file and extract its function and block units.

    This is the worker that ``main`` submits to a ``ProcessPoolExecutor``,
    one call per file that was not served from the cache.

    Args:
        filepath: Path of the file to analyse.
        root: Project root, used to derive the module name of ``filepath``.
        cfg: Normalization settings forwarded to the extractor.
        min_loc: Minimum lines of code, forwarded to the extractor.
        min_stmt: Minimum statement count, forwarded to the extractor.

    Returns:
        ``(filepath, stat, units, blocks)`` on success, or ``None`` when the
        file cannot be decoded as UTF-8.
    """
    # Take the stat signature BEFORE reading the content.  The caller stores
    # this signature in the cache and reuses the cached units while the
    # signature is unchanged.  If the file is modified while we work on it,
    # a signature taken before the read is merely out of date (the next run
    # sees a mismatch and re-processes), whereas a signature taken after the
    # read would pair the new signature with units from the old content.
    stat = file_stat_signature(filepath)

    try:
        source = Path(filepath).read_text("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: nothing to analyse for this file.
        return None

    module_name = module_name_from_path(root, filepath)

    units, blocks = extract_units_from_source(
        source=source,
        filepath=filepath,
        module_name=module_name,
        cfg=cfg,
        min_loc=min_loc,
        min_stmt=min_stmt,
    )

    return filepath, stat, units, blocks
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def print_banner():
    """Print the boxed CodeClone title banner on the shared console."""
    # Rich markup: product name and version on the first line, tagline below.
    banner_markup = (
        "[bold white]CodeClone[/bold white] [dim]v1.2.0[/dim]\n"
        "[italic]Architectural duplication detector[/italic]"
    )
    banner = Panel.fit(banner_markup, border_style="blue", padding=(0, 2))
    console.print(banner)
|
|
13
89
|
|
|
14
|
-
def main():
|
|
15
|
-
ap = argparse.ArgumentParser("codeclone")
|
|
16
|
-
ap.add_argument("root", help="Project root")
|
|
17
|
-
ap.add_argument("--cache", default="~/.cache/codeclone/")
|
|
18
|
-
ap.add_argument("--min-loc", type=int, default=15)
|
|
19
|
-
ap.add_argument("--min-stmt", type=int, default=6)
|
|
20
|
-
ap.add_argument("--json-out", default="")
|
|
21
|
-
ap.add_argument("--text-out", default="")
|
|
22
|
-
ap.add_argument("--fail-if-groups", type=int, default=-1)
|
|
23
|
-
ap.add_argument("--baseline", default="~/.config/codeclone/baseline.json")
|
|
24
|
-
ap.add_argument("--update-baseline", action="store_true",
|
|
25
|
-
help="Write current clones as baseline")
|
|
26
|
-
ap.add_argument("--fail-on-new", action="store_true",
|
|
27
|
-
help="Fail if new clones appear vs baseline")
|
|
28
|
-
args = ap.parse_args()
|
|
29
90
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
91
|
+
def main() -> None:
|
|
92
|
+
ap = argparse.ArgumentParser(
|
|
93
|
+
prog="codeclone",
|
|
94
|
+
description="AST and CFG-based code clone detector for Python.",
|
|
95
|
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# Core Arguments
|
|
99
|
+
core_group = ap.add_argument_group("Target")
|
|
100
|
+
core_group.add_argument(
|
|
101
|
+
"root",
|
|
102
|
+
nargs="?",
|
|
103
|
+
default=".",
|
|
104
|
+
help="Project root directory to scan.",
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
# Tuning
|
|
108
|
+
tune_group = ap.add_argument_group("Analysis Tuning")
|
|
109
|
+
tune_group.add_argument(
|
|
110
|
+
"--min-loc",
|
|
111
|
+
type=int,
|
|
112
|
+
default=15,
|
|
113
|
+
help="Minimum Lines of Code (LOC) to consider.",
|
|
114
|
+
)
|
|
115
|
+
tune_group.add_argument(
|
|
116
|
+
"--min-stmt",
|
|
117
|
+
type=int,
|
|
118
|
+
default=6,
|
|
119
|
+
help="Minimum AST statements to consider.",
|
|
120
|
+
)
|
|
121
|
+
tune_group.add_argument(
|
|
122
|
+
"--processes",
|
|
123
|
+
type=int,
|
|
124
|
+
default=4,
|
|
125
|
+
help="Number of parallel worker processes.",
|
|
126
|
+
)
|
|
127
|
+
tune_group.add_argument(
|
|
128
|
+
"--cache-dir",
|
|
129
|
+
default="~/.cache/codeclone/cache.json",
|
|
130
|
+
help="Path to the cache file to speed up subsequent runs.",
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# Baseline & CI
|
|
134
|
+
ci_group = ap.add_argument_group("Baseline & CI/CD")
|
|
135
|
+
ci_group.add_argument(
|
|
136
|
+
"--baseline",
|
|
137
|
+
default="codeclone.baseline.json",
|
|
138
|
+
help="Path to the baseline file (stored in repo).",
|
|
139
|
+
)
|
|
140
|
+
ci_group.add_argument(
|
|
141
|
+
"--update-baseline",
|
|
142
|
+
action="store_true",
|
|
143
|
+
help="Overwrite the baseline file with current results.",
|
|
144
|
+
)
|
|
145
|
+
ci_group.add_argument(
|
|
146
|
+
"--fail-on-new",
|
|
147
|
+
action="store_true",
|
|
148
|
+
help="Exit with error if NEW clones (not in baseline) are detected.",
|
|
149
|
+
)
|
|
150
|
+
ci_group.add_argument(
|
|
151
|
+
"--fail-threshold",
|
|
152
|
+
type=int,
|
|
153
|
+
default=-1,
|
|
154
|
+
metavar="MAX_CLONES",
|
|
155
|
+
help="Exit with error if total clone groups exceed this number.",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Output
|
|
159
|
+
out_group = ap.add_argument_group("Reporting")
|
|
160
|
+
out_group.add_argument(
|
|
161
|
+
"--html",
|
|
162
|
+
dest="html_out",
|
|
163
|
+
metavar="FILE",
|
|
164
|
+
help="Generate an HTML report to FILE.",
|
|
165
|
+
)
|
|
166
|
+
out_group.add_argument(
|
|
167
|
+
"--json",
|
|
168
|
+
dest="json_out",
|
|
169
|
+
metavar="FILE",
|
|
170
|
+
help="Generate a JSON report to FILE.",
|
|
171
|
+
)
|
|
172
|
+
out_group.add_argument(
|
|
173
|
+
"--text",
|
|
174
|
+
dest="text_out",
|
|
175
|
+
metavar="FILE",
|
|
176
|
+
help="Generate a text report to FILE.",
|
|
36
177
|
)
|
|
178
|
+
out_group.add_argument(
|
|
179
|
+
"--no-progress",
|
|
180
|
+
action="store_true",
|
|
181
|
+
help="Disable the progress bar (recommended for CI logs).",
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
args = ap.parse_args()
|
|
185
|
+
|
|
186
|
+
print_banner()
|
|
37
187
|
|
|
38
|
-
|
|
188
|
+
root_path = Path(args.root).resolve()
|
|
189
|
+
if not root_path.exists():
|
|
190
|
+
console.print(f"[error]Root path does not exist: {root_path}[/error]")
|
|
191
|
+
sys.exit(1)
|
|
192
|
+
|
|
193
|
+
console.print(f"[info]Scanning root:[/info] {root_path}")
|
|
194
|
+
|
|
195
|
+
# Initialize Cache
|
|
196
|
+
cfg = NormalizationConfig()
|
|
197
|
+
cache_path = Path(args.cache_dir).expanduser()
|
|
198
|
+
cache = Cache(cache_path)
|
|
39
199
|
cache.load()
|
|
40
200
|
|
|
41
201
|
all_units: list[dict] = []
|
|
42
202
|
all_blocks: list[dict] = []
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
except UnicodeDecodeError:
|
|
57
|
-
continue
|
|
58
|
-
|
|
59
|
-
module_name = module_name_from_path(args.root, fp)
|
|
60
|
-
units, blocks = extract_units_from_source(
|
|
61
|
-
source=source,
|
|
62
|
-
filepath=fp,
|
|
63
|
-
module_name=module_name,
|
|
64
|
-
cfg=cfg,
|
|
65
|
-
min_loc=args.min_loc,
|
|
66
|
-
min_stmt=args.min_stmt,
|
|
67
|
-
)
|
|
203
|
+
changed_files_count = 0
|
|
204
|
+
files_to_process: list[str] = []
|
|
205
|
+
|
|
206
|
+
# Discovery phase
|
|
207
|
+
with console.status("[bold green]Discovering Python files...", spinner="dots"):
|
|
208
|
+
for fp in iter_py_files(str(root_path)):
|
|
209
|
+
stat = file_stat_signature(fp)
|
|
210
|
+
cached = cache.get_file_entry(fp)
|
|
211
|
+
if cached and cached.get("stat") == stat:
|
|
212
|
+
all_units.extend(cached.get("units", []))
|
|
213
|
+
all_blocks.extend(cached.get("blocks", []))
|
|
214
|
+
else:
|
|
215
|
+
files_to_process.append(fp)
|
|
68
216
|
|
|
69
|
-
|
|
70
|
-
changed += 1
|
|
217
|
+
total_files = len(files_to_process)
|
|
71
218
|
|
|
72
|
-
|
|
73
|
-
|
|
219
|
+
# Processing phase
|
|
220
|
+
if total_files > 0:
|
|
221
|
+
if args.no_progress:
|
|
222
|
+
console.print(f"[info]Processing {total_files} changed files...[/info]")
|
|
223
|
+
with ProcessPoolExecutor(max_workers=args.processes) as executor:
|
|
224
|
+
futures = [
|
|
225
|
+
executor.submit(
|
|
226
|
+
process_file,
|
|
227
|
+
fp,
|
|
228
|
+
str(root_path),
|
|
229
|
+
cfg,
|
|
230
|
+
args.min_loc,
|
|
231
|
+
args.min_stmt,
|
|
232
|
+
)
|
|
233
|
+
for fp in files_to_process
|
|
234
|
+
]
|
|
235
|
+
for future in as_completed(futures):
|
|
236
|
+
try:
|
|
237
|
+
result = future.result()
|
|
238
|
+
except Exception as e:
|
|
239
|
+
console.print(f"[warning]Failed to process file: {e}[/warning]")
|
|
240
|
+
continue
|
|
74
241
|
|
|
75
|
-
|
|
76
|
-
|
|
242
|
+
if result:
|
|
243
|
+
fp, stat, units, blocks = result
|
|
244
|
+
cache.put_file_entry(fp, stat, units, blocks)
|
|
245
|
+
changed_files_count += 1
|
|
246
|
+
all_units.extend([u.__dict__ for u in units])
|
|
247
|
+
all_blocks.extend([b.__dict__ for b in blocks])
|
|
248
|
+
else:
|
|
249
|
+
with Progress(
|
|
250
|
+
SpinnerColumn(),
|
|
251
|
+
TextColumn("[progress.description]{task.description}"),
|
|
252
|
+
BarColumn(),
|
|
253
|
+
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
|
|
254
|
+
TimeElapsedColumn(),
|
|
255
|
+
console=console,
|
|
256
|
+
) as progress:
|
|
257
|
+
task = progress.add_task(
|
|
258
|
+
f"Analyzing {total_files} files...", total=total_files
|
|
259
|
+
)
|
|
260
|
+
with ProcessPoolExecutor(max_workers=args.processes) as executor:
|
|
261
|
+
futures = [
|
|
262
|
+
executor.submit(
|
|
263
|
+
process_file,
|
|
264
|
+
fp,
|
|
265
|
+
str(root_path),
|
|
266
|
+
cfg,
|
|
267
|
+
args.min_loc,
|
|
268
|
+
args.min_stmt,
|
|
269
|
+
)
|
|
270
|
+
for fp in files_to_process
|
|
271
|
+
]
|
|
272
|
+
for future in as_completed(futures):
|
|
273
|
+
try:
|
|
274
|
+
result = future.result()
|
|
275
|
+
except Exception:
|
|
276
|
+
# Log error but keep progress bar moving?
|
|
277
|
+
# console.print might break progress bar layout, better to rely on rich logging or just skip
|
|
278
|
+
# console.print(f"[warning]Failed to process file: {e}[/warning]")
|
|
279
|
+
continue
|
|
280
|
+
finally:
|
|
281
|
+
progress.advance(task)
|
|
77
282
|
|
|
78
|
-
|
|
79
|
-
|
|
283
|
+
if result:
|
|
284
|
+
fp, stat, units, blocks = result
|
|
285
|
+
cache.put_file_entry(fp, stat, units, blocks)
|
|
286
|
+
changed_files_count += 1
|
|
287
|
+
all_units.extend([u.__dict__ for u in units])
|
|
288
|
+
all_blocks.extend([b.__dict__ for b in blocks])
|
|
289
|
+
|
|
290
|
+
# Analysis phase
|
|
291
|
+
with console.status("[bold green]Grouping clones...", spinner="dots"):
|
|
292
|
+
func_groups = build_groups(all_units)
|
|
293
|
+
block_groups = build_block_groups(all_blocks)
|
|
294
|
+
cache.save()
|
|
295
|
+
|
|
296
|
+
# Reporting
|
|
297
|
+
func_clones_count = len(func_groups)
|
|
298
|
+
block_clones_count = len(block_groups)
|
|
299
|
+
|
|
300
|
+
# Baseline Logic
|
|
301
|
+
baseline_path = Path(args.baseline).expanduser().resolve()
|
|
302
|
+
|
|
303
|
+
# With the default --baseline value, baseline_path resolves to ./codeclone.baseline.json in the current working directory.
|
|
304
|
+
|
|
305
|
+
baseline = Baseline(baseline_path)
|
|
306
|
+
baseline_exists = baseline_path.exists()
|
|
307
|
+
|
|
308
|
+
if baseline_exists:
|
|
309
|
+
baseline.load()
|
|
310
|
+
else:
|
|
311
|
+
if not args.update_baseline:
|
|
312
|
+
console.print(
|
|
313
|
+
f"[warning]Baseline file not found at: [bold]{baseline_path}[/bold][/warning]\n"
|
|
314
|
+
"[dim]Comparing against an empty baseline. "
|
|
315
|
+
"Use --update-baseline to create it.[/dim]"
|
|
316
|
+
)
|
|
80
317
|
|
|
81
318
|
if args.update_baseline:
|
|
82
|
-
new_baseline = Baseline.from_groups(
|
|
83
|
-
|
|
319
|
+
new_baseline = Baseline.from_groups(
|
|
320
|
+
func_groups, block_groups, path=baseline_path
|
|
321
|
+
)
|
|
84
322
|
new_baseline.save()
|
|
85
|
-
print(f"Baseline updated: {
|
|
86
|
-
|
|
323
|
+
console.print(f"[success]✔ Baseline updated:[/success] {baseline_path}")
|
|
324
|
+
# When updating, we don't fail on new, we just saved the new state.
|
|
325
|
+
# But we might still want to print the summary.
|
|
87
326
|
|
|
327
|
+
# Diff
|
|
88
328
|
new_func, new_block = baseline.diff(func_groups, block_groups)
|
|
329
|
+
new_clones_count = len(new_func) + len(new_block)
|
|
330
|
+
|
|
331
|
+
# Summary Table
|
|
332
|
+
table = Table(title="Analysis Summary", border_style="blue")
|
|
333
|
+
table.add_column("Metric", style="cyan")
|
|
334
|
+
table.add_column("Value", style="bold white")
|
|
335
|
+
|
|
336
|
+
table.add_row("Files Processed", str(changed_files_count))
|
|
337
|
+
table.add_row("Total Function Clones", str(func_clones_count))
|
|
338
|
+
table.add_row("Total Block Clones", str(block_clones_count))
|
|
339
|
+
|
|
340
|
+
if baseline_exists:
|
|
341
|
+
style = "error" if new_clones_count > 0 else "success"
|
|
342
|
+
table.add_row(
|
|
343
|
+
"New Clones (vs Baseline)", f"[{style}]{new_clones_count}[/{style}]"
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
console.print(table)
|
|
347
|
+
|
|
348
|
+
# Outputs
|
|
349
|
+
if args.html_out:
|
|
350
|
+
out = Path(args.html_out).expanduser().resolve()
|
|
351
|
+
out.parent.mkdir(parents=True, exist_ok=True)
|
|
352
|
+
out.write_text(
|
|
353
|
+
build_html_report(
|
|
354
|
+
func_groups=func_groups,
|
|
355
|
+
block_groups=block_groups,
|
|
356
|
+
title="CodeClone Report",
|
|
357
|
+
context_lines=3,
|
|
358
|
+
max_snippet_lines=220,
|
|
359
|
+
),
|
|
360
|
+
"utf-8",
|
|
361
|
+
)
|
|
362
|
+
console.print(f"[info]HTML report saved:[/info] {out}")
|
|
89
363
|
|
|
90
364
|
if args.json_out:
|
|
91
|
-
out = Path(args.json_out)
|
|
365
|
+
out = Path(args.json_out).expanduser().resolve()
|
|
92
366
|
out.parent.mkdir(parents=True, exist_ok=True)
|
|
93
367
|
out.write_text(
|
|
94
|
-
to_json({
|
|
95
|
-
"functions": func_groups,
|
|
96
|
-
"blocks": block_groups,
|
|
97
|
-
}),
|
|
368
|
+
to_json({"functions": func_groups, "blocks": block_groups}),
|
|
98
369
|
"utf-8",
|
|
99
370
|
)
|
|
371
|
+
console.print(f"[info]JSON report saved:[/info] {out}")
|
|
100
372
|
|
|
101
373
|
if args.text_out:
|
|
102
|
-
out = Path(args.text_out)
|
|
374
|
+
out = Path(args.text_out).expanduser().resolve()
|
|
103
375
|
out.parent.mkdir(parents=True, exist_ok=True)
|
|
104
376
|
out.write_text(
|
|
105
377
|
"FUNCTION CLONES\n"
|
|
@@ -108,37 +380,29 @@ def main():
|
|
|
108
380
|
+ to_text(block_groups),
|
|
109
381
|
"utf-8",
|
|
110
382
|
)
|
|
383
|
+
console.print(f"[info]Text report saved:[/info] {out}")
|
|
111
384
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if new_func:
|
|
122
|
-
print(f"New FUNCTION clone groups: {len(new_func)}")
|
|
123
|
-
for k in sorted(new_func):
|
|
124
|
-
print(f" - {k}")
|
|
125
|
-
|
|
126
|
-
if new_block:
|
|
127
|
-
print(f"New BLOCK clone groups: {len(new_block)}")
|
|
128
|
-
for k in sorted(new_block):
|
|
129
|
-
print(f" - {k}")
|
|
385
|
+
# Exit Codes
|
|
386
|
+
if args.fail_on_new and (new_func or new_block):
|
|
387
|
+
console.print("\n[error]❌ FAILED: New code clones detected![/error]")
|
|
388
|
+
if new_func:
|
|
389
|
+
console.print(f" New Functions: {', '.join(sorted(new_func))}")
|
|
390
|
+
if new_block:
|
|
391
|
+
console.print(f" New Blocks: {', '.join(sorted(new_block))}")
|
|
392
|
+
sys.exit(3)
|
|
130
393
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
cache.save()
|
|
394
|
+
if 0 <= args.fail_threshold < (func_clones_count + block_clones_count):
|
|
395
|
+
console.print(
|
|
396
|
+
f"\n[error]❌ FAILED: Total clones ({func_clones_count + block_clones_count}) "
|
|
397
|
+
f"exceed threshold ({args.fail_threshold})![/error]"
|
|
398
|
+
)
|
|
399
|
+
sys.exit(2)
|
|
139
400
|
|
|
140
|
-
if
|
|
141
|
-
|
|
401
|
+
if not args.update_baseline and not args.fail_on_new and new_clones_count > 0:
|
|
402
|
+
console.print(
|
|
403
|
+
"\n[warning]New clones detected but --fail-on-new not set.[/warning]\n"
|
|
404
|
+
"Run with --update-baseline to accept them as technical debt."
|
|
405
|
+
)
|
|
142
406
|
|
|
143
407
|
|
|
144
408
|
if __name__ == "__main__":
|
codeclone/extractor.py
CHANGED
|
@@ -1,11 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from __future__ import annotations
|
|
2
10
|
|
|
3
11
|
import ast
|
|
4
12
|
from dataclasses import dataclass
|
|
13
|
+
from typing import Sequence
|
|
5
14
|
|
|
6
15
|
from .blocks import extract_blocks, BlockUnit
|
|
16
|
+
from .cfg import CFGBuilder
|
|
7
17
|
from .fingerprint import sha1, bucket_loc
|
|
8
|
-
from .normalize import NormalizationConfig,
|
|
18
|
+
from .normalize import NormalizationConfig, normalized_ast_dump_from_list
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# =========================
|
|
22
|
+
# Data structures
|
|
23
|
+
# =========================
|
|
9
24
|
|
|
10
25
|
|
|
11
26
|
@dataclass(frozen=True)
|
|
@@ -20,37 +35,83 @@ class Unit:
|
|
|
20
35
|
loc_bucket: str
|
|
21
36
|
|
|
22
37
|
|
|
38
|
+
# =========================
|
|
39
|
+
# Helpers
|
|
40
|
+
# =========================
|
|
41
|
+
|
|
42
|
+
|
|
23
43
|
def _stmt_count(node: ast.AST) -> int:
|
|
24
44
|
body = getattr(node, "body", None)
|
|
25
45
|
return len(body) if isinstance(body, list) else 0
|
|
26
46
|
|
|
27
47
|
|
|
28
48
|
class _QualnameBuilder(ast.NodeVisitor):
|
|
29
|
-
def __init__(self):
|
|
49
|
+
def __init__(self) -> None:
|
|
30
50
|
self.stack: list[str] = []
|
|
31
|
-
self.units: list[tuple[str, ast.
|
|
51
|
+
self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
|
|
32
52
|
|
|
33
|
-
def visit_ClassDef(self, node: ast.ClassDef):
|
|
53
|
+
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
|
34
54
|
self.stack.append(node.name)
|
|
35
55
|
self.generic_visit(node)
|
|
36
56
|
self.stack.pop()
|
|
37
57
|
|
|
38
|
-
def visit_FunctionDef(self, node: ast.FunctionDef):
|
|
58
|
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
|
39
59
|
name = ".".join(self.stack + [node.name]) if self.stack else node.name
|
|
40
60
|
self.units.append((name, node))
|
|
41
61
|
|
|
42
|
-
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
|
|
62
|
+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
|
43
63
|
name = ".".join(self.stack + [node.name]) if self.stack else node.name
|
|
44
64
|
self.units.append((name, node))
|
|
45
65
|
|
|
46
66
|
|
|
67
|
+
# =========================
|
|
68
|
+
# CFG fingerprinting
|
|
69
|
+
# =========================
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_cfg_fingerprint(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
    cfg: NormalizationConfig,
    qualname: str,
) -> str:
    """
    Hash a canonical text rendering of the function's control-flow graph.

    A CFG is built for ``node``; each block is rendered as its id, the
    normalized dump of its statements, and the sorted ids of its successors.
    The renderings, ordered by block id, are joined with ``|`` and hashed.
    """
    graph = CFGBuilder().build(qualname, node)

    def _render(block) -> str:
        # Typed as Sequence[ast.AST] so that a list of ast.stmt is accepted
        # by the type checker (list is invariant, Sequence is covariant).
        statements: Sequence[ast.AST] = block.statements
        dumped = normalized_ast_dump_from_list(statements, cfg)
        successors = ",".join(
            str(succ_id) for succ_id in sorted(s.id for s in block.successors)
        )
        return f"BLOCK[{block.id}]:{dumped}|SUCCESSORS:{successors}"

    # Sorting by id keeps the hash independent of graph.blocks ordering.
    ordered_blocks = sorted(graph.blocks, key=lambda b: b.id)
    return sha1("|".join(_render(block) for block in ordered_blocks))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# =========================
|
|
104
|
+
# Public API
|
|
105
|
+
# =========================
|
|
106
|
+
|
|
107
|
+
|
|
47
108
|
def extract_units_from_source(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
109
|
+
source: str,
|
|
110
|
+
filepath: str,
|
|
111
|
+
module_name: str,
|
|
112
|
+
cfg: NormalizationConfig,
|
|
113
|
+
min_loc: int,
|
|
114
|
+
min_stmt: int,
|
|
54
115
|
) -> tuple[list[Unit], list[BlockUnit]]:
|
|
55
116
|
try:
|
|
56
117
|
tree = ast.parse(source)
|
|
@@ -66,6 +127,7 @@ def extract_units_from_source(
|
|
|
66
127
|
for local_name, node in qb.units:
|
|
67
128
|
start = getattr(node, "lineno", None)
|
|
68
129
|
end = getattr(node, "end_lineno", None)
|
|
130
|
+
|
|
69
131
|
if not start or not end or end < start:
|
|
70
132
|
continue
|
|
71
133
|
|
|
@@ -76,26 +138,24 @@ def extract_units_from_source(
|
|
|
76
138
|
continue
|
|
77
139
|
|
|
78
140
|
qualname = f"{module_name}:{local_name}"
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
and stmt_count >= 10
|
|
98
|
-
):
|
|
141
|
+
fingerprint = get_cfg_fingerprint(node, cfg, qualname)
|
|
142
|
+
|
|
143
|
+
# Function-level unit (including __init__)
|
|
144
|
+
units.append(
|
|
145
|
+
Unit(
|
|
146
|
+
qualname=qualname,
|
|
147
|
+
filepath=filepath,
|
|
148
|
+
start_line=start,
|
|
149
|
+
end_line=end,
|
|
150
|
+
loc=loc,
|
|
151
|
+
stmt_count=stmt_count,
|
|
152
|
+
fingerprint=fingerprint,
|
|
153
|
+
loc_bucket=bucket_loc(loc),
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Block-level units (exclude __init__)
|
|
158
|
+
if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
|
|
99
159
|
blocks = extract_blocks(
|
|
100
160
|
node,
|
|
101
161
|
filepath=filepath,
|