codeclone-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeclone/__init__.py +0 -0
- codeclone/baseline.py +46 -0
- codeclone/blockhash.py +12 -0
- codeclone/blocks.py +63 -0
- codeclone/cache.py +42 -0
- codeclone/cli.py +145 -0
- codeclone/extractor.py +109 -0
- codeclone/fingerprint.py +16 -0
- codeclone/normalize.py +83 -0
- codeclone/report.py +51 -0
- codeclone/scanner.py +28 -0
- codeclone-1.0.0.dist-info/METADATA +211 -0
- codeclone-1.0.0.dist-info/RECORD +17 -0
- codeclone-1.0.0.dist-info/WHEEL +5 -0
- codeclone-1.0.0.dist-info/entry_points.txt +2 -0
- codeclone-1.0.0.dist-info/licenses/LICENSE +21 -0
- codeclone-1.0.0.dist-info/top_level.txt +1 -0
codeclone/__init__.py
ADDED
File without changes
codeclone/baseline.py
ADDED
@@ -0,0 +1,46 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Set


class Baseline:
    def __init__(self, path: str):
        self.path = Path(path)
        self.functions: Set[str] = set()
        self.blocks: Set[str] = set()

    def load(self) -> None:
        if not self.path.exists():
            return

        data = json.loads(self.path.read_text("utf-8"))
        self.functions = set(data.get("functions", []))
        self.blocks = set(data.get("blocks", []))

    def save(self) -> None:
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.path.write_text(
            json.dumps(
                {
                    "functions": sorted(self.functions),
                    "blocks": sorted(self.blocks),
                },
                indent=2,
                ensure_ascii=False,
            ),
            "utf-8",
        )

    @staticmethod
    def from_groups(func_groups: dict, block_groups: dict) -> "Baseline":
        bl = Baseline("")
        bl.functions = set(func_groups.keys())
        bl.blocks = set(block_groups.keys())
        return bl

    def diff(self, func_groups: dict, block_groups: dict) -> tuple[set, set]:
        new_funcs = set(func_groups.keys()) - self.functions
        new_blocks = set(block_groups.keys()) - self.blocks
        return new_funcs, new_blocks
codeclone/blockhash.py
ADDED
@@ -0,0 +1,12 @@
from __future__ import annotations

import ast
import hashlib

from .normalize import NormalizationConfig, AstNormalizer

def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
    normalizer = AstNormalizer(cfg)
    stmt = ast.fix_missing_locations(normalizer.visit(stmt))
    dump = ast.dump(stmt, annotate_fields=True, include_attributes=False)
    return hashlib.sha1(dump.encode("utf-8")).hexdigest()
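`stmt_hash` normalizes a single statement and hashes its AST dump, so statements that differ only in identifiers or literals should collapse to the same digest. A quick illustrative check:

```python
import ast

from codeclone.blockhash import stmt_hash
from codeclone.normalize import NormalizationConfig

cfg = NormalizationConfig()

a = ast.parse("total = price * 2").body[0]
b = ast.parse("result = count * 10").body[0]

# Names are rewritten to _VAR_ and constants to _CONST_, so both statements
# dump to the same normalized string and should yield identical SHA-1 digests.
print(stmt_hash(a, cfg) == stmt_hash(b, cfg))  # expected: True
```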
codeclone/blocks.py
ADDED
@@ -0,0 +1,63 @@
from __future__ import annotations

import ast
from dataclasses import dataclass

from .blockhash import stmt_hash
from .normalize import NormalizationConfig


@dataclass(frozen=True)
class BlockUnit:
    block_hash: str
    filepath: str
    qualname: str
    start_line: int
    end_line: int
    size: int


def extract_blocks(
    func_node: ast.AST,
    *,
    filepath: str,
    qualname: str,
    cfg: NormalizationConfig,
    block_size: int,
    max_blocks: int,
) -> list[BlockUnit]:
    body = getattr(func_node, "body", None)
    if not isinstance(body, list) or len(body) < block_size:
        return []

    stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]

    blocks: list[BlockUnit] = []
    last_start: int | None = None
    MIN_LINE_DISTANCE = 5  # suppress overlapping windows

    for i in range(len(stmt_hashes) - block_size + 1):
        start = getattr(body[i], "lineno", None)
        end = getattr(body[i + block_size - 1], "end_lineno", None)
        if not start or not end:
            continue

        if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
            continue

        bh = "|".join(stmt_hashes[i:i + block_size])

        blocks.append(BlockUnit(
            block_hash=bh,
            filepath=filepath,
            qualname=qualname,
            start_line=start,
            end_line=end,
            size=block_size,
        ))

        last_start = start
        if len(blocks) >= max_blocks:
            break

    return blocks
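`extract_blocks` slides a window of `block_size` consecutive statements over a function body, hashes each window, and drops windows that start within `MIN_LINE_DISTANCE` lines of the previously accepted one. A small sketch of calling it directly (the path, qualname, and function source are made up):

```python
import ast

from codeclone.blocks import extract_blocks
from codeclone.normalize import NormalizationConfig

source = """
def handle(payload):
    user = payload.get("user")
    if user is None:
        raise ValueError("missing user")
    items = payload.get("items", [])
    total = sum(item["price"] for item in items)
    audit = {"user": user, "total": total}
    return audit
"""

func_node = ast.parse(source).body[0]
blocks = extract_blocks(
    func_node,
    filepath="example/api.py",      # hypothetical path
    qualname="example.api:handle",  # hypothetical qualified name
    cfg=NormalizationConfig(),
    block_size=4,
    max_blocks=15,
)
for b in blocks:
    print(b.qualname, b.start_line, b.end_line, b.block_hash[:32])
```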
codeclone/cache.py
ADDED
@@ -0,0 +1,42 @@
from __future__ import annotations

import json
import os
from dataclasses import asdict
from pathlib import Path
from typing import Optional


class Cache:
    def __init__(self, path: str):
        self.path = Path(path)
        self.data: dict = {"files": {}}

    def load(self) -> None:
        if self.path.exists():
            self.data = json.loads(self.path.read_text("utf-8"))

    def save(self) -> None:
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.path.write_text(
            json.dumps(self.data, ensure_ascii=False, indent=2),
            "utf-8",
        )

    def get_file_entry(self, filepath: str) -> Optional[dict]:
        return self.data.get("files", {}).get(filepath)

    def put_file_entry(self, filepath: str, stat_sig: dict, units, blocks) -> None:
        self.data.setdefault("files", {})[filepath] = {
            "stat": stat_sig,
            "units": [asdict(u) for u in units],
            "blocks": [asdict(b) for b in blocks],
        }


def file_stat_signature(path: str) -> dict:
    st = os.stat(path)
    return {
        "mtime_ns": st.st_mtime_ns,
        "size": st.st_size,
    }
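Cache entries are keyed by file path plus an mtime/size signature, so a file whose signature matches the stored one can be skipped on the next run. A rough sketch of the read-or-refresh flow (the cache file name is hypothetical, and the empty lists stand in for real `Unit`/`BlockUnit` results):

```python
from codeclone.cache import Cache, file_stat_signature

cache = Cache(".codeclone-cache.json")  # hypothetical cache file
cache.load()

path = "codeclone/cache.py"  # any existing file under the scanned root
sig = file_stat_signature(path)

entry = cache.get_file_entry(path)
if entry and entry.get("stat") == sig:
    print("unchanged -> reuse cached units/blocks")
else:
    # In the real CLI these come from extract_units_from_source().
    cache.put_file_entry(path, sig, units=[], blocks=[])

cache.save()
```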
codeclone/cli.py
ADDED
@@ -0,0 +1,145 @@
from __future__ import annotations

import argparse
from pathlib import Path

from .baseline import Baseline
from .cache import Cache, file_stat_signature
from .extractor import extract_units_from_source
from .normalize import NormalizationConfig
from .report import build_groups, build_block_groups, to_json, to_text
from .scanner import iter_py_files, module_name_from_path


def main():
    ap = argparse.ArgumentParser("codeclone")
    ap.add_argument("root", help="Project root")
    ap.add_argument("--cache", default="~/.cache/codeclone/")
    ap.add_argument("--min-loc", type=int, default=15)
    ap.add_argument("--min-stmt", type=int, default=6)
    ap.add_argument("--json-out", default="")
    ap.add_argument("--text-out", default="")
    ap.add_argument("--fail-if-groups", type=int, default=-1)
    ap.add_argument("--baseline", default="~/.config/codeclone/baseline.json")
    ap.add_argument("--update-baseline", action="store_true",
                    help="Write current clones as baseline")
    ap.add_argument("--fail-on-new", action="store_true",
                    help="Fail if new clones appear vs baseline")
    args = ap.parse_args()

    cfg = NormalizationConfig(
        ignore_docstrings=True,
        ignore_type_annotations=True,
        normalize_attributes=True,
        normalize_constants=True,
        normalize_names=True,
    )

    cache = Cache(args.cache)
    cache.load()

    all_units: list[dict] = []
    all_blocks: list[dict] = []
    changed = 0

    for fp in iter_py_files(args.root):
        stat = file_stat_signature(fp)
        cached = cache.get_file_entry(fp)

        if cached and cached.get("stat") == stat:
            all_units.extend(cached.get("units", []))
            all_blocks.extend(cached.get("blocks", []))
            continue

        try:
            source = Path(fp).read_text("utf-8")
        except UnicodeDecodeError:
            continue

        module_name = module_name_from_path(args.root, fp)
        units, blocks = extract_units_from_source(
            source=source,
            filepath=fp,
            module_name=module_name,
            cfg=cfg,
            min_loc=args.min_loc,
            min_stmt=args.min_stmt,
        )

        cache.put_file_entry(fp, stat, units, blocks)
        changed += 1

        all_units.extend([u.__dict__ for u in units])
        all_blocks.extend([b.__dict__ for b in blocks])

    func_groups = build_groups(all_units)
    block_groups = build_block_groups(all_blocks)

    baseline = Baseline(args.baseline)
    baseline.load()

    if args.update_baseline:
        new_baseline = Baseline.from_groups(func_groups, block_groups)
        new_baseline.path = Path(args.baseline)
        new_baseline.save()
        print(f"Baseline updated: {args.baseline}")
        return

    new_func, new_block = baseline.diff(func_groups, block_groups)

    if args.json_out:
        out = Path(args.json_out)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(
            to_json({
                "functions": func_groups,
                "blocks": block_groups,
            }),
            "utf-8",
        )

    if args.text_out:
        out = Path(args.text_out)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(
            "FUNCTION CLONES\n"
            + to_text(func_groups)
            + "\nBLOCK CLONES\n"
            + to_text(block_groups),
            "utf-8",
        )

    print(f"Scanned root: {args.root}")
    print(f"Changed files parsed: {changed}")
    print(f"Function clone groups: {len(func_groups)}")
    print(f"Block clone groups: {len(block_groups)}")

    if args.fail_on_new:
        if new_func or new_block:
            print("\n❌ New code clones detected\n")

            if new_func:
                print(f"New FUNCTION clone groups: {len(new_func)}")
                for k in sorted(new_func):
                    print(f" - {k}")

            if new_block:
                print(f"New BLOCK clone groups: {len(new_block)}")
                for k in sorted(new_block):
                    print(f" - {k}")

            raise SystemExit(3)

    print(f"Baseline function clones: {len(baseline.functions)}")
    print(f"Baseline block clones: {len(baseline.blocks)}")
    print(f"New function clones: {len(new_func)}")
    print(f"New block clones: {len(new_block)}")

    cache.save()

    if 0 <= args.fail_if_groups < len(func_groups):
        raise SystemExit(2)


if __name__ == "__main__":
    main()
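The `codeclone` console script wraps this `main()`, and the exit codes are the CI contract: `3` when `--fail-on-new` finds groups missing from the baseline, `2` when `--fail-if-groups` is exceeded. A hedged sketch of checking those codes from a wrapper script (assumes the package is installed so the script is on `PATH`; the baseline path is illustrative):

```python
import subprocess
import sys

result = subprocess.run(
    ["codeclone", ".", "--fail-on-new", "--baseline", ".codeclone-baseline.json"],
    capture_output=True,
    text=True,
)
print(result.stdout)

if result.returncode == 3:
    sys.exit("new code clones introduced: refactor them or rerun with --update-baseline")
result.check_returncode()  # re-raise for any other non-zero exit
```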
codeclone/extractor.py
ADDED
@@ -0,0 +1,109 @@
from __future__ import annotations

import ast
from dataclasses import dataclass

from .blocks import extract_blocks, BlockUnit
from .fingerprint import sha1, bucket_loc
from .normalize import NormalizationConfig, normalized_ast_dump


@dataclass(frozen=True)
class Unit:
    qualname: str
    filepath: str
    start_line: int
    end_line: int
    loc: int
    stmt_count: int
    fingerprint: str
    loc_bucket: str


def _stmt_count(node: ast.AST) -> int:
    body = getattr(node, "body", None)
    return len(body) if isinstance(body, list) else 0


class _QualnameBuilder(ast.NodeVisitor):
    def __init__(self):
        self.stack: list[str] = []
        self.units: list[tuple[str, ast.AST]] = []

    def visit_ClassDef(self, node: ast.ClassDef):
        self.stack.append(node.name)
        self.generic_visit(node)
        self.stack.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef):
        name = ".".join(self.stack + [node.name]) if self.stack else node.name
        self.units.append((name, node))

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        name = ".".join(self.stack + [node.name]) if self.stack else node.name
        self.units.append((name, node))


def extract_units_from_source(
    source: str,
    filepath: str,
    module_name: str,
    cfg: NormalizationConfig,
    min_loc: int,
    min_stmt: int,
) -> tuple[list[Unit], list[BlockUnit]]:
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return [], []

    qb = _QualnameBuilder()
    qb.visit(tree)

    units: list[Unit] = []
    block_units: list[BlockUnit] = []

    for local_name, node in qb.units:
        start = getattr(node, "lineno", None)
        end = getattr(node, "end_lineno", None)
        if not start or not end or end < start:
            continue

        loc = end - start + 1
        stmt_count = _stmt_count(node)

        if loc < min_loc or stmt_count < min_stmt:
            continue

        qualname = f"{module_name}:{local_name}"
        dump = normalized_ast_dump(node, cfg)
        fp = sha1(dump)

        # ✅ __init__ INCLUDED as function-level unit
        units.append(Unit(
            qualname=qualname,
            filepath=filepath,
            start_line=start,
            end_line=end,
            loc=loc,
            stmt_count=stmt_count,
            fingerprint=fp,
            loc_bucket=bucket_loc(loc),
        ))

        if (
            not local_name.endswith("__init__")
            and loc >= 40
            and stmt_count >= 10
        ):
            blocks = extract_blocks(
                node,
                filepath=filepath,
                qualname=qualname,
                cfg=cfg,
                block_size=4,
                max_blocks=15,
            )
            block_units.extend(blocks)

    return units, block_units
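`extract_units_from_source` is the per-file pipeline: parse, build qualified names, filter by size, and fingerprint the normalized AST. A toy sketch with the thresholds lowered so a short function qualifies (module name and path are made up; the CLI defaults are `min_loc=15`, `min_stmt=6`):

```python
from codeclone.extractor import extract_units_from_source
from codeclone.normalize import NormalizationConfig

source = '''
def fetch_user(session, user_id):
    user = session.get(user_id)
    if user is None:
        raise LookupError(user_id)
    profile = session.get_profile(user_id)
    return {"user": user, "profile": profile}
'''

units, blocks = extract_units_from_source(
    source=source,
    filepath="app/users.py",   # hypothetical
    module_name="app.users",   # hypothetical
    cfg=NormalizationConfig(),
    min_loc=3,   # lowered for this toy example
    min_stmt=2,  # lowered for this toy example
)
for u in units:
    print(u.qualname, u.loc, u.loc_bucket, u.fingerprint[:12])
```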
codeclone/fingerprint.py
ADDED
@@ -0,0 +1,16 @@
from __future__ import annotations

import hashlib

def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def bucket_loc(loc: int) -> str:
    # Helps avoid grouping wildly different sizes if desired
    if loc < 20:
        return "0-19"
    if loc < 50:
        return "20-49"
    if loc < 100:
        return "50-99"
    return "100+"
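`bucket_loc` coarsens a function's length into a range label; `report.build_groups` appends it to the fingerprint key, so structurally identical functions of wildly different size do not land in the same group. For example:

```python
from codeclone.fingerprint import bucket_loc, sha1

print(bucket_loc(18))   # 0-19
print(bucket_loc(35))   # 20-49
print(bucket_loc(120))  # 100+
print(sha1("example")[:12])  # first 12 hex chars of the SHA-1 digest
```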
codeclone/normalize.py
ADDED
@@ -0,0 +1,83 @@
from __future__ import annotations

import ast
from dataclasses import dataclass


@dataclass(frozen=True)
class NormalizationConfig:
    ignore_docstrings: bool = True
    ignore_type_annotations: bool = True
    normalize_attributes: bool = True  # obj.foo -> obj._ATTR_
    normalize_constants: bool = True  # 123/"x"/None -> _CONST_
    normalize_names: bool = True  # x,y,z -> _VAR_


class AstNormalizer(ast.NodeTransformer):
    def __init__(self, cfg: NormalizationConfig):
        self.cfg = cfg
        super().__init__()

    def visit_FunctionDef(self, node: ast.FunctionDef):
        return self._visit_func(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        return self._visit_func(node)

    def _visit_func(self, node):
        # Drop docstring (first Expr(Constant(str)))
        if self.cfg.ignore_docstrings and node.body:
            first = node.body[0]
            if isinstance(first, ast.Expr) and isinstance(getattr(first, "value", None), ast.Constant):
                if isinstance(first.value.value, str):
                    node.body = node.body[1:]

        if self.cfg.ignore_type_annotations:
            # Remove annotations in args + returns
            if hasattr(node, "returns"):
                node.returns = None
            args = node.args
            for a in getattr(args, "posonlyargs", []):
                a.annotation = None
            for a in getattr(args, "args", []):
                a.annotation = None
            for a in getattr(args, "kwonlyargs", []):
                a.annotation = None
            if getattr(args, "vararg", None):
                args.vararg.annotation = None
            if getattr(args, "kwarg", None):
                args.kwarg.annotation = None

        return self.generic_visit(node)

    def visit_arg(self, node: ast.arg):
        if self.cfg.ignore_type_annotations:
            node.annotation = None
        return node

    def visit_Name(self, node: ast.Name):
        if self.cfg.normalize_names:
            node.id = "_VAR_"
        return node

    def visit_Attribute(self, node: ast.Attribute):
        node = self.generic_visit(node)
        if self.cfg.normalize_attributes:
            node.attr = "_ATTR_"
        return node

    def visit_Constant(self, node: ast.Constant):
        if self.cfg.normalize_constants:
            # Preserve booleans? up to you; default: normalize everything
            node.value = "_CONST_"
        return node


def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
    """
    Returns stable string representation of normalized AST.
    """
    normalizer = AstNormalizer(cfg)
    new_node = ast.fix_missing_locations(normalizer.visit(ast.copy_location(func_node, func_node)))
    # include_attributes=False => more stable; annotate_fields=True => default
    return ast.dump(new_node, annotate_fields=True, include_attributes=False)
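The normalizer is what makes detection Type-2: local names, attributes, constants, docstrings, and annotations are all rewritten before hashing, so two functions that differ only in those details produce the same dump. A quick illustrative check (note that the function and parameter names themselves are part of the dump, so they are kept identical here):

```python
import ast

from codeclone.normalize import NormalizationConfig, normalized_ast_dump

a = ast.parse('''
def create_user(payload):
    """API layer."""
    data = validate(payload, schema="user")
    record = db.insert("users", data)
    return build_response(record, status=201)
''').body[0]

b = ast.parse('''
def create_user(payload):
    obj = validate(payload, schema="account")
    row = database.insert("accounts", obj)
    return build_response(row, status=200)
''').body[0]

cfg = NormalizationConfig()
# Docstring dropped, names -> _VAR_, attributes -> _ATTR_, constants -> _CONST_,
# so both dumps should be identical.
print(normalized_ast_dump(a, cfg) == normalized_ast_dump(b, cfg))  # expected: True
```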
codeclone/report.py
ADDED
@@ -0,0 +1,51 @@
from __future__ import annotations

import json
from typing import Any


def build_groups(units: list[dict[str, Any]]) -> dict[str, list[dict]]:
    groups: dict[str, list[dict]] = {}
    for u in units:
        key = f"{u['fingerprint']}|{u['loc_bucket']}"
        groups.setdefault(key, []).append(u)
    return {k: v for k, v in groups.items() if len(v) > 1}


def build_block_groups(blocks: list[dict], min_functions: int = 2) -> dict[str, list[dict]]:
    groups: dict[str, list[dict]] = {}
    for b in blocks:
        groups.setdefault(b["block_hash"], []).append(b)

    filtered: dict[str, list[dict]] = {}
    for h, items in groups.items():
        functions = {i["qualname"] for i in items}
        if len(functions) >= min_functions:
            filtered[h] = items

    return filtered


def to_json(groups: dict) -> str:
    return json.dumps({
        "group_count": len(groups),
        "groups": [
            {"key": k, "count": len(v), "items": v}
            for k, v in sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
        ],
    }, ensure_ascii=False, indent=2)


def to_text(groups: dict) -> str:
    lines: list[str] = []
    for i, (_, v) in enumerate(
        sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
    ):
        lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===")
        for item in v:
            lines.append(
                f"- {item['qualname']} "
                f"{item['filepath']}:{item['start_line']}-{item['end_line']} "
                f"loc={item.get('loc', item.get('size'))}"
            )
    return "\n".join(lines).strip() + "\n"
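`build_groups` keys function units by `fingerprint|loc_bucket` and keeps only keys with more than one member; `build_block_groups` additionally requires a block hash to span at least two distinct functions. A tiny sketch with hand-made unit dicts (the fingerprints and paths are fake):

```python
from codeclone.report import build_groups, to_text

units = [
    {"qualname": "api.users:create_user", "filepath": "api/users.py",
     "start_line": 10, "end_line": 41, "loc": 32, "stmt_count": 12,
     "fingerprint": "deadbeef", "loc_bucket": "20-49"},
    {"qualname": "services.accounts:create_account", "filepath": "services/accounts.py",
     "start_line": 5, "end_line": 36, "loc": 32, "stmt_count": 12,
     "fingerprint": "deadbeef", "loc_bucket": "20-49"},
    {"qualname": "api.health:ping", "filepath": "api/health.py",
     "start_line": 1, "end_line": 21, "loc": 21, "stmt_count": 6,
     "fingerprint": "cafebabe", "loc_bucket": "20-49"},
]

groups = build_groups(units)  # only the shared 'deadbeef|20-49' key survives
print(to_text(groups))
```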
codeclone/scanner.py
ADDED
@@ -0,0 +1,28 @@
from __future__ import annotations

from pathlib import Path
from typing import Iterable

DEFAULT_EXCLUDES = (
    ".git", ".venv", "venv", "__pycache__", "site-packages",
    "migrations", "alembic", "dist", "build", ".tox",
)

def iter_py_files(root: str, excludes: tuple[str, ...] = DEFAULT_EXCLUDES) -> Iterable[str]:
    rootp = Path(root)
    for p in rootp.rglob("*.py"):
        parts = set(p.parts)
        if any(ex in parts for ex in excludes):
            continue
        yield str(p)

def module_name_from_path(root: str, filepath: str) -> str:
    rootp = Path(root).resolve()
    fp = Path(filepath).resolve()
    rel = fp.relative_to(rootp)
    # strip ".py"
    stem = rel.with_suffix("")
    # __init__.py -> package name
    if stem.name == "__init__":
        stem = stem.parent
    return ".".join(stem.parts)
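`iter_py_files` walks the root while skipping virtualenvs, caches, migrations, and build output; `module_name_from_path` turns a path under the root into a dotted module name, collapsing `__init__.py` to its package. For example (paths are hypothetical and need not exist):

```python
from codeclone.scanner import module_name_from_path

root = "/repo"
print(module_name_from_path(root, "/repo/app/services/billing.py"))  # app.services.billing
print(module_name_from_path(root, "/repo/app/__init__.py"))          # app
```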
codeclone-1.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,211 @@
Metadata-Version: 2.4
Name: codeclone
Version: 1.0.0
Summary: AST-based code clone detector for Python focused on architectural duplication
Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
License: MIT
Project-URL: Homepage, https://github.com/orenlab/codeclone
Project-URL: Repository, https://github.com/orenlab/codeclone
Project-URL: Issues, https://github.com/orenlab/codeclone/issues
Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
Keywords: python,ast,code-clone,duplication,static-analysis,ci,architecture
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Quality Assurance
Classifier: Topic :: Software Development :: Code Generators
Classifier: Topic :: Software Development :: Testing
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Operating System :: OS Independent
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: pytest>=9.0.0; extra == "dev"
Requires-Dist: build>=1.2.0; extra == "dev"
Requires-Dist: twine>=5.0.0; extra == "dev"
Dynamic: license-file

# CodeClone

**CodeClone** is an AST-based code clone detector for Python, focused on **architectural duplication**, not simple copy-paste.

It is designed to help teams:

- discover structural and logical code duplication,
- understand architectural hotspots,
- and prevent *new* duplication from entering the codebase via CI.

Unlike token- or text-based tools, CodeClone works on **normalized Python AST**, which makes it robust against renaming, formatting, and minor refactoring.

---

## Why CodeClone?

Most existing tools detect *textual* duplication.
CodeClone detects **structural and block-level duplication** that usually indicates missing abstractions or architectural drift.

Typical use cases:

- duplicated service logic across layers (API ↔ application),
- repeated validation or guard blocks,
- copy-pasted request/handler flows,
- duplicated orchestration logic in routers, handlers, or services.

---

## Features

### Function-level clone detection (Type-2)

- Detects functions and methods with identical structure.
- Robust to:
  - variable renaming,
  - constant changes,
  - formatting differences.
- Ideal for spotting architectural duplication between layers.

### Block-level clone detection (Type-3-lite)

- Detects repeated **statement blocks** inside larger functions.
- Targets:
  - validation blocks,
  - guard clauses,
  - repeated orchestration logic.
- Carefully filtered to avoid noise:
  - no overlapping windows,
  - no clones inside the same function,
  - no `__init__` noise.

### Low-noise by design

- AST normalization instead of token matching.
- Size and statement-count thresholds.
- Conservative defaults tuned for real-world Python projects.

### CI-friendly baseline mode

- Establish a baseline of existing clones.
- Fail CI **only when new clones are introduced**.
- Safe for legacy codebases.

---

## Installation

```bash
pip install codeclone
```

Python 3.10+ is required.

---

## Quick Start

Run on a project:

```bash
codeclone .
```

This will:

* scan Python files,
* detect function-level and block-level clones,
* print a summary to stdout.

Generate reports:

```bash
codeclone . \
  --json-out .cache/codeclone/report.json \
  --text-out .cache/codeclone/report.txt
```

---

## Baseline Workflow (Recommended)

### 1. Create a baseline

Run once on your current codebase:

```bash
codeclone . --update-baseline
```

This creates a file:

```bash
.codeclone-baseline.json
```

Commit this file to the repository.

### 2. Use in CI

In CI, run:

```bash
codeclone . --fail-on-new
```

Behavior:

* ✅ existing clones are allowed,
* ❌ build fails if new function or block clones appear,
* ✅ refactoring that removes duplication is always allowed.

This enables gradual improvement without breaking existing development flow.

---

## What CodeClone Is (and Is Not)

CodeClone **is**:

* an architectural analysis tool,
* a duplication radar,
* a CI guard against copy-paste.

CodeClone **is not**:

* a linter,
* a formatter,
* a replacement for SonarQube or static analyzers,
* a semantic equivalence prover.

It intentionally focuses on high-signal duplication.

---

## How It Works (High Level)

* Parses Python source into AST.
* Normalizes:
  - variable names,
  - constants,
  - attributes,
  - docstrings and annotations.
* Computes stable structural fingerprints.
* Detects:
  - identical function structures,
  - repeated statement blocks across functions.
* Applies filters to suppress noise.
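
A rough sketch of that pipeline using the package's own modules (illustrative only; the real CLI additionally caches per-file results, runs block-level detection, and applies the baseline):

```python
from pathlib import Path

from codeclone.extractor import extract_units_from_source
from codeclone.normalize import NormalizationConfig
from codeclone.report import build_groups, to_text
from codeclone.scanner import iter_py_files, module_name_from_path

root = "."
cfg = NormalizationConfig()
all_units = []

for fp in iter_py_files(root):
    source = Path(fp).read_text("utf-8")
    units, _blocks = extract_units_from_source(
        source=source,
        filepath=fp,
        module_name=module_name_from_path(root, fp),
        cfg=cfg,
        min_loc=15,  # CLI defaults
        min_stmt=6,
    )
    all_units.extend(u.__dict__ for u in units)

print(to_text(build_groups(all_units)))
```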

---

## License

MIT License
codeclone-1.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,17 @@
codeclone/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
codeclone/baseline.py,sha256=1848Ugh4l3czOUIgXN68oPx_tu2nzbnnZQUixD1OXXA,1367
codeclone/blockhash.py,sha256=QewX6jSCc7Q2tDsBXicFGzaMzKPb2S6unMEZEvwuwDs,414
codeclone/blocks.py,sha256=6nXELQsH2OKl7ScNyLQiR7rMY2jfnsnGTt5_yXbwh3Y,1533
codeclone/cache.py,sha256=kiqfj5V3evW3hyhKMVqW7EFUiN9AO4mntFPUzfXAjsA,1156
codeclone/cli.py,sha256=uggGIVDw2QLQKKh5BsZYb2XGpe0ysEsQKx-7JDcepXA,4526
codeclone/extractor.py,sha256=ubMfYfM87F1apEmzBnnv9W4daY7Gv2nQHthiNoeTTno,2884
codeclone/fingerprint.py,sha256=pSucv648MGe6LwNczxJBbQjnAOcpHgkXSokaHwGr5zw,364
codeclone/normalize.py,sha256=hG__ZqJCtUMVIv7c_a9PHfzSVGDbrAIOH5JYXnLfuOk,2930
codeclone/report.py,sha256=Ptgne99-nsyvAGyJL3SsPNH9fQLorV2mVf--a0KXfxE,1639
codeclone/scanner.py,sha256=_xomEXvx1mLhMVRiMXW-gkBUV_9Z3GixFV5nK0Pqeq4,831
codeclone-1.0.0.dist-info/licenses/LICENSE,sha256=ndXAbunvN-jCQjgCaoBOF5AN4FcRlAa0R7hK1lWDuBU,1073
codeclone-1.0.0.dist-info/METADATA,sha256=F9-0TneuHJuI2USqeIgW0Ayrue4KVTL-5Um69vaN3-I,4993
codeclone-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
codeclone-1.0.0.dist-info/entry_points.txt,sha256=_MI9DVTLOmv3OlxpyogdOmMAduiLVIdHeOlZ_KBsrIY,49
codeclone-1.0.0.dist-info/top_level.txt,sha256=4tQa_d-4Zle-qV9KmNDkWq0WHYgZsW9vdaeF30rNntg,10
codeclone-1.0.0.dist-info/RECORD,,
codeclone-1.0.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Denis Rozhnovskiy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
codeclone-1.0.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
codeclone