codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeclone/__init__.py +1 -1
- codeclone/baseline.py +44 -14
- codeclone/blockhash.py +1 -1
- codeclone/blocks.py +4 -3
- codeclone/cache.py +154 -17
- codeclone/cfg.py +128 -38
- codeclone/cfg_model.py +47 -0
- codeclone/cli.py +524 -100
- codeclone/errors.py +27 -0
- codeclone/extractor.py +101 -24
- codeclone/html_report.py +230 -691
- codeclone/normalize.py +43 -13
- codeclone/py.typed +0 -0
- codeclone/report.py +23 -12
- codeclone/scanner.py +66 -3
- codeclone/templates.py +1262 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/METADATA +62 -34
- codeclone-1.2.1.dist-info/RECORD +23 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/WHEEL +1 -1
- codeclone-1.1.0.dist-info/RECORD +0 -19
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/entry_points.txt +0 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/top_level.txt +0 -0
codeclone/normalize.py
CHANGED
|
@@ -9,11 +9,13 @@ Licensed under the MIT License.
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import ast
|
|
12
|
+
import copy
|
|
13
|
+
from ast import AST
|
|
12
14
|
from collections.abc import Sequence
|
|
13
15
|
from dataclasses import dataclass
|
|
14
16
|
|
|
15
17
|
|
|
16
|
-
@dataclass(frozen=True)
|
|
18
|
+
@dataclass(frozen=True, slots=True)
|
|
17
19
|
class NormalizationConfig:
|
|
18
20
|
ignore_docstrings: bool = True
|
|
19
21
|
ignore_type_annotations: bool = True
|
|
@@ -23,17 +25,19 @@ class NormalizationConfig:
|
|
|
23
25
|
|
|
24
26
|
|
|
25
27
|
class AstNormalizer(ast.NodeTransformer):
|
|
28
|
+
__slots__ = ("cfg",)
|
|
29
|
+
|
|
26
30
|
def __init__(self, cfg: NormalizationConfig):
|
|
27
31
|
super().__init__()
|
|
28
32
|
self.cfg = cfg
|
|
29
33
|
|
|
30
|
-
def visit_FunctionDef(self, node: ast.FunctionDef):
|
|
34
|
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
|
|
31
35
|
return self._visit_func(node)
|
|
32
36
|
|
|
33
|
-
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
|
|
37
|
+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
|
|
34
38
|
return self._visit_func(node)
|
|
35
39
|
|
|
36
|
-
def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef):
|
|
40
|
+
def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> ast.AST:
|
|
37
41
|
# Drop docstring
|
|
38
42
|
if self.cfg.ignore_docstrings and node.body:
|
|
39
43
|
first = node.body[0]
|
|
@@ -61,12 +65,12 @@ class AstNormalizer(ast.NodeTransformer):
|
|
|
61
65
|
|
|
62
66
|
return self.generic_visit(node)
|
|
63
67
|
|
|
64
|
-
def visit_arg(self, node: ast.arg):
|
|
68
|
+
def visit_arg(self, node: ast.arg) -> ast.arg:
|
|
65
69
|
if self.cfg.ignore_type_annotations:
|
|
66
70
|
node.annotation = None
|
|
67
71
|
return node
|
|
68
72
|
|
|
69
|
-
def visit_Name(self, node: ast.Name):
|
|
73
|
+
def visit_Name(self, node: ast.Name) -> ast.Name:
|
|
70
74
|
if self.cfg.normalize_names:
|
|
71
75
|
node.id = "_VAR_"
|
|
72
76
|
return node
|
|
@@ -78,30 +82,56 @@ class AstNormalizer(ast.NodeTransformer):
|
|
|
78
82
|
new_node.attr = "_ATTR_"
|
|
79
83
|
return new_node
|
|
80
84
|
|
|
81
|
-
def visit_Constant(self, node: ast.Constant):
|
|
85
|
+
def visit_Constant(self, node: ast.Constant) -> ast.Constant:
|
|
82
86
|
if self.cfg.normalize_constants:
|
|
83
87
|
node.value = "_CONST_"
|
|
84
88
|
return node
|
|
85
89
|
|
|
90
|
+
def visit_AugAssign(self, node: ast.AugAssign) -> AST:
|
|
91
|
+
# Normalize x += 1 to x = x + 1
|
|
92
|
+
# This allows detecting clones where one uses += and another uses = +
|
|
93
|
+
# We transform AugAssign(target, op, value) to Assign([target],
|
|
94
|
+
# BinOp(target, op, value))
|
|
95
|
+
|
|
96
|
+
# Deepcopy target to avoid reuse issues in the AST
|
|
97
|
+
target_load = copy.deepcopy(node.target)
|
|
98
|
+
# Ensure context is Load() for the right-hand side usage
|
|
99
|
+
if hasattr(target_load, "ctx"):
|
|
100
|
+
target_load.ctx = ast.Load()
|
|
101
|
+
|
|
102
|
+
new_node = ast.Assign(
|
|
103
|
+
targets=[node.target],
|
|
104
|
+
value=ast.BinOp(left=target_load, op=node.op, right=node.value),
|
|
105
|
+
lineno=node.lineno,
|
|
106
|
+
col_offset=node.col_offset,
|
|
107
|
+
end_lineno=getattr(node, "end_lineno", None),
|
|
108
|
+
end_col_offset=getattr(node, "end_col_offset", None),
|
|
109
|
+
)
|
|
110
|
+
return self.generic_visit(new_node)
|
|
111
|
+
|
|
86
112
|
|
|
87
113
|
def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
|
|
114
|
+
"""
|
|
115
|
+
Dump the normalized AST.
|
|
116
|
+
WARNING: This modifies the AST in-place for performance.
|
|
117
|
+
"""
|
|
88
118
|
normalizer = AstNormalizer(cfg)
|
|
89
|
-
new_node = ast.fix_missing_locations(
|
|
90
|
-
normalizer.visit(ast.copy_location(func_node, func_node))
|
|
91
|
-
)
|
|
119
|
+
new_node = ast.fix_missing_locations(normalizer.visit(func_node))
|
|
92
120
|
return ast.dump(new_node, annotate_fields=True, include_attributes=False)
|
|
93
121
|
|
|
94
122
|
|
|
95
123
|
def normalized_ast_dump_from_list(
|
|
96
124
|
nodes: Sequence[ast.AST], cfg: NormalizationConfig
|
|
97
125
|
) -> str:
|
|
126
|
+
"""
|
|
127
|
+
Dump a list of AST nodes after normalization.
|
|
128
|
+
WARNING: This modifies the AST nodes in-place for performance.
|
|
129
|
+
"""
|
|
98
130
|
normalizer = AstNormalizer(cfg)
|
|
99
131
|
dumps: list[str] = []
|
|
100
132
|
|
|
101
133
|
for node in nodes:
|
|
102
|
-
new_node = ast.fix_missing_locations(
|
|
103
|
-
normalizer.visit(ast.copy_location(node, node))
|
|
104
|
-
)
|
|
134
|
+
new_node = ast.fix_missing_locations(normalizer.visit(node))
|
|
105
135
|
dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False))
|
|
106
136
|
|
|
107
137
|
return ";".join(dumps)
|
codeclone/py.typed
ADDED
|
File without changes
|
codeclone/report.py
CHANGED
|
@@ -11,23 +11,24 @@ from __future__ import annotations
|
|
|
11
11
|
import json
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
GroupItem = dict[str, Any]
|
|
15
|
+
GroupMap = dict[str, list[GroupItem]]
|
|
14
16
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
+
|
|
18
|
+
def build_groups(units: list[GroupItem]) -> GroupMap:
|
|
19
|
+
groups: GroupMap = {}
|
|
17
20
|
for u in units:
|
|
18
21
|
key = f"{u['fingerprint']}|{u['loc_bucket']}"
|
|
19
22
|
groups.setdefault(key, []).append(u)
|
|
20
23
|
return {k: v for k, v in groups.items() if len(v) > 1}
|
|
21
24
|
|
|
22
25
|
|
|
23
|
-
def build_block_groups(
|
|
24
|
-
|
|
25
|
-
) -> dict[str, list[dict]]:
|
|
26
|
-
groups: dict[str, list[dict]] = {}
|
|
26
|
+
def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> GroupMap:
|
|
27
|
+
groups: GroupMap = {}
|
|
27
28
|
for b in blocks:
|
|
28
29
|
groups.setdefault(b["block_hash"], []).append(b)
|
|
29
30
|
|
|
30
|
-
filtered: dict[str, list[dict]] = {}
|
|
31
|
+
filtered: GroupMap = {}
|
|
31
32
|
for h, items in groups.items():
|
|
32
33
|
functions = {i["qualname"] for i in items}
|
|
33
34
|
if len(functions) >= min_functions:
|
|
@@ -36,7 +37,7 @@ def build_block_groups(
|
|
|
36
37
|
return filtered
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
def to_json(groups: dict) -> str:
|
|
40
|
+
def to_json(groups: GroupMap) -> str:
|
|
40
41
|
return json.dumps(
|
|
41
42
|
{
|
|
42
43
|
"group_count": len(groups),
|
|
@@ -52,16 +53,26 @@ def to_json(groups: dict) -> str:
|
|
|
52
53
|
)
|
|
53
54
|
|
|
54
55
|
|
|
55
|
-
def
|
|
56
|
+
def to_json_report(func_groups: GroupMap, block_groups: GroupMap) -> str:
|
|
57
|
+
return json.dumps(
|
|
58
|
+
{"functions": func_groups, "blocks": block_groups},
|
|
59
|
+
ensure_ascii=False,
|
|
60
|
+
indent=2,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def to_text(groups: GroupMap) -> str:
|
|
56
65
|
lines: list[str] = []
|
|
57
66
|
for i, (_, v) in enumerate(
|
|
58
67
|
sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
|
|
59
68
|
):
|
|
60
69
|
lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===")
|
|
61
|
-
|
|
62
|
-
|
|
70
|
+
lines.extend(
|
|
71
|
+
[
|
|
63
72
|
f"- {item['qualname']} "
|
|
64
73
|
f"{item['filepath']}:{item['start_line']}-{item['end_line']} "
|
|
65
74
|
f"loc={item.get('loc', item.get('size'))}"
|
|
66
|
-
|
|
75
|
+
for item in v
|
|
76
|
+
]
|
|
77
|
+
)
|
|
67
78
|
return "\n".join(lines).strip() + "\n"
|
codeclone/scanner.py
CHANGED
|
@@ -8,8 +8,11 @@ Licensed under the MIT License.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import tempfile
|
|
12
|
+
from collections.abc import Iterable
|
|
11
13
|
from pathlib import Path
|
|
12
|
-
|
|
14
|
+
|
|
15
|
+
from .errors import ValidationError
|
|
13
16
|
|
|
14
17
|
DEFAULT_EXCLUDES = (
|
|
15
18
|
".git",
|
|
@@ -24,15 +27,75 @@ DEFAULT_EXCLUDES = (
|
|
|
24
27
|
".tox",
|
|
25
28
|
)
|
|
26
29
|
|
|
30
|
+
SENSITIVE_DIRS = {
|
|
31
|
+
"/etc",
|
|
32
|
+
"/sys",
|
|
33
|
+
"/proc",
|
|
34
|
+
"/dev",
|
|
35
|
+
"/root",
|
|
36
|
+
"/boot",
|
|
37
|
+
"/var",
|
|
38
|
+
"/private/var",
|
|
39
|
+
"/usr/bin",
|
|
40
|
+
"/usr/sbin",
|
|
41
|
+
"/private/etc",
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_tempdir() -> Path:
|
|
46
|
+
return Path(tempfile.gettempdir()).resolve()
|
|
47
|
+
|
|
27
48
|
|
|
28
49
|
def iter_py_files(
|
|
29
|
-
root: str,
|
|
50
|
+
root: str,
|
|
51
|
+
excludes: tuple[str, ...] = DEFAULT_EXCLUDES,
|
|
52
|
+
*,
|
|
53
|
+
max_files: int = 100_000,
|
|
30
54
|
) -> Iterable[str]:
|
|
31
|
-
|
|
55
|
+
try:
|
|
56
|
+
rootp = Path(root).resolve(strict=True)
|
|
57
|
+
except (OSError, RuntimeError) as e:
|
|
58
|
+
raise ValidationError(f"Invalid root path '{root}': {e}") from e
|
|
59
|
+
|
|
60
|
+
if not rootp.is_dir():
|
|
61
|
+
raise ValidationError(f"Root must be a directory: {root}")
|
|
62
|
+
|
|
63
|
+
root_str = str(rootp)
|
|
64
|
+
temp_root = _get_tempdir()
|
|
65
|
+
in_temp = False
|
|
66
|
+
try:
|
|
67
|
+
rootp.relative_to(temp_root)
|
|
68
|
+
in_temp = True
|
|
69
|
+
except ValueError:
|
|
70
|
+
in_temp = False
|
|
71
|
+
|
|
72
|
+
if not in_temp:
|
|
73
|
+
if root_str in SENSITIVE_DIRS:
|
|
74
|
+
raise ValidationError(f"Cannot scan sensitive directory: {root}")
|
|
75
|
+
|
|
76
|
+
for sensitive in SENSITIVE_DIRS:
|
|
77
|
+
if root_str.startswith(sensitive + "/"):
|
|
78
|
+
raise ValidationError(f"Cannot scan under sensitive directory: {root}")
|
|
79
|
+
|
|
80
|
+
file_count = 0
|
|
32
81
|
for p in rootp.rglob("*.py"):
|
|
82
|
+
# Verify path is actually under root (prevent symlink attacks)
|
|
83
|
+
try:
|
|
84
|
+
p.resolve().relative_to(rootp)
|
|
85
|
+
except ValueError:
|
|
86
|
+
# Skipping file outside root (possible symlink traversal)
|
|
87
|
+
continue
|
|
88
|
+
|
|
33
89
|
parts = set(p.parts)
|
|
34
90
|
if any(ex in parts for ex in excludes):
|
|
35
91
|
continue
|
|
92
|
+
|
|
93
|
+
file_count += 1
|
|
94
|
+
if file_count > max_files:
|
|
95
|
+
raise ValidationError(
|
|
96
|
+
f"File count exceeds limit of {max_files}. "
|
|
97
|
+
"Use more specific root or increase limit."
|
|
98
|
+
)
|
|
36
99
|
yield str(p)
|
|
37
100
|
|
|
38
101
|
|