codeclone 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeclone/__init__.py +1 -1
- codeclone/baseline.py +33 -7
- codeclone/blockhash.py +1 -1
- codeclone/blocks.py +4 -3
- codeclone/cache.py +151 -20
- codeclone/cfg.py +53 -128
- codeclone/cfg_model.py +47 -0
- codeclone/cli.py +308 -114
- codeclone/errors.py +27 -0
- codeclone/extractor.py +101 -24
- codeclone/html_report.py +196 -640
- codeclone/normalize.py +21 -14
- codeclone/py.typed +0 -0
- codeclone/report.py +23 -12
- codeclone/scanner.py +66 -3
- codeclone/templates.py +1262 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/METADATA +53 -35
- codeclone-1.2.1.dist-info/RECORD +23 -0
- codeclone-1.2.0.dist-info/RECORD +0 -19
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/WHEEL +0 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/entry_points.txt +0 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {codeclone-1.2.0.dist-info → codeclone-1.2.1.dist-info}/top_level.txt +0 -0
codeclone/normalize.py
CHANGED
|
@@ -15,7 +15,7 @@ from collections.abc import Sequence
|
|
|
15
15
|
from dataclasses import dataclass
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
@dataclass(frozen=True)
|
|
18
|
+
@dataclass(frozen=True, slots=True)
|
|
19
19
|
class NormalizationConfig:
|
|
20
20
|
ignore_docstrings: bool = True
|
|
21
21
|
ignore_type_annotations: bool = True
|
|
@@ -25,17 +25,19 @@ class NormalizationConfig:
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class AstNormalizer(ast.NodeTransformer):
|
|
28
|
+
__slots__ = ("cfg",)
|
|
29
|
+
|
|
28
30
|
def __init__(self, cfg: NormalizationConfig):
|
|
29
31
|
super().__init__()
|
|
30
32
|
self.cfg = cfg
|
|
31
33
|
|
|
32
|
-
def visit_FunctionDef(self, node: ast.FunctionDef):
|
|
34
|
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
|
|
33
35
|
return self._visit_func(node)
|
|
34
36
|
|
|
35
|
-
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
|
|
37
|
+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
|
|
36
38
|
return self._visit_func(node)
|
|
37
39
|
|
|
38
|
-
def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef):
|
|
40
|
+
def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> ast.AST:
|
|
39
41
|
# Drop docstring
|
|
40
42
|
if self.cfg.ignore_docstrings and node.body:
|
|
41
43
|
first = node.body[0]
|
|
@@ -63,12 +65,12 @@ class AstNormalizer(ast.NodeTransformer):
|
|
|
63
65
|
|
|
64
66
|
return self.generic_visit(node)
|
|
65
67
|
|
|
66
|
-
def visit_arg(self, node: ast.arg):
|
|
68
|
+
def visit_arg(self, node: ast.arg) -> ast.arg:
|
|
67
69
|
if self.cfg.ignore_type_annotations:
|
|
68
70
|
node.annotation = None
|
|
69
71
|
return node
|
|
70
72
|
|
|
71
|
-
def visit_Name(self, node: ast.Name):
|
|
73
|
+
def visit_Name(self, node: ast.Name) -> ast.Name:
|
|
72
74
|
if self.cfg.normalize_names:
|
|
73
75
|
node.id = "_VAR_"
|
|
74
76
|
return node
|
|
@@ -80,7 +82,7 @@ class AstNormalizer(ast.NodeTransformer):
|
|
|
80
82
|
new_node.attr = "_ATTR_"
|
|
81
83
|
return new_node
|
|
82
84
|
|
|
83
|
-
def visit_Constant(self, node: ast.Constant):
|
|
85
|
+
def visit_Constant(self, node: ast.Constant) -> ast.Constant:
|
|
84
86
|
if self.cfg.normalize_constants:
|
|
85
87
|
node.value = "_CONST_"
|
|
86
88
|
return node
|
|
@@ -88,7 +90,8 @@ class AstNormalizer(ast.NodeTransformer):
|
|
|
88
90
|
def visit_AugAssign(self, node: ast.AugAssign) -> AST:
|
|
89
91
|
# Normalize x += 1 to x = x + 1
|
|
90
92
|
# This allows detecting clones where one uses += and another uses = +
|
|
91
|
-
# We transform AugAssign(target, op, value) to Assign([target], BinOp(target, op, value))
|
|
93
|
+
# We transform AugAssign(target, op, value) to Assign([target],
|
|
94
|
+
# BinOp(target, op, value))
|
|
92
95
|
|
|
93
96
|
# Deepcopy target to avoid reuse issues in the AST
|
|
94
97
|
target_load = copy.deepcopy(node.target)
|
|
@@ -108,23 +111,27 @@ class AstNormalizer(ast.NodeTransformer):
|
|
|
108
111
|
|
|
109
112
|
|
|
110
113
|
def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
|
|
114
|
+
"""
|
|
115
|
+
Dump the normalized AST.
|
|
116
|
+
WARNING: This modifies the AST in-place for performance.
|
|
117
|
+
"""
|
|
111
118
|
normalizer = AstNormalizer(cfg)
|
|
112
|
-
|
|
113
|
-
node_copy = copy.deepcopy(func_node)
|
|
114
|
-
new_node = ast.fix_missing_locations(normalizer.visit(node_copy))
|
|
119
|
+
new_node = ast.fix_missing_locations(normalizer.visit(func_node))
|
|
115
120
|
return ast.dump(new_node, annotate_fields=True, include_attributes=False)
|
|
116
121
|
|
|
117
122
|
|
|
118
123
|
def normalized_ast_dump_from_list(
|
|
119
124
|
nodes: Sequence[ast.AST], cfg: NormalizationConfig
|
|
120
125
|
) -> str:
|
|
126
|
+
"""
|
|
127
|
+
Dump a list of AST nodes after normalization.
|
|
128
|
+
WARNING: This modifies the AST nodes in-place for performance.
|
|
129
|
+
"""
|
|
121
130
|
normalizer = AstNormalizer(cfg)
|
|
122
131
|
dumps: list[str] = []
|
|
123
132
|
|
|
124
133
|
for node in nodes:
|
|
125
|
-
|
|
126
|
-
node_copy = copy.deepcopy(node)
|
|
127
|
-
new_node = ast.fix_missing_locations(normalizer.visit(node_copy))
|
|
134
|
+
new_node = ast.fix_missing_locations(normalizer.visit(node))
|
|
128
135
|
dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False))
|
|
129
136
|
|
|
130
137
|
return ";".join(dumps)
|
codeclone/py.typed
ADDED
|
File without changes
|
codeclone/report.py
CHANGED
|
@@ -11,23 +11,24 @@ from __future__ import annotations
|
|
|
11
11
|
import json
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
GroupItem = dict[str, Any]
|
|
15
|
+
GroupMap = dict[str, list[GroupItem]]
|
|
14
16
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
+
|
|
18
|
+
def build_groups(units: list[GroupItem]) -> GroupMap:
|
|
19
|
+
groups: GroupMap = {}
|
|
17
20
|
for u in units:
|
|
18
21
|
key = f"{u['fingerprint']}|{u['loc_bucket']}"
|
|
19
22
|
groups.setdefault(key, []).append(u)
|
|
20
23
|
return {k: v for k, v in groups.items() if len(v) > 1}
|
|
21
24
|
|
|
22
25
|
|
|
23
|
-
def build_block_groups(
|
|
24
|
-
|
|
25
|
-
) -> dict[str, list[dict[str, Any]]]:
|
|
26
|
-
groups: dict[str, list[dict[str, Any]]] = {}
|
|
26
|
+
def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> GroupMap:
|
|
27
|
+
groups: GroupMap = {}
|
|
27
28
|
for b in blocks:
|
|
28
29
|
groups.setdefault(b["block_hash"], []).append(b)
|
|
29
30
|
|
|
30
|
-
filtered: dict[str, list[dict[str, Any]]] = {}
|
|
31
|
+
filtered: GroupMap = {}
|
|
31
32
|
for h, items in groups.items():
|
|
32
33
|
functions = {i["qualname"] for i in items}
|
|
33
34
|
if len(functions) >= min_functions:
|
|
@@ -36,7 +37,7 @@ def build_block_groups(
|
|
|
36
37
|
return filtered
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
def to_json(groups: dict) -> str:
|
|
40
|
+
def to_json(groups: GroupMap) -> str:
|
|
40
41
|
return json.dumps(
|
|
41
42
|
{
|
|
42
43
|
"group_count": len(groups),
|
|
@@ -52,16 +53,26 @@ def to_json(groups: dict) -> str:
|
|
|
52
53
|
)
|
|
53
54
|
|
|
54
55
|
|
|
55
|
-
def to_text(groups: dict) -> str:
|
|
56
|
+
def to_json_report(func_groups: GroupMap, block_groups: GroupMap) -> str:
|
|
57
|
+
return json.dumps(
|
|
58
|
+
{"functions": func_groups, "blocks": block_groups},
|
|
59
|
+
ensure_ascii=False,
|
|
60
|
+
indent=2,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def to_text(groups: GroupMap) -> str:
|
|
56
65
|
lines: list[str] = []
|
|
57
66
|
for i, (_, v) in enumerate(
|
|
58
67
|
sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
|
|
59
68
|
):
|
|
60
69
|
lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===")
|
|
61
|
-
|
|
62
|
-
|
|
70
|
+
lines.extend(
|
|
71
|
+
[
|
|
63
72
|
f"- {item['qualname']} "
|
|
64
73
|
f"{item['filepath']}:{item['start_line']}-{item['end_line']} "
|
|
65
74
|
f"loc={item.get('loc', item.get('size'))}"
|
|
66
|
-
|
|
75
|
+
for item in v
|
|
76
|
+
]
|
|
77
|
+
)
|
|
67
78
|
return "\n".join(lines).strip() + "\n"
|
codeclone/scanner.py
CHANGED
|
@@ -8,8 +8,11 @@ Licensed under the MIT License.
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
+
import tempfile
|
|
12
|
+
from collections.abc import Iterable
|
|
11
13
|
from pathlib import Path
|
|
12
|
-
|
|
14
|
+
|
|
15
|
+
from .errors import ValidationError
|
|
13
16
|
|
|
14
17
|
DEFAULT_EXCLUDES = (
|
|
15
18
|
".git",
|
|
@@ -24,15 +27,75 @@ DEFAULT_EXCLUDES = (
|
|
|
24
27
|
".tox",
|
|
25
28
|
)
|
|
26
29
|
|
|
30
|
+
SENSITIVE_DIRS = {
|
|
31
|
+
"/etc",
|
|
32
|
+
"/sys",
|
|
33
|
+
"/proc",
|
|
34
|
+
"/dev",
|
|
35
|
+
"/root",
|
|
36
|
+
"/boot",
|
|
37
|
+
"/var",
|
|
38
|
+
"/private/var",
|
|
39
|
+
"/usr/bin",
|
|
40
|
+
"/usr/sbin",
|
|
41
|
+
"/private/etc",
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_tempdir() -> Path:
|
|
46
|
+
return Path(tempfile.gettempdir()).resolve()
|
|
47
|
+
|
|
27
48
|
|
|
28
49
|
def iter_py_files(
|
|
29
|
-
root: str,
|
|
50
|
+
root: str,
|
|
51
|
+
excludes: tuple[str, ...] = DEFAULT_EXCLUDES,
|
|
52
|
+
*,
|
|
53
|
+
max_files: int = 100_000,
|
|
30
54
|
) -> Iterable[str]:
|
|
31
|
-
|
|
55
|
+
try:
|
|
56
|
+
rootp = Path(root).resolve(strict=True)
|
|
57
|
+
except (OSError, RuntimeError) as e:
|
|
58
|
+
raise ValidationError(f"Invalid root path '{root}': {e}") from e
|
|
59
|
+
|
|
60
|
+
if not rootp.is_dir():
|
|
61
|
+
raise ValidationError(f"Root must be a directory: {root}")
|
|
62
|
+
|
|
63
|
+
root_str = str(rootp)
|
|
64
|
+
temp_root = _get_tempdir()
|
|
65
|
+
in_temp = False
|
|
66
|
+
try:
|
|
67
|
+
rootp.relative_to(temp_root)
|
|
68
|
+
in_temp = True
|
|
69
|
+
except ValueError:
|
|
70
|
+
in_temp = False
|
|
71
|
+
|
|
72
|
+
if not in_temp:
|
|
73
|
+
if root_str in SENSITIVE_DIRS:
|
|
74
|
+
raise ValidationError(f"Cannot scan sensitive directory: {root}")
|
|
75
|
+
|
|
76
|
+
for sensitive in SENSITIVE_DIRS:
|
|
77
|
+
if root_str.startswith(sensitive + "/"):
|
|
78
|
+
raise ValidationError(f"Cannot scan under sensitive directory: {root}")
|
|
79
|
+
|
|
80
|
+
file_count = 0
|
|
32
81
|
for p in rootp.rglob("*.py"):
|
|
82
|
+
# Verify path is actually under root (prevent symlink attacks)
|
|
83
|
+
try:
|
|
84
|
+
p.resolve().relative_to(rootp)
|
|
85
|
+
except ValueError:
|
|
86
|
+
# Skipping file outside root (possible symlink traversal)
|
|
87
|
+
continue
|
|
88
|
+
|
|
33
89
|
parts = set(p.parts)
|
|
34
90
|
if any(ex in parts for ex in excludes):
|
|
35
91
|
continue
|
|
92
|
+
|
|
93
|
+
file_count += 1
|
|
94
|
+
if file_count > max_files:
|
|
95
|
+
raise ValidationError(
|
|
96
|
+
f"File count exceeds limit of {max_files}. "
|
|
97
|
+
"Use more specific root or increase limit."
|
|
98
|
+
)
|
|
36
99
|
yield str(p)
|
|
37
100
|
|
|
38
101
|
|