codeclone 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeclone/normalize.py CHANGED
@@ -15,7 +15,7 @@ from collections.abc import Sequence
15
15
  from dataclasses import dataclass
16
16
 
17
17
 
18
- @dataclass(frozen=True)
18
+ @dataclass(frozen=True, slots=True)
19
19
  class NormalizationConfig:
20
20
  ignore_docstrings: bool = True
21
21
  ignore_type_annotations: bool = True
@@ -25,17 +25,19 @@ class NormalizationConfig:
25
25
 
26
26
 
27
27
  class AstNormalizer(ast.NodeTransformer):
28
+ __slots__ = ("cfg",)
29
+
28
30
  def __init__(self, cfg: NormalizationConfig):
29
31
  super().__init__()
30
32
  self.cfg = cfg
31
33
 
32
- def visit_FunctionDef(self, node: ast.FunctionDef):
34
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
33
35
  return self._visit_func(node)
34
36
 
35
- def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
37
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
36
38
  return self._visit_func(node)
37
39
 
38
- def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef):
40
+ def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> ast.AST:
39
41
  # Drop docstring
40
42
  if self.cfg.ignore_docstrings and node.body:
41
43
  first = node.body[0]
@@ -63,12 +65,12 @@ class AstNormalizer(ast.NodeTransformer):
63
65
 
64
66
  return self.generic_visit(node)
65
67
 
66
- def visit_arg(self, node: ast.arg):
68
+ def visit_arg(self, node: ast.arg) -> ast.arg:
67
69
  if self.cfg.ignore_type_annotations:
68
70
  node.annotation = None
69
71
  return node
70
72
 
71
- def visit_Name(self, node: ast.Name):
73
+ def visit_Name(self, node: ast.Name) -> ast.Name:
72
74
  if self.cfg.normalize_names:
73
75
  node.id = "_VAR_"
74
76
  return node
@@ -80,7 +82,7 @@ class AstNormalizer(ast.NodeTransformer):
80
82
  new_node.attr = "_ATTR_"
81
83
  return new_node
82
84
 
83
- def visit_Constant(self, node: ast.Constant):
85
+ def visit_Constant(self, node: ast.Constant) -> ast.Constant:
84
86
  if self.cfg.normalize_constants:
85
87
  node.value = "_CONST_"
86
88
  return node
@@ -88,7 +90,8 @@ class AstNormalizer(ast.NodeTransformer):
88
90
  def visit_AugAssign(self, node: ast.AugAssign) -> AST:
89
91
  # Normalize x += 1 to x = x + 1
90
92
  # This allows detecting clones where one uses += and another uses = +
91
- # We transform AugAssign(target, op, value) to Assign([target], BinOp(target, op, value))
93
+ # We transform AugAssign(target, op, value) to Assign([target],
94
+ # BinOp(target, op, value))
92
95
 
93
96
  # Deepcopy target to avoid reuse issues in the AST
94
97
  target_load = copy.deepcopy(node.target)
@@ -108,23 +111,27 @@ class AstNormalizer(ast.NodeTransformer):
108
111
 
109
112
 
110
113
  def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
114
+ """
115
+ Dump the normalized AST.
116
+ WARNING: This modifies the AST in-place for performance.
117
+ """
111
118
  normalizer = AstNormalizer(cfg)
112
- # Deepcopy to prevent side effects on the original AST
113
- node_copy = copy.deepcopy(func_node)
114
- new_node = ast.fix_missing_locations(normalizer.visit(node_copy))
119
+ new_node = ast.fix_missing_locations(normalizer.visit(func_node))
115
120
  return ast.dump(new_node, annotate_fields=True, include_attributes=False)
116
121
 
117
122
 
118
123
  def normalized_ast_dump_from_list(
119
124
  nodes: Sequence[ast.AST], cfg: NormalizationConfig
120
125
  ) -> str:
126
+ """
127
+ Dump a list of AST nodes after normalization.
128
+ WARNING: This modifies the AST nodes in-place for performance.
129
+ """
121
130
  normalizer = AstNormalizer(cfg)
122
131
  dumps: list[str] = []
123
132
 
124
133
  for node in nodes:
125
- # Deepcopy to prevent side effects
126
- node_copy = copy.deepcopy(node)
127
- new_node = ast.fix_missing_locations(normalizer.visit(node_copy))
134
+ new_node = ast.fix_missing_locations(normalizer.visit(node))
128
135
  dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False))
129
136
 
130
137
  return ";".join(dumps)
codeclone/py.typed ADDED
File without changes
codeclone/report.py CHANGED
@@ -11,23 +11,24 @@ from __future__ import annotations
11
11
  import json
12
12
  from typing import Any
13
13
 
14
+ GroupItem = dict[str, Any]
15
+ GroupMap = dict[str, list[GroupItem]]
14
16
 
15
- def build_groups(units: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
16
- groups: dict[str, list[dict[str, Any]]] = {}
17
+
18
+ def build_groups(units: list[GroupItem]) -> GroupMap:
19
+ groups: GroupMap = {}
17
20
  for u in units:
18
21
  key = f"{u['fingerprint']}|{u['loc_bucket']}"
19
22
  groups.setdefault(key, []).append(u)
20
23
  return {k: v for k, v in groups.items() if len(v) > 1}
21
24
 
22
25
 
23
- def build_block_groups(
24
- blocks: list[dict[str, Any]], min_functions: int = 2
25
- ) -> dict[str, list[dict[str, Any]]]:
26
- groups: dict[str, list[dict[str, Any]]] = {}
26
+ def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> GroupMap:
27
+ groups: GroupMap = {}
27
28
  for b in blocks:
28
29
  groups.setdefault(b["block_hash"], []).append(b)
29
30
 
30
- filtered: dict[str, list[dict[str, Any]]] = {}
31
+ filtered: GroupMap = {}
31
32
  for h, items in groups.items():
32
33
  functions = {i["qualname"] for i in items}
33
34
  if len(functions) >= min_functions:
@@ -36,7 +37,7 @@ def build_block_groups(
36
37
  return filtered
37
38
 
38
39
 
39
- def to_json(groups: dict) -> str:
40
+ def to_json(groups: GroupMap) -> str:
40
41
  return json.dumps(
41
42
  {
42
43
  "group_count": len(groups),
@@ -52,16 +53,26 @@ def to_json(groups: dict) -> str:
52
53
  )
53
54
 
54
55
 
55
- def to_text(groups: dict) -> str:
56
+ def to_json_report(func_groups: GroupMap, block_groups: GroupMap) -> str:
57
+ return json.dumps(
58
+ {"functions": func_groups, "blocks": block_groups},
59
+ ensure_ascii=False,
60
+ indent=2,
61
+ )
62
+
63
+
64
+ def to_text(groups: GroupMap) -> str:
56
65
  lines: list[str] = []
57
66
  for i, (_, v) in enumerate(
58
67
  sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
59
68
  ):
60
69
  lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===")
61
- for item in v:
62
- lines.append(
70
+ lines.extend(
71
+ [
63
72
  f"- {item['qualname']} "
64
73
  f"{item['filepath']}:{item['start_line']}-{item['end_line']} "
65
74
  f"loc={item.get('loc', item.get('size'))}"
66
- )
75
+ for item in v
76
+ ]
77
+ )
67
78
  return "\n".join(lines).strip() + "\n"
codeclone/scanner.py CHANGED
@@ -8,8 +8,11 @@ Licensed under the MIT License.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import tempfile
12
+ from collections.abc import Iterable
11
13
  from pathlib import Path
12
- from typing import Iterable
14
+
15
+ from .errors import ValidationError
13
16
 
14
17
  DEFAULT_EXCLUDES = (
15
18
  ".git",
@@ -24,15 +27,75 @@ DEFAULT_EXCLUDES = (
24
27
  ".tox",
25
28
  )
26
29
 
30
+ SENSITIVE_DIRS = {
31
+ "/etc",
32
+ "/sys",
33
+ "/proc",
34
+ "/dev",
35
+ "/root",
36
+ "/boot",
37
+ "/var",
38
+ "/private/var",
39
+ "/usr/bin",
40
+ "/usr/sbin",
41
+ "/private/etc",
42
+ }
43
+
44
+
45
+ def _get_tempdir() -> Path:
46
+ return Path(tempfile.gettempdir()).resolve()
47
+
27
48
 
28
49
  def iter_py_files(
29
- root: str, excludes: tuple[str, ...] = DEFAULT_EXCLUDES
50
+ root: str,
51
+ excludes: tuple[str, ...] = DEFAULT_EXCLUDES,
52
+ *,
53
+ max_files: int = 100_000,
30
54
  ) -> Iterable[str]:
31
- rootp = Path(root)
55
+ try:
56
+ rootp = Path(root).resolve(strict=True)
57
+ except (OSError, RuntimeError) as e:
58
+ raise ValidationError(f"Invalid root path '{root}': {e}") from e
59
+
60
+ if not rootp.is_dir():
61
+ raise ValidationError(f"Root must be a directory: {root}")
62
+
63
+ root_str = str(rootp)
64
+ temp_root = _get_tempdir()
65
+ in_temp = False
66
+ try:
67
+ rootp.relative_to(temp_root)
68
+ in_temp = True
69
+ except ValueError:
70
+ in_temp = False
71
+
72
+ if not in_temp:
73
+ if root_str in SENSITIVE_DIRS:
74
+ raise ValidationError(f"Cannot scan sensitive directory: {root}")
75
+
76
+ for sensitive in SENSITIVE_DIRS:
77
+ if root_str.startswith(sensitive + "/"):
78
+ raise ValidationError(f"Cannot scan under sensitive directory: {root}")
79
+
80
+ file_count = 0
32
81
  for p in rootp.rglob("*.py"):
82
+ # Verify path is actually under root (prevent symlink attacks)
83
+ try:
84
+ p.resolve().relative_to(rootp)
85
+ except ValueError:
86
+ # Skipping file outside root (possible symlink traversal)
87
+ continue
88
+
33
89
  parts = set(p.parts)
34
90
  if any(ex in parts for ex in excludes):
35
91
  continue
92
+
93
+ file_count += 1
94
+ if file_count > max_files:
95
+ raise ValidationError(
96
+ f"File count exceeds limit of {max_files}. "
97
+ "Use more specific root or increase limit."
98
+ )
36
99
  yield str(p)
37
100
 
38
101