codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeclone/normalize.py CHANGED
@@ -9,11 +9,13 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
+ import copy
13
+ from ast import AST
12
14
  from collections.abc import Sequence
13
15
  from dataclasses import dataclass
14
16
 
15
17
 
16
- @dataclass(frozen=True)
18
+ @dataclass(frozen=True, slots=True)
17
19
  class NormalizationConfig:
18
20
  ignore_docstrings: bool = True
19
21
  ignore_type_annotations: bool = True
@@ -23,17 +25,19 @@ class NormalizationConfig:
23
25
 
24
26
 
25
27
  class AstNormalizer(ast.NodeTransformer):
28
+ __slots__ = ("cfg",)
29
+
26
30
  def __init__(self, cfg: NormalizationConfig):
27
31
  super().__init__()
28
32
  self.cfg = cfg
29
33
 
30
- def visit_FunctionDef(self, node: ast.FunctionDef):
34
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
31
35
  return self._visit_func(node)
32
36
 
33
- def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
37
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
34
38
  return self._visit_func(node)
35
39
 
36
- def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef):
40
+ def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> ast.AST:
37
41
  # Drop docstring
38
42
  if self.cfg.ignore_docstrings and node.body:
39
43
  first = node.body[0]
@@ -61,12 +65,12 @@ class AstNormalizer(ast.NodeTransformer):
61
65
 
62
66
  return self.generic_visit(node)
63
67
 
64
- def visit_arg(self, node: ast.arg):
68
+ def visit_arg(self, node: ast.arg) -> ast.arg:
65
69
  if self.cfg.ignore_type_annotations:
66
70
  node.annotation = None
67
71
  return node
68
72
 
69
- def visit_Name(self, node: ast.Name):
73
+ def visit_Name(self, node: ast.Name) -> ast.Name:
70
74
  if self.cfg.normalize_names:
71
75
  node.id = "_VAR_"
72
76
  return node
@@ -78,30 +82,56 @@ class AstNormalizer(ast.NodeTransformer):
78
82
  new_node.attr = "_ATTR_"
79
83
  return new_node
80
84
 
81
- def visit_Constant(self, node: ast.Constant):
85
+ def visit_Constant(self, node: ast.Constant) -> ast.Constant:
82
86
  if self.cfg.normalize_constants:
83
87
  node.value = "_CONST_"
84
88
  return node
85
89
 
90
+ def visit_AugAssign(self, node: ast.AugAssign) -> AST:
91
+ # Normalize x += 1 to x = x + 1
92
+ # This allows detecting clones where one uses += and another uses = +
93
+ # We transform AugAssign(target, op, value) to Assign([target],
94
+ # BinOp(target, op, value))
95
+
96
+ # Deepcopy target to avoid reuse issues in the AST
97
+ target_load = copy.deepcopy(node.target)
98
+ # Ensure context is Load() for the right-hand side usage
99
+ if hasattr(target_load, "ctx"):
100
+ target_load.ctx = ast.Load()
101
+
102
+ new_node = ast.Assign(
103
+ targets=[node.target],
104
+ value=ast.BinOp(left=target_load, op=node.op, right=node.value),
105
+ lineno=node.lineno,
106
+ col_offset=node.col_offset,
107
+ end_lineno=getattr(node, "end_lineno", None),
108
+ end_col_offset=getattr(node, "end_col_offset", None),
109
+ )
110
+ return self.generic_visit(new_node)
111
+
86
112
 
87
113
  def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
114
+ """
115
+ Dump the normalized AST.
116
+ WARNING: This modifies the AST in-place for performance.
117
+ """
88
118
  normalizer = AstNormalizer(cfg)
89
- new_node = ast.fix_missing_locations(
90
- normalizer.visit(ast.copy_location(func_node, func_node))
91
- )
119
+ new_node = ast.fix_missing_locations(normalizer.visit(func_node))
92
120
  return ast.dump(new_node, annotate_fields=True, include_attributes=False)
93
121
 
94
122
 
95
123
  def normalized_ast_dump_from_list(
96
124
  nodes: Sequence[ast.AST], cfg: NormalizationConfig
97
125
  ) -> str:
126
+ """
127
+ Dump a list of AST nodes after normalization.
128
+ WARNING: This modifies the AST nodes in-place for performance.
129
+ """
98
130
  normalizer = AstNormalizer(cfg)
99
131
  dumps: list[str] = []
100
132
 
101
133
  for node in nodes:
102
- new_node = ast.fix_missing_locations(
103
- normalizer.visit(ast.copy_location(node, node))
104
- )
134
+ new_node = ast.fix_missing_locations(normalizer.visit(node))
105
135
  dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False))
106
136
 
107
137
  return ";".join(dumps)
codeclone/py.typed ADDED
File without changes
codeclone/report.py CHANGED
@@ -11,23 +11,24 @@ from __future__ import annotations
11
11
  import json
12
12
  from typing import Any
13
13
 
14
+ GroupItem = dict[str, Any]
15
+ GroupMap = dict[str, list[GroupItem]]
14
16
 
15
- def build_groups(units: list[dict[str, Any]]) -> dict[str, list[dict]]:
16
- groups: dict[str, list[dict]] = {}
17
+
18
+ def build_groups(units: list[GroupItem]) -> GroupMap:
19
+ groups: GroupMap = {}
17
20
  for u in units:
18
21
  key = f"{u['fingerprint']}|{u['loc_bucket']}"
19
22
  groups.setdefault(key, []).append(u)
20
23
  return {k: v for k, v in groups.items() if len(v) > 1}
21
24
 
22
25
 
23
- def build_block_groups(
24
- blocks: list[dict], min_functions: int = 2
25
- ) -> dict[str, list[dict]]:
26
- groups: dict[str, list[dict]] = {}
26
+ def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> GroupMap:
27
+ groups: GroupMap = {}
27
28
  for b in blocks:
28
29
  groups.setdefault(b["block_hash"], []).append(b)
29
30
 
30
- filtered: dict[str, list[dict]] = {}
31
+ filtered: GroupMap = {}
31
32
  for h, items in groups.items():
32
33
  functions = {i["qualname"] for i in items}
33
34
  if len(functions) >= min_functions:
@@ -36,7 +37,7 @@ def build_block_groups(
36
37
  return filtered
37
38
 
38
39
 
39
- def to_json(groups: dict) -> str:
40
+ def to_json(groups: GroupMap) -> str:
40
41
  return json.dumps(
41
42
  {
42
43
  "group_count": len(groups),
@@ -52,16 +53,26 @@ def to_json(groups: dict) -> str:
52
53
  )
53
54
 
54
55
 
55
- def to_text(groups: dict) -> str:
56
+ def to_json_report(func_groups: GroupMap, block_groups: GroupMap) -> str:
57
+ return json.dumps(
58
+ {"functions": func_groups, "blocks": block_groups},
59
+ ensure_ascii=False,
60
+ indent=2,
61
+ )
62
+
63
+
64
+ def to_text(groups: GroupMap) -> str:
56
65
  lines: list[str] = []
57
66
  for i, (_, v) in enumerate(
58
67
  sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
59
68
  ):
60
69
  lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===")
61
- for item in v:
62
- lines.append(
70
+ lines.extend(
71
+ [
63
72
  f"- {item['qualname']} "
64
73
  f"{item['filepath']}:{item['start_line']}-{item['end_line']} "
65
74
  f"loc={item.get('loc', item.get('size'))}"
66
- )
75
+ for item in v
76
+ ]
77
+ )
67
78
  return "\n".join(lines).strip() + "\n"
codeclone/scanner.py CHANGED
@@ -8,8 +8,11 @@ Licensed under the MIT License.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import tempfile
12
+ from collections.abc import Iterable
11
13
  from pathlib import Path
12
- from typing import Iterable
14
+
15
+ from .errors import ValidationError
13
16
 
14
17
  DEFAULT_EXCLUDES = (
15
18
  ".git",
@@ -24,15 +27,75 @@ DEFAULT_EXCLUDES = (
24
27
  ".tox",
25
28
  )
26
29
 
30
+ SENSITIVE_DIRS = {
31
+ "/etc",
32
+ "/sys",
33
+ "/proc",
34
+ "/dev",
35
+ "/root",
36
+ "/boot",
37
+ "/var",
38
+ "/private/var",
39
+ "/usr/bin",
40
+ "/usr/sbin",
41
+ "/private/etc",
42
+ }
43
+
44
+
45
+ def _get_tempdir() -> Path:
46
+ return Path(tempfile.gettempdir()).resolve()
47
+
27
48
 
28
49
  def iter_py_files(
29
- root: str, excludes: tuple[str, ...] = DEFAULT_EXCLUDES
50
+ root: str,
51
+ excludes: tuple[str, ...] = DEFAULT_EXCLUDES,
52
+ *,
53
+ max_files: int = 100_000,
30
54
  ) -> Iterable[str]:
31
- rootp = Path(root)
55
+ try:
56
+ rootp = Path(root).resolve(strict=True)
57
+ except (OSError, RuntimeError) as e:
58
+ raise ValidationError(f"Invalid root path '{root}': {e}") from e
59
+
60
+ if not rootp.is_dir():
61
+ raise ValidationError(f"Root must be a directory: {root}")
62
+
63
+ root_str = str(rootp)
64
+ temp_root = _get_tempdir()
65
+ in_temp = False
66
+ try:
67
+ rootp.relative_to(temp_root)
68
+ in_temp = True
69
+ except ValueError:
70
+ in_temp = False
71
+
72
+ if not in_temp:
73
+ if root_str in SENSITIVE_DIRS:
74
+ raise ValidationError(f"Cannot scan sensitive directory: {root}")
75
+
76
+ for sensitive in SENSITIVE_DIRS:
77
+ if root_str.startswith(sensitive + "/"):
78
+ raise ValidationError(f"Cannot scan under sensitive directory: {root}")
79
+
80
+ file_count = 0
32
81
  for p in rootp.rglob("*.py"):
82
+ # Verify path is actually under root (prevent symlink attacks)
83
+ try:
84
+ p.resolve().relative_to(rootp)
85
+ except ValueError:
86
+ # Skipping file outside root (possible symlink traversal)
87
+ continue
88
+
33
89
  parts = set(p.parts)
34
90
  if any(ex in parts for ex in excludes):
35
91
  continue
92
+
93
+ file_count += 1
94
+ if file_count > max_files:
95
+ raise ValidationError(
96
+ f"File count exceeds limit of {max_files}. "
97
+ "Use more specific root or increase limit."
98
+ )
36
99
  yield str(p)
37
100
 
38
101