PyPI - codeclone - Versions diffs - 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

codeclone/__init__.py +1 -1
codeclone/baseline.py +44 -14
codeclone/blockhash.py +1 -1
codeclone/blocks.py +4 -3
codeclone/cache.py +154 -17
codeclone/cfg.py +128 -38
codeclone/cfg_model.py +47 -0
codeclone/cli.py +524 -100
codeclone/errors.py +27 -0
codeclone/extractor.py +101 -24
codeclone/html_report.py +230 -691
codeclone/normalize.py +43 -13
codeclone/py.typed +0 -0
codeclone/report.py +23 -12
codeclone/scanner.py +66 -3
codeclone/templates.py +1262 -0
{codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/METADATA +62 -34
codeclone-1.2.1.dist-info/RECORD +23 -0
{codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/WHEEL +1 -1
codeclone-1.1.0.dist-info/RECORD +0 -19
{codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/entry_points.txt +0 -0
{codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/licenses/LICENSE +0 -0
{codeclone-1.1.0.dist-info → codeclone-1.2.1.dist-info}/top_level.txt +0 -0

codeclone/normalize.py CHANGED Viewed

@@ -9,11 +9,13 @@ Licensed under the MIT License.
 from __future__ import annotations
 import ast
+import copy
+from ast import AST
 from collections.abc import Sequence
 from dataclasses import dataclass
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class NormalizationConfig:
     ignore_docstrings: bool = True
     ignore_type_annotations: bool = True
@@ -23,17 +25,19 @@ class NormalizationConfig:
 class AstNormalizer(ast.NodeTransformer):
+    __slots__ = ("cfg",)
     def __init__(self, cfg: NormalizationConfig):
         super().__init__()
         self.cfg = cfg
-    def visit_FunctionDef(self, node: ast.FunctionDef):
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
         return self._visit_func(node)
-    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
+    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
         return self._visit_func(node)
-    def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef):
+    def _visit_func(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> ast.AST:
         # Drop docstring
         if self.cfg.ignore_docstrings and node.body:
             first = node.body[0]
@@ -61,12 +65,12 @@ class AstNormalizer(ast.NodeTransformer):
         return self.generic_visit(node)
-    def visit_arg(self, node: ast.arg):
+    def visit_arg(self, node: ast.arg) -> ast.arg:
         if self.cfg.ignore_type_annotations:
             node.annotation = None
         return node
-    def visit_Name(self, node: ast.Name):
+    def visit_Name(self, node: ast.Name) -> ast.Name:
         if self.cfg.normalize_names:
             node.id = "_VAR_"
         return node
@@ -78,30 +82,56 @@ class AstNormalizer(ast.NodeTransformer):
             new_node.attr = "_ATTR_"
         return new_node
-    def visit_Constant(self, node: ast.Constant):
+    def visit_Constant(self, node: ast.Constant) -> ast.Constant:
         if self.cfg.normalize_constants:
             node.value = "_CONST_"
         return node
+    def visit_AugAssign(self, node: ast.AugAssign) -> AST:
+        # Normalize x += 1 to x = x + 1
+        # This allows detecting clones where one uses += and another uses = +
+        # We transform AugAssign(target, op, value) to Assign([target],
+        # BinOp(target, op, value))
+        # Deepcopy target to avoid reuse issues in the AST
+        target_load = copy.deepcopy(node.target)
+        # Ensure context is Load() for the right-hand side usage
+        if hasattr(target_load, "ctx"):
+            target_load.ctx = ast.Load()
+        new_node = ast.Assign(
+            targets=[node.target],
+            value=ast.BinOp(left=target_load, op=node.op, right=node.value),
+            lineno=node.lineno,
+            col_offset=node.col_offset,
+            end_lineno=getattr(node, "end_lineno", None),
+            end_col_offset=getattr(node, "end_col_offset", None),
+        )
+        return self.generic_visit(new_node)
 def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
+    """
+    Dump the normalized AST.
+    WARNING: This modifies the AST in-place for performance.
+    """
     normalizer = AstNormalizer(cfg)
-    new_node = ast.fix_missing_locations(
-        normalizer.visit(ast.copy_location(func_node, func_node))
-    )
+    new_node = ast.fix_missing_locations(normalizer.visit(func_node))
     return ast.dump(new_node, annotate_fields=True, include_attributes=False)
 def normalized_ast_dump_from_list(
     nodes: Sequence[ast.AST], cfg: NormalizationConfig
 ) -> str:
+    """
+    Dump a list of AST nodes after normalization.
+    WARNING: This modifies the AST nodes in-place for performance.
+    """
     normalizer = AstNormalizer(cfg)
     dumps: list[str] = []
     for node in nodes:
-        new_node = ast.fix_missing_locations(
-            normalizer.visit(ast.copy_location(node, node))
-        )
+        new_node = ast.fix_missing_locations(normalizer.visit(node))
         dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False))
     return ";".join(dumps)

codeclone/py.typed ADDED Viewed

File without changes

codeclone/report.py CHANGED Viewed

@@ -11,23 +11,24 @@ from __future__ import annotations
 import json
 from typing import Any
+GroupItem = dict[str, Any]
+GroupMap = dict[str, list[GroupItem]]
-def build_groups(units: list[dict[str, Any]]) -> dict[str, list[dict]]:
-    groups: dict[str, list[dict]] = {}
+def build_groups(units: list[GroupItem]) -> GroupMap:
+    groups: GroupMap = {}
     for u in units:
         key = f"{u['fingerprint']}|{u['loc_bucket']}"
         groups.setdefault(key, []).append(u)
     return {k: v for k, v in groups.items() if len(v) > 1}
-def build_block_groups(
-    blocks: list[dict], min_functions: int = 2
-) -> dict[str, list[dict]]:
-    groups: dict[str, list[dict]] = {}
+def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> GroupMap:
+    groups: GroupMap = {}
     for b in blocks:
         groups.setdefault(b["block_hash"], []).append(b)
-    filtered: dict[str, list[dict]] = {}
+    filtered: GroupMap = {}
     for h, items in groups.items():
         functions = {i["qualname"] for i in items}
         if len(functions) >= min_functions:
@@ -36,7 +37,7 @@ def build_block_groups(
     return filtered
-def to_json(groups: dict) -> str:
+def to_json(groups: GroupMap) -> str:
     return json.dumps(
         {
             "group_count": len(groups),
@@ -52,16 +53,26 @@ def to_json(groups: dict) -> str:
     )
-def to_text(groups: dict) -> str:
+def to_json_report(func_groups: GroupMap, block_groups: GroupMap) -> str:
+    return json.dumps(
+        {"functions": func_groups, "blocks": block_groups},
+        ensure_ascii=False,
+        indent=2,
+    )
+def to_text(groups: GroupMap) -> str:
     lines: list[str] = []
     for i, (_, v) in enumerate(
         sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
     ):
         lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===")
-        for item in v:
-            lines.append(
+        lines.extend(
+            [
                 f"- {item['qualname']} "
                 f"{item['filepath']}:{item['start_line']}-{item['end_line']} "
                 f"loc={item.get('loc', item.get('size'))}"
-            )
+                for item in v
+            ]
+        )
     return "\n".join(lines).strip() + "\n"

codeclone/scanner.py CHANGED Viewed

@@ -8,8 +8,11 @@ Licensed under the MIT License.
 from __future__ import annotations
+import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable
+from .errors import ValidationError
 DEFAULT_EXCLUDES = (
     ".git",
@@ -24,15 +27,75 @@ DEFAULT_EXCLUDES = (
     ".tox",
 )
+SENSITIVE_DIRS = {
+    "/etc",
+    "/sys",
+    "/proc",
+    "/dev",
+    "/root",
+    "/boot",
+    "/var",
+    "/private/var",
+    "/usr/bin",
+    "/usr/sbin",
+    "/private/etc",
+}
+def _get_tempdir() -> Path:
+    return Path(tempfile.gettempdir()).resolve()
 def iter_py_files(
-    root: str, excludes: tuple[str, ...] = DEFAULT_EXCLUDES
+    root: str,
+    excludes: tuple[str, ...] = DEFAULT_EXCLUDES,
+    *,
+    max_files: int = 100_000,
 ) -> Iterable[str]:
-    rootp = Path(root)
+    try:
+        rootp = Path(root).resolve(strict=True)
+    except (OSError, RuntimeError) as e:
+        raise ValidationError(f"Invalid root path '{root}': {e}") from e
+    if not rootp.is_dir():
+        raise ValidationError(f"Root must be a directory: {root}")
+    root_str = str(rootp)
+    temp_root = _get_tempdir()
+    in_temp = False
+    try:
+        rootp.relative_to(temp_root)
+        in_temp = True
+    except ValueError:
+        in_temp = False
+    if not in_temp:
+        if root_str in SENSITIVE_DIRS:
+            raise ValidationError(f"Cannot scan sensitive directory: {root}")
+        for sensitive in SENSITIVE_DIRS:
+            if root_str.startswith(sensitive + "/"):
+                raise ValidationError(f"Cannot scan under sensitive directory: {root}")
+    file_count = 0
     for p in rootp.rglob("*.py"):
+        # Verify path is actually under root (prevent symlink attacks)
+        try:
+            p.resolve().relative_to(rootp)
+        except ValueError:
+            # Skipping file outside root (possible symlink traversal)
+            continue
         parts = set(p.parts)
         if any(ex in parts for ex in excludes):
             continue
+        file_count += 1
+        if file_count > max_files:
+            raise ValidationError(
+                f"File count exceeds limit of {max_files}. "
+                "Use more specific root or increase limit."
+            )
         yield str(p)

codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl