PyPI - codeclone - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

codeclone 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

codeclone/__init__.py +16 -0
codeclone/baseline.py +8 -0
codeclone/blockhash.py +10 -1
codeclone/blocks.py +26 -16
codeclone/cache.py +8 -0
codeclone/cfg.py +173 -0
codeclone/cli.py +92 -58
codeclone/extractor.py +92 -32
codeclone/fingerprint.py +11 -1
codeclone/html_report.py +953 -0
codeclone/normalize.py +50 -26
codeclone/report.py +25 -9
codeclone/scanner.py +24 -4
codeclone-1.1.0.dist-info/METADATA +254 -0
codeclone-1.1.0.dist-info/RECORD +19 -0
codeclone-1.0.0.dist-info/METADATA +0 -211
codeclone-1.0.0.dist-info/RECORD +0 -17
{codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/WHEEL +0 -0
{codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/entry_points.txt +0 -0
{codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/licenses/LICENSE +0 -0
{codeclone-1.0.0.dist-info → codeclone-1.1.0.dist-info}/top_level.txt +0 -0

codeclone/__init__.py CHANGED Viewed

@@ -0,0 +1,16 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
+from importlib.metadata import version, PackageNotFoundError
+try:
+    __version__ = version("codeclone")
+except PackageNotFoundError:
+    __version__ = "dev"
+__all__ = ["__version__"]

codeclone/baseline.py CHANGED Viewed

@@ -1,3 +1,11 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
 from __future__ import annotations
 import json

codeclone/blockhash.py CHANGED Viewed

@@ -1,3 +1,11 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
 from __future__ import annotations
 import ast
@@ -5,8 +13,9 @@ import hashlib
 from .normalize import NormalizationConfig, AstNormalizer
 def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
     normalizer = AstNormalizer(cfg)
     stmt = ast.fix_missing_locations(normalizer.visit(stmt))
     dump = ast.dump(stmt, annotate_fields=True, include_attributes=False)
-    return hashlib.sha1(dump.encode("utf-8")).hexdigest()
+    return hashlib.sha1(dump.encode("utf-8")).hexdigest()

codeclone/blocks.py CHANGED Viewed

@@ -1,3 +1,11 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
 from __future__ import annotations
 import ast
@@ -18,13 +26,13 @@ class BlockUnit:
 def extract_blocks(
-        func_node: ast.AST,
-        *,
-        filepath: str,
-        qualname: str,
-        cfg: NormalizationConfig,
-        block_size: int,
-        max_blocks: int,
+    func_node: ast.AST,
+    *,
+    filepath: str,
+    qualname: str,
+    cfg: NormalizationConfig,
+    block_size: int,
+    max_blocks: int,
 ) -> list[BlockUnit]:
     body = getattr(func_node, "body", None)
     if not isinstance(body, list) or len(body) < block_size:
@@ -45,16 +53,18 @@ def extract_blocks(
         if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
             continue
-        bh = "|".join(stmt_hashes[i:i + block_size])
+        bh = "|".join(stmt_hashes[i : i + block_size])
-        blocks.append(BlockUnit(
-            block_hash=bh,
-            filepath=filepath,
-            qualname=qualname,
-            start_line=start,
-            end_line=end,
-            size=block_size,
-        ))
+        blocks.append(
+            BlockUnit(
+                block_hash=bh,
+                filepath=filepath,
+                qualname=qualname,
+                start_line=start,
+                end_line=end,
+                size=block_size,
+            )
+        )
         last_start = start
         if len(blocks) >= max_blocks:

codeclone/cache.py CHANGED Viewed

@@ -1,3 +1,11 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
 from __future__ import annotations
 import json

codeclone/cfg.py ADDED Viewed

@@ -0,0 +1,173 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
+from __future__ import annotations
+import ast
+from dataclasses import dataclass, field
+from typing import Iterable
+# =========================
+# Core CFG structures
+# =========================
+@dataclass(eq=False)
+class Block:
+    id: int
+    statements: list[ast.stmt] = field(default_factory=list)
+    successors: set["Block"] = field(default_factory=set)
+    is_terminated: bool = False
+    def add_successor(self, block: Block) -> None:
+        self.successors.add(block)
+    def __hash__(self) -> int:
+        return hash(self.id)
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, Block) and self.id == other.id
+@dataclass
+class CFG:
+    qualname: str
+    blocks: list[Block] = field(default_factory=list)
+    entry: Block = field(init=False)
+    exit: Block = field(init=False)
+    def __post_init__(self) -> None:
+        self.entry = self.create_block()
+        self.exit = self.create_block()
+    def create_block(self) -> Block:
+        block = Block(id=len(self.blocks))
+        self.blocks.append(block)
+        return block
+# =========================
+# CFG Builder
+# =========================
+class CFGBuilder:
+    def __init__(self) -> None:
+        self.cfg: CFG
+        self.current: Block
+    def build(
+        self,
+        qualname: str,
+        node: ast.FunctionDef | ast.AsyncFunctionDef,
+    ) -> CFG:
+        self.cfg = CFG(qualname)
+        self.current = self.cfg.entry
+        self._visit_statements(node.body)
+        if not self.current.is_terminated:
+            self.current.add_successor(self.cfg.exit)
+        return self.cfg
+    # ---------- Internals ----------
+    def _visit_statements(self, stmts: Iterable[ast.stmt]) -> None:
+        for stmt in stmts:
+            if self.current.is_terminated:
+                break
+            self._visit(stmt)
+    def _visit(self, stmt: ast.stmt) -> None:
+        match stmt:
+            case ast.Return():
+                self.current.statements.append(stmt)
+                self.current.is_terminated = True
+                self.current.add_successor(self.cfg.exit)
+            case ast.Raise():
+                self.current.statements.append(stmt)
+                self.current.is_terminated = True
+                self.current.add_successor(self.cfg.exit)
+            case ast.If():
+                self._visit_if(stmt)
+            case ast.While():
+                self._visit_while(stmt)
+            case ast.For():
+                self._visit_for(stmt)
+            case _:
+                self.current.statements.append(stmt)
+    # ---------- Control Flow ----------
+    def _visit_if(self, stmt: ast.If) -> None:
+        self.current.statements.append(ast.Expr(value=stmt.test))
+        then_block = self.cfg.create_block()
+        else_block = self.cfg.create_block()
+        after_block = self.cfg.create_block()
+        self.current.add_successor(then_block)
+        self.current.add_successor(else_block)
+        self.current = then_block
+        self._visit_statements(stmt.body)
+        if not self.current.is_terminated:
+            self.current.add_successor(after_block)
+        self.current = else_block
+        self._visit_statements(stmt.orelse)
+        if not self.current.is_terminated:
+            self.current.add_successor(after_block)
+        self.current = after_block
+    def _visit_while(self, stmt: ast.While) -> None:
+        cond_block = self.cfg.create_block()
+        body_block = self.cfg.create_block()
+        after_block = self.cfg.create_block()
+        self.current.add_successor(cond_block)
+        self.current = cond_block
+        self.current.statements.append(ast.Expr(value=stmt.test))
+        self.current.add_successor(body_block)
+        self.current.add_successor(after_block)
+        self.current = body_block
+        self._visit_statements(stmt.body)
+        if not self.current.is_terminated:
+            self.current.add_successor(cond_block)
+        self.current = after_block
+    def _visit_for(self, stmt: ast.For) -> None:
+        iter_block = self.cfg.create_block()
+        body_block = self.cfg.create_block()
+        after_block = self.cfg.create_block()
+        self.current.add_successor(iter_block)
+        self.current = iter_block
+        self.current.statements.append(ast.Expr(value=stmt.iter))
+        self.current.add_successor(body_block)
+        self.current.add_successor(after_block)
+        self.current = body_block
+        self._visit_statements(stmt.body)
+        if not self.current.is_terminated:
+            self.current.add_successor(iter_block)
+        self.current = after_block

codeclone/cli.py CHANGED Viewed

@@ -1,39 +1,71 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
 from __future__ import annotations
 import argparse
+from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
 from .baseline import Baseline
 from .cache import Cache, file_stat_signature
 from .extractor import extract_units_from_source
+from .html_report import build_html_report
 from .normalize import NormalizationConfig
 from .report import build_groups, build_block_groups, to_json, to_text
 from .scanner import iter_py_files, module_name_from_path
-def main():
+def process_file(
+    filepath: str,
+    root: str,
+    cfg: NormalizationConfig,
+    min_loc: int,
+    min_stmt: int,
+) -> tuple[str, dict, list, list] | None:
+    try:
+        source = Path(filepath).read_text("utf-8")
+    except UnicodeDecodeError:
+        return None
+    stat = file_stat_signature(filepath)
+    module_name = module_name_from_path(root, filepath)
+    units, blocks = extract_units_from_source(
+        source=source,
+        filepath=filepath,
+        module_name=module_name,
+        cfg=cfg,
+        min_loc=min_loc,
+        min_stmt=min_stmt,
+    )
+    return filepath, stat, units, blocks
+def main() -> None:
     ap = argparse.ArgumentParser("codeclone")
     ap.add_argument("root", help="Project root")
+    ap.add_argument("--processes", type=int, default=4)
     ap.add_argument("--cache", default="~/.cache/codeclone/")
     ap.add_argument("--min-loc", type=int, default=15)
     ap.add_argument("--min-stmt", type=int, default=6)
     ap.add_argument("--json-out", default="")
     ap.add_argument("--text-out", default="")
+    ap.add_argument("--html-out", default="")
     ap.add_argument("--fail-if-groups", type=int, default=-1)
     ap.add_argument("--baseline", default="~/.config/codeclone/baseline.json")
-    ap.add_argument("--update-baseline", action="store_true",
-                    help="Write current clones as baseline")
-    ap.add_argument("--fail-on-new", action="store_true",
-                    help="Fail if new clones appear vs baseline")
+    ap.add_argument("--update-baseline", action="store_true")
+    ap.add_argument("--fail-on-new", action="store_true")
     args = ap.parse_args()
-    cfg = NormalizationConfig(
-        ignore_docstrings=True,
-        ignore_type_annotations=True,
-        normalize_attributes=True,
-        normalize_constants=True,
-        normalize_names=True,
-    )
+    cfg = NormalizationConfig()
     cache = Cache(args.cache)
     cache.load()
@@ -42,6 +74,8 @@ def main():
     all_blocks: list[dict] = []
     changed = 0
+    files_to_process: list[str] = []
     for fp in iter_py_files(args.root):
         stat = file_stat_signature(fp)
         cached = cache.get_file_entry(fp)
@@ -49,32 +83,52 @@ def main():
         if cached and cached.get("stat") == stat:
             all_units.extend(cached.get("units", []))
             all_blocks.extend(cached.get("blocks", []))
-            continue
-        try:
-            source = Path(fp).read_text("utf-8")
-        except UnicodeDecodeError:
-            continue
-        module_name = module_name_from_path(args.root, fp)
-        units, blocks = extract_units_from_source(
-            source=source,
-            filepath=fp,
-            module_name=module_name,
-            cfg=cfg,
-            min_loc=args.min_loc,
-            min_stmt=args.min_stmt,
-        )
-        cache.put_file_entry(fp, stat, units, blocks)
-        changed += 1
-        all_units.extend([u.__dict__ for u in units])
-        all_blocks.extend([b.__dict__ for b in blocks])
+        else:
+            files_to_process.append(fp)
+    with ProcessPoolExecutor(max_workers=args.processes) as executor:
+        futures = [
+            executor.submit(
+                process_file,
+                fp,
+                args.root,
+                cfg,
+                args.min_loc,
+                args.min_stmt,
+            )
+            for fp in files_to_process
+        ]
+        for future in futures:
+            result = future.result()
+            if result is None:
+                continue
+            fp, stat, units, blocks = result
+            cache.put_file_entry(fp, stat, units, blocks)
+            changed += 1
+            all_units.extend([u.__dict__ for u in units])
+            all_blocks.extend([b.__dict__ for b in blocks])
     func_groups = build_groups(all_units)
     block_groups = build_block_groups(all_blocks)
+    if args.html_out:
+        out = Path(args.html_out)
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(
+            build_html_report(
+                func_groups=func_groups,
+                block_groups=block_groups,
+                title="CodeClone Report",
+                context_lines=3,
+                max_snippet_lines=220,
+            ),
+            "utf-8",
+        )
     baseline = Baseline(args.baseline)
     baseline.load()
@@ -91,10 +145,7 @@ def main():
         out = Path(args.json_out)
         out.parent.mkdir(parents=True, exist_ok=True)
         out.write_text(
-            to_json({
-                "functions": func_groups,
-                "blocks": block_groups,
-            }),
+            to_json({"functions": func_groups, "blocks": block_groups}),
             "utf-8",
         )
@@ -114,26 +165,9 @@ def main():
     print(f"Function clone groups: {len(func_groups)}")
     print(f"Block clone groups: {len(block_groups)}")
-    if args.fail_on_new:
-        if new_func or new_block:
-            print("\n❌ New code clones detected\n")
-            if new_func:
-                print(f"New FUNCTION clone groups: {len(new_func)}")
-                for k in sorted(new_func):
-                    print(f"  - {k}")
-            if new_block:
-                print(f"New BLOCK clone groups: {len(new_block)}")
-                for k in sorted(new_block):
-                    print(f"  - {k}")
-            raise SystemExit(3)
-    print(f"Baseline function clones: {len(baseline.functions)}")
-    print(f"Baseline block clones: {len(baseline.blocks)}")
-    print(f"New function clones: {len(new_func)}")
-    print(f"New block clones: {len(new_block)}")
+    if args.fail_on_new and (new_func or new_block):
+        print("\n❌ New code clones detected\n")
+        raise SystemExit(3)
     cache.save()

codeclone/extractor.py CHANGED Viewed

@@ -1,11 +1,26 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
 from __future__ import annotations
 import ast
 from dataclasses import dataclass
+from typing import Sequence
 from .blocks import extract_blocks, BlockUnit
+from .cfg import CFGBuilder
 from .fingerprint import sha1, bucket_loc
-from .normalize import NormalizationConfig, normalized_ast_dump
+from .normalize import NormalizationConfig, normalized_ast_dump_from_list
+# =========================
+# Data structures
+# =========================
 @dataclass(frozen=True)
@@ -20,37 +35,83 @@ class Unit:
     loc_bucket: str
+# =========================
+# Helpers
+# =========================
 def _stmt_count(node: ast.AST) -> int:
     body = getattr(node, "body", None)
     return len(body) if isinstance(body, list) else 0
 class _QualnameBuilder(ast.NodeVisitor):
-    def __init__(self):
+    def __init__(self) -> None:
         self.stack: list[str] = []
-        self.units: list[tuple[str, ast.AST]] = []
+        self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
-    def visit_ClassDef(self, node: ast.ClassDef):
+    def visit_ClassDef(self, node: ast.ClassDef) -> None:
         self.stack.append(node.name)
         self.generic_visit(node)
         self.stack.pop()
-    def visit_FunctionDef(self, node: ast.FunctionDef):
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
         name = ".".join(self.stack + [node.name]) if self.stack else node.name
         self.units.append((name, node))
-    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
+    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
         name = ".".join(self.stack + [node.name]) if self.stack else node.name
         self.units.append((name, node))
+# =========================
+# CFG fingerprinting
+# =========================
+def get_cfg_fingerprint(
+    node: ast.FunctionDef | ast.AsyncFunctionDef,
+    cfg: NormalizationConfig,
+    qualname: str,
+) -> str:
+    """
+    Build CFG, normalize it into a canonical form, and hash it.
+    """
+    builder = CFGBuilder()
+    graph = builder.build(qualname, node)
+    parts: list[str] = []
+    # Stable order for deterministic hash
+    for block in sorted(graph.blocks, key=lambda b: b.id):
+        # NOTE: normalized_ast_dump_from_list must accept Sequence[ast.AST] (covariant),
+        # but even if it still accepts list[ast.AST], passing list[ast.stmt] will fail
+        # due to invariance. We pass as Sequence[ast.AST] via a typed view.
+        stmts_as_ast: Sequence[ast.AST] = block.statements
+        normalized_stmts = normalized_ast_dump_from_list(stmts_as_ast, cfg)
+        successor_ids = sorted(succ.id for succ in block.successors)
+        parts.append(
+            f"BLOCK[{block.id}]:{normalized_stmts}"
+            f"|SUCCESSORS:{','.join(map(str, successor_ids))}"
+        )
+    return sha1("|".join(parts))
+# =========================
+# Public API
+# =========================
 def extract_units_from_source(
-        source: str,
-        filepath: str,
-        module_name: str,
-        cfg: NormalizationConfig,
-        min_loc: int,
-        min_stmt: int,
+    source: str,
+    filepath: str,
+    module_name: str,
+    cfg: NormalizationConfig,
+    min_loc: int,
+    min_stmt: int,
 ) -> tuple[list[Unit], list[BlockUnit]]:
     try:
         tree = ast.parse(source)
@@ -66,6 +127,7 @@ def extract_units_from_source(
     for local_name, node in qb.units:
         start = getattr(node, "lineno", None)
         end = getattr(node, "end_lineno", None)
         if not start or not end or end < start:
             continue
@@ -76,26 +138,24 @@ def extract_units_from_source(
             continue
         qualname = f"{module_name}:{local_name}"
-        dump = normalized_ast_dump(node, cfg)
-        fp = sha1(dump)
-        # ✅ __init__ INCLUDED as function-level unit
-        units.append(Unit(
-            qualname=qualname,
-            filepath=filepath,
-            start_line=start,
-            end_line=end,
-            loc=loc,
-            stmt_count=stmt_count,
-            fingerprint=fp,
-            loc_bucket=bucket_loc(loc),
-        ))
-        if (
-                not local_name.endswith("__init__")
-                and loc >= 40
-                and stmt_count >= 10
-        ):
+        fingerprint = get_cfg_fingerprint(node, cfg, qualname)
+        # Function-level unit (including __init__)
+        units.append(
+            Unit(
+                qualname=qualname,
+                filepath=filepath,
+                start_line=start,
+                end_line=end,
+                loc=loc,
+                stmt_count=stmt_count,
+                fingerprint=fingerprint,
+                loc_bucket=bucket_loc(loc),
+            )
+        )
+        # Block-level units (exclude __init__)
+        if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
             blocks = extract_blocks(
                 node,
                 filepath=filepath,

codeclone 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

codeclone 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl