codeclone 1.2.0-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
codeclone/cli.py CHANGED
@@ -1,36 +1,32 @@
-"""
-CodeClone — AST and CFG-based code clone detector for Python
-focused on architectural duplication.
-
-Copyright (c) 2026 Den Rozhnovskiy
-Licensed under the MIT License.
-"""
-
 from __future__ import annotations
 
 import argparse
+import os
 import sys
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import asdict, dataclass
 from pathlib import Path
+from typing import Any, cast
 
 from rich.console import Console
 from rich.panel import Panel
 from rich.progress import (
+    BarColumn,
     Progress,
     SpinnerColumn,
     TextColumn,
-    BarColumn,
     TimeElapsedColumn,
 )
 from rich.table import Table
 from rich.theme import Theme
 
 from .baseline import Baseline
-from .cache import Cache, file_stat_signature
+from .cache import Cache, CacheEntry, FileStat, file_stat_signature
+from .errors import CacheError
 from .extractor import extract_units_from_source
 from .html_report import build_html_report
 from .normalize import NormalizationConfig
-from .report import build_groups, build_block_groups, to_json, to_text
+from .report import build_block_groups, build_groups, to_json_report, to_text
 from .scanner import iter_py_files, module_name_from_path
 
 # Custom theme for Rich
@@ -45,6 +41,21 @@ custom_theme = Theme(
 )
 console = Console(theme=custom_theme, width=200)
 
+MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+BATCH_SIZE = 100
+
+
+@dataclass(slots=True)
+class ProcessingResult:
+    """Result of processing a single file."""
+
+    filepath: str
+    success: bool
+    error: str | None = None
+    units: list[Any] | None = None
+    blocks: list[Any] | None = None
+    stat: FileStat | None = None
+
 
 def expand_path(p: str) -> Path:
     return Path(p).expanduser().resolve()
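
Review note on the hunk above: `@dataclass(slots=True)` stores fields in fixed slots instead of a per-instance `__dict__`, which keeps these objects cheap when one is created per scanned file, and the `asdict` imported earlier turns them into plain dicts for caching and reporting. A minimal, self-contained sketch of that round-trip (Python 3.10+, values illustrative):

    from dataclasses import asdict, dataclass

    @dataclass(slots=True)
    class ProcessingResult:  # abbreviated copy of the class in the hunk above
        filepath: str
        success: bool
        error: str | None = None

    r = ProcessingResult("pkg/mod.py", success=False, error="Encoding error")
    assert asdict(r) == {"filepath": "pkg/mod.py", "success": False, "error": "Encoding error"}
    # slots=True also rejects stray attributes: r.extra = 1 raises AttributeError.
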
@@ -56,31 +67,76 @@ def process_file(
     cfg: NormalizationConfig,
     min_loc: int,
     min_stmt: int,
-) -> tuple[str, dict, list, list] | None:
+) -> ProcessingResult:
+    """
+    Process a single Python file with comprehensive error handling.
+
+    Args:
+        filepath: Absolute path to the file
+        root: Root directory of the scan
+        cfg: Normalization configuration
+        min_loc: Minimum lines of code to consider a function
+        min_stmt: Minimum statements to consider a function
+
+    Returns:
+        ProcessingResult object indicating success/failure and containing
+        extracted units/blocks if successful.
+    """
+
     try:
-        source = Path(filepath).read_text("utf-8")
-    except UnicodeDecodeError:
-        return None
-
-    stat = file_stat_signature(filepath)
-    module_name = module_name_from_path(root, filepath)
-
-    units, blocks = extract_units_from_source(
-        source=source,
-        filepath=filepath,
-        module_name=module_name,
-        cfg=cfg,
-        min_loc=min_loc,
-        min_stmt=min_stmt,
-    )
+        # Check file size
+        try:
+            st_size = os.path.getsize(filepath)
+            if st_size > MAX_FILE_SIZE:
+                return ProcessingResult(
+                    filepath=filepath,
+                    success=False,
+                    error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
+                )
+        except OSError as e:
+            return ProcessingResult(
+                filepath=filepath, success=False, error=f"Cannot stat file: {e}"
+            )
 
-    return filepath, stat, units, blocks
+        try:
+            source = Path(filepath).read_text("utf-8")
+        except UnicodeDecodeError as e:
+            return ProcessingResult(
+                filepath=filepath, success=False, error=f"Encoding error: {e}"
+            )
+
+        stat = file_stat_signature(filepath)
+        module_name = module_name_from_path(root, filepath)
+
+        units, blocks = extract_units_from_source(
+            source=source,
+            filepath=filepath,
+            module_name=module_name,
+            cfg=cfg,
+            min_loc=min_loc,
+            min_stmt=min_stmt,
+        )
+
+        return ProcessingResult(
+            filepath=filepath,
+            success=True,
+            units=units,
+            blocks=blocks,
+            stat=stat,
+        )
+
+    except Exception as e:
+        return ProcessingResult(
+            filepath=filepath,
+            success=False,
+            error=f"Unexpected error: {type(e).__name__}: {e}",
+        )
 
 
-def print_banner():
+def print_banner() -> None:
     console.print(
         Panel.fit(
-            "[bold white]CodeClone[/bold white] [dim]v1.2.0[/dim]\n"
+            "[bold white]CodeClone[/bold white] [dim]v1.2.1[/dim]\n"
             "[italic]Architectural duplication detector[/italic]",
             border_style="blue",
             padding=(0, 2),
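
The key shift in `process_file` is that every failure, up to the final blanket `except Exception`, becomes data on the returned `ProcessingResult` instead of an exception. That matters under `ProcessPoolExecutor`: an exception re-raised from `future.result()` no longer knows which file it belonged to (the old code could only log `Failed to process file: {e}`), whereas the result object carries `filepath` and `error` together, which is what feeds the new `failed_files` summary further down. A sketch of the caller-side contract (path and thresholds hypothetical; assumes `NormalizationConfig()` constructs with defaults):

    cfg = NormalizationConfig()
    result = process_file("/repo/pkg/mod.py", "/repo", cfg, min_loc=5, min_stmt=3)
    if result.success:
        units = result.units or []  # populated only on success
    else:
        print(f"skipped {result.filepath}: {result.error}")  # failure as data
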
@@ -185,9 +241,13 @@ def main() -> None:
 
     print_banner()
 
-    root_path = Path(args.root).resolve()
-    if not root_path.exists():
-        console.print(f"[error]Root path does not exist: {root_path}[/error]")
+    try:
+        root_path = Path(args.root).resolve()
+        if not root_path.exists():
+            console.print(f"[error]Root path does not exist: {root_path}[/error]")
+            sys.exit(1)
+    except Exception as e:
+        console.print(f"[error]Invalid root path: {e}[/error]")
         sys.exit(1)
 
     console.print(f"[info]Scanning root:[/info] {root_path}")
@@ -197,101 +257,213 @@ def main() -> None:
     cache_path = Path(args.cache_dir).expanduser()
     cache = Cache(cache_path)
     cache.load()
+    if cache.load_warning:
+        console.print(f"[warning]{cache.load_warning}[/warning]")
 
-    all_units: list[dict] = []
-    all_blocks: list[dict] = []
+    all_units: list[dict[str, Any]] = []
+    all_blocks: list[dict[str, Any]] = []
     changed_files_count = 0
     files_to_process: list[str] = []
 
+    def _get_cached_entry(
+        fp: str,
+    ) -> tuple[FileStat | None, CacheEntry | None, str | None]:
+        try:
+            stat = file_stat_signature(fp)
+        except OSError as e:
+            return None, None, f"[warning]Skipping file {fp}: {e}[/warning]"
+        cached = cache.get_file_entry(fp)
+        return stat, cached, None
+
+    def _safe_process_file(fp: str) -> ProcessingResult | None:
+        try:
+            return process_file(
+                fp,
+                str(root_path),
+                cfg,
+                args.min_loc,
+                args.min_stmt,
+            )
+        except Exception as e:
+            console.print(f"[warning]Worker failed: {e}[/warning]")
+            return None
+
+    def _safe_future_result(future: Any) -> tuple[ProcessingResult | None, str | None]:
+        try:
+            return future.result(), None
+        except Exception as e:
+            return None, str(e)
+
     # Discovery phase
     with console.status("[bold green]Discovering Python files...", spinner="dots"):
-        for fp in iter_py_files(str(root_path)):
-            stat = file_stat_signature(fp)
-            cached = cache.get_file_entry(fp)
-            if cached and cached.get("stat") == stat:
-                all_units.extend(cached.get("units", []))
-                all_blocks.extend(cached.get("blocks", []))
-            else:
-                files_to_process.append(fp)
+        try:
+            for fp in iter_py_files(str(root_path)):
+                stat, cached, warn = _get_cached_entry(fp)
+                if warn:
+                    console.print(warn)
+                    continue
+                if cached and cached.get("stat") == stat:
+                    all_units.extend(
+                        cast(
+                            list[dict[str, Any]],
+                            cast(object, cached.get("units", [])),
+                        )
+                    )
+                    all_blocks.extend(
+                        cast(
+                            list[dict[str, Any]],
+                            cast(object, cached.get("blocks", [])),
+                        )
+                    )
+                else:
+                    files_to_process.append(fp)
+        except Exception as e:
+            console.print(f"[error]Scan failed: {e}[/error]")
+            sys.exit(1)
 
     total_files = len(files_to_process)
+    failed_files = []
 
     # Processing phase
     if total_files > 0:
-        if args.no_progress:
-            console.print(f"[info]Processing {total_files} changed files...[/info]")
+
+        def handle_result(result: ProcessingResult) -> None:
+            nonlocal changed_files_count
+            if result.success and result.stat:
+                cache.put_file_entry(
+                    result.filepath,
+                    result.stat,
+                    result.units or [],
+                    result.blocks or [],
+                )
+                changed_files_count += 1
+                if result.units:
+                    all_units.extend([asdict(u) for u in result.units])
+                if result.blocks:
+                    all_blocks.extend([asdict(b) for b in result.blocks])
+            else:
+                failed_files.append(f"{result.filepath}: {result.error}")
+
+        def process_sequential(with_progress: bool) -> None:
+            if with_progress:
+                with Progress(
+                    SpinnerColumn(),
+                    TextColumn("[progress.description]{task.description}"),
+                    BarColumn(),
+                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                    TimeElapsedColumn(),
+                    console=console,
+                ) as progress:
+                    task = progress.add_task(
+                        f"Analyzing {total_files} files...", total=total_files
+                    )
+                    for fp in files_to_process:
+                        result = _safe_process_file(fp)
+                        if result is not None:
+                            handle_result(result)
+                        progress.advance(task)
+            else:
+                console.print(f"[info]Processing {total_files} changed files...[/info]")
+                for fp in files_to_process:
+                    result = _safe_process_file(fp)
+                    if result is not None:
+                        handle_result(result)
+
+        try:
             with ProcessPoolExecutor(max_workers=args.processes) as executor:
-                futures = [
-                    executor.submit(
-                        process_file,
-                        fp,
-                        str(root_path),
-                        cfg,
-                        args.min_loc,
-                        args.min_stmt,
+                if args.no_progress:
+                    console.print(
+                        f"[info]Processing {total_files} changed files...[/info]"
                     )
-                    for fp in files_to_process
-                ]
-                for future in as_completed(futures):
-                    try:
-                        result = future.result()
-                    except Exception as e:
-                        console.print(f"[warning]Failed to process file: {e}[/warning]")
-                        continue
-
-                    if result:
-                        fp, stat, units, blocks = result
-                        cache.put_file_entry(fp, stat, units, blocks)
-                        changed_files_count += 1
-                        all_units.extend([u.__dict__ for u in units])
-                        all_blocks.extend([b.__dict__ for b in blocks])
-        else:
-            with Progress(
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                BarColumn(),
-                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
-                TimeElapsedColumn(),
-                console=console,
-            ) as progress:
-                task = progress.add_task(
-                    f"Analyzing {total_files} files...", total=total_files
-                )
-                with ProcessPoolExecutor(max_workers=args.processes) as executor:
-                    futures = [
-                        executor.submit(
-                            process_file,
-                            fp,
-                            str(root_path),
-                            cfg,
-                            args.min_loc,
-                            args.min_stmt,
+
+                    # Process in batches to manage memory
+                    for i in range(0, total_files, BATCH_SIZE):
+                        batch = files_to_process[i : i + BATCH_SIZE]
+                        futures = [
+                            executor.submit(
+                                process_file,
+                                fp,
+                                str(root_path),
+                                cfg,
+                                args.min_loc,
+                                args.min_stmt,
+                            )
+                            for fp in batch
+                        ]
+
+                        for future in as_completed(futures):
+                            result, err = _safe_future_result(future)
+                            if result is not None:
+                                handle_result(result)
+                            elif err is not None:
+                                console.print(
+                                    "[warning]Failed to process batch item: "
+                                    f"{err}[/warning]"
+                                )
+
+                else:
+                    with Progress(
+                        SpinnerColumn(),
+                        TextColumn("[progress.description]{task.description}"),
+                        BarColumn(),
+                        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                        TimeElapsedColumn(),
+                        console=console,
+                    ) as progress:
+                        task = progress.add_task(
+                            f"Analyzing {total_files} files...", total=total_files
                         )
-                        for fp in files_to_process
-                    ]
-                    for future in as_completed(futures):
-                        try:
-                            result = future.result()
-                        except Exception:
-                            # Log error but keep progress bar moving?
-                            # console.print might break progress bar layout, better to rely on rich logging or just skip
-                            # console.print(f"[warning]Failed to process file: {e}[/warning]")
-                            continue
-                        finally:
-                            progress.advance(task)
-
-                        if result:
-                            fp, stat, units, blocks = result
-                            cache.put_file_entry(fp, stat, units, blocks)
-                            changed_files_count += 1
-                            all_units.extend([u.__dict__ for u in units])
-                            all_blocks.extend([b.__dict__ for b in blocks])
+
+                        # Process in batches
+                        for i in range(0, total_files, BATCH_SIZE):
+                            batch = files_to_process[i : i + BATCH_SIZE]
+                            futures = [
+                                executor.submit(
+                                    process_file,
+                                    fp,
+                                    str(root_path),
+                                    cfg,
+                                    args.min_loc,
+                                    args.min_stmt,
+                                )
+                                for fp in batch
+                            ]
+
+                            for future in as_completed(futures):
+                                result, err = _safe_future_result(future)
+                                if result is not None:
+                                    handle_result(result)
+                                elif err is not None:
+                                    # Should rarely happen due to try/except
+                                    # in process_file.
+                                    console.print(
+                                        f"[warning]Worker failed: {err}[/warning]"
+                                    )
+                                progress.advance(task)
+        except (OSError, RuntimeError, PermissionError) as e:
+            console.print(
+                "[warning]Parallel processing unavailable, "
+                f"falling back to sequential: {e}[/warning]"
+            )
+            process_sequential(with_progress=not args.no_progress)
+
+    if failed_files:
+        console.print(
+            f"\n[warning]⚠ {len(failed_files)} files failed to process:[/warning]"
+        )
+        for failure in failed_files[:10]:
+            console.print(f"  • {failure}")
+        if len(failed_files) > 10:
+            console.print(f"  ... and {len(failed_files) - 10} more")
 
     # Analysis phase
     with console.status("[bold green]Grouping clones...", spinner="dots"):
         func_groups = build_groups(all_units)
         block_groups = build_block_groups(all_blocks)
-    cache.save()
+    try:
+        cache.save()
+    except CacheError as e:
+        console.print(f"[warning]Failed to save cache: {e}[/warning]")
 
     # Reporting
     func_clones_count = len(func_groups)
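
The load-bearing change in this long hunk is batching: futures are now submitted `BATCH_SIZE` files at a time rather than all at once, so pending futures and their pickled results are bounded by one batch instead of by the whole repository. The pattern in isolation, with generic names (none of this is codeclone API):

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def run_batched(worker, items, batch_size=100, max_workers=None):
        """Submit work in fixed-size waves; at most batch_size futures live at once."""
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for i in range(0, len(items), batch_size):
                futures = [executor.submit(worker, it) for it in items[i : i + batch_size]]
                for future in as_completed(futures):
                    yield future.result()  # re-raises any worker exception

The trade-off is mild: workers can idle at a batch boundary while the last stragglers finish, which is usually acceptable against unbounded memory growth. As with `process_file` above, `worker` must be a picklable top-level function.
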
@@ -300,24 +472,45 @@ def main() -> None:
     # Baseline Logic
     baseline_path = Path(args.baseline).expanduser().resolve()
 
-    # If user didn't specify path, and default logic applies, baseline_path is now ./codeclone_baseline.json
+    # If user didn't specify path and default logic applies, baseline_path
+    # is now ./codeclone_baseline.json
 
     baseline = Baseline(baseline_path)
     baseline_exists = baseline_path.exists()
 
     if baseline_exists:
         baseline.load()
+        if not args.update_baseline and baseline.python_version:
+            current_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            if baseline.python_version != current_version:
+                console.print(
+                    "[warning]Baseline Python version mismatch.[/warning]\n"
+                    f"Baseline was generated with Python {baseline.python_version}.\n"
+                    f"Current interpreter: Python {current_version}."
+                )
+                if args.fail_on_new:
+                    console.print(
+                        "[error]Baseline checks require the same Python version to "
+                        "ensure deterministic results. Please regenerate the baseline "
+                        "using the current interpreter.[/error]"
+                    )
+                    sys.exit(2)
     else:
         if not args.update_baseline:
             console.print(
-                f"[warning]Baseline file not found at: [bold]{baseline_path}[/bold][/warning]\n"
+                "[warning]Baseline file not found at: [bold]"
+                f"{baseline_path}"
+                "[/bold][/warning]\n"
                 "[dim]Comparing against an empty baseline. "
                 "Use --update-baseline to create it.[/dim]"
             )
 
     if args.update_baseline:
         new_baseline = Baseline.from_groups(
-            func_groups, block_groups, path=baseline_path
+            func_groups,
+            block_groups,
+            path=baseline_path,
+            python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
         )
         new_baseline.save()
         console.print(f"[success]✔ Baseline updated:[/success] {baseline_path}")
@@ -365,7 +558,7 @@ def main() -> None:
         out = Path(args.json_out).expanduser().resolve()
         out.parent.mkdir(parents=True, exist_ok=True)
         out.write_text(
-            to_json({"functions": func_groups, "blocks": block_groups}),
+            to_json_report(func_groups, block_groups),
             "utf-8",
         )
         console.print(f"[info]JSON report saved:[/info] {out}")
@@ -392,8 +585,9 @@ def main() -> None:
         sys.exit(3)
 
     if 0 <= args.fail_threshold < (func_clones_count + block_clones_count):
+        total = func_clones_count + block_clones_count
         console.print(
-            f"\n[error]❌ FAILED: Total clones ({func_clones_count + block_clones_count}) "
+            f"\n[error]❌ FAILED: Total clones ({total}) "
             f"exceed threshold ({args.fail_threshold})![/error]"
         )
         sys.exit(2)
codeclone/errors.py ADDED
@@ -0,0 +1,27 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
+
+
+class CodeCloneError(Exception):
+    """Base exception for CodeClone."""
+
+
+class FileProcessingError(CodeCloneError):
+    """Error processing a source file."""
+
+
+class ParseError(FileProcessingError):
+    """AST parsing failed."""
+
+
+class ValidationError(CodeCloneError):
+    """Input validation failed."""
+
+
+class CacheError(CodeCloneError):
+    """Cache operation failed."""