codeclone 1.4.1__tar.gz → 1.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {codeclone-1.4.1 → codeclone-1.4.2}/PKG-INFO +1 -1
  2. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/blocks.py +19 -2
  3. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/cache.py +5 -5
  4. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/cli.py +39 -62
  5. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/extractor.py +38 -23
  6. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/scanner.py +12 -8
  7. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone.egg-info/PKG-INFO +1 -1
  8. {codeclone-1.4.1 → codeclone-1.4.2}/pyproject.toml +1 -1
  9. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cli_unit.py +7 -3
  10. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_extractor.py +105 -0
  11. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_security.py +22 -6
  12. {codeclone-1.4.1 → codeclone-1.4.2}/LICENSE +0 -0
  13. {codeclone-1.4.1 → codeclone-1.4.2}/README.md +0 -0
  14. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/__init__.py +0 -0
  15. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_cli_args.py +0 -0
  16. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_cli_meta.py +0 -0
  17. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_cli_paths.py +0 -0
  18. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_cli_summary.py +0 -0
  19. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_html_escape.py +0 -0
  20. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_html_snippets.py +0 -0
  21. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_blocks.py +0 -0
  22. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_explain.py +0 -0
  23. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_explain_contract.py +0 -0
  24. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_grouping.py +0 -0
  25. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_segments.py +0 -0
  26. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_serialize.py +0 -0
  27. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/_report_types.py +0 -0
  28. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/baseline.py +0 -0
  29. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/blockhash.py +0 -0
  30. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/cfg.py +0 -0
  31. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/cfg_model.py +0 -0
  32. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/contracts.py +0 -0
  33. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/errors.py +0 -0
  34. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/fingerprint.py +0 -0
  35. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/html_report.py +0 -0
  36. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/meta_markers.py +0 -0
  37. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/normalize.py +0 -0
  38. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/py.typed +0 -0
  39. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/report.py +0 -0
  40. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/templates.py +0 -0
  41. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone/ui_messages.py +0 -0
  42. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone.egg-info/SOURCES.txt +0 -0
  43. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone.egg-info/dependency_links.txt +0 -0
  44. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone.egg-info/entry_points.txt +0 -0
  45. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone.egg-info/requires.txt +0 -0
  46. {codeclone-1.4.1 → codeclone-1.4.2}/codeclone.egg-info/top_level.txt +0 -0
  47. {codeclone-1.4.1 → codeclone-1.4.2}/setup.cfg +0 -0
  48. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_baseline.py +0 -0
  49. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_blockhash.py +0 -0
  50. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_blocks.py +0 -0
  51. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cache.py +0 -0
  52. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cfg.py +0 -0
  53. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cfg_model.py +0 -0
  54. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cli_inprocess.py +0 -0
  55. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cli_main_guard.py +0 -0
  56. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cli_main_guard_runpy.py +0 -0
  57. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_cli_smoke.py +0 -0
  58. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_detector_golden.py +0 -0
  59. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_fingerprint.py +0 -0
  60. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_html_report.py +0 -0
  61. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_init.py +0 -0
  62. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_normalize.py +0 -0
  63. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_report.py +0 -0
  64. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_report_explain.py +0 -0
  65. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_scanner_extra.py +0 -0
  66. {codeclone-1.4.1 → codeclone-1.4.2}/tests/test_segments.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.4.1
3
+ Version: 1.4.2
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -9,6 +9,7 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
+ from collections.abc import Sequence
12
13
  from dataclasses import dataclass
13
14
 
14
15
  from .blockhash import stmt_hash
@@ -45,12 +46,20 @@ def extract_blocks(
45
46
  cfg: NormalizationConfig,
46
47
  block_size: int,
47
48
  max_blocks: int,
49
+ precomputed_hashes: Sequence[str] | None = None,
48
50
  ) -> list[BlockUnit]:
49
51
  body = getattr(func_node, "body", None)
50
52
  if not isinstance(body, list) or len(body) < block_size:
51
53
  return []
52
54
 
53
- stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
55
+ if precomputed_hashes is not None:
56
+ assert len(precomputed_hashes) == len(body), (
57
+ f"precomputed_hashes length {len(precomputed_hashes)} "
58
+ f"!= body length {len(body)}"
59
+ )
60
+ stmt_hashes = precomputed_hashes
61
+ else:
62
+ stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
54
63
 
55
64
  blocks: list[BlockUnit] = []
56
65
  last_start: int | None = None
@@ -94,12 +103,20 @@ def extract_segments(
94
103
  cfg: NormalizationConfig,
95
104
  window_size: int,
96
105
  max_segments: int,
106
+ precomputed_hashes: Sequence[str] | None = None,
97
107
  ) -> list[SegmentUnit]:
98
108
  body = getattr(func_node, "body", None)
99
109
  if not isinstance(body, list) or len(body) < window_size:
100
110
  return []
101
111
 
102
- stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
112
+ if precomputed_hashes is not None:
113
+ assert len(precomputed_hashes) == len(body), (
114
+ f"precomputed_hashes length {len(precomputed_hashes)} "
115
+ f"!= body length {len(body)}"
116
+ )
117
+ stmt_hashes = precomputed_hashes
118
+ else:
119
+ stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
103
120
 
104
121
  segments: list[SegmentUnit] = []
105
122
 
@@ -344,14 +344,14 @@ class Cache:
344
344
  try:
345
345
  self.path.parent.mkdir(parents=True, exist_ok=True)
346
346
  wire_files: dict[str, object] = {}
347
- for runtime_path in sorted(
348
- self.data["files"], key=self._wire_filepath_from_runtime
349
- ):
347
+ wire_map = {
348
+ rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"]
349
+ }
350
+ for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
350
351
  entry = self.get_file_entry(runtime_path)
351
352
  if entry is None:
352
353
  continue
353
- wire_path = self._wire_filepath_from_runtime(runtime_path)
354
- wire_files[wire_path] = _encode_wire_file_entry(entry)
354
+ wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry)
355
355
 
356
356
  payload: dict[str, object] = {
357
357
  "py": current_python_tag(),
@@ -122,14 +122,14 @@ def process_file(
122
122
  """
123
123
 
124
124
  try:
125
- # Check file size
125
+ # Single os.stat() for both size check and cache signature
126
126
  try:
127
- st_size = os.path.getsize(filepath)
128
- if st_size > MAX_FILE_SIZE:
127
+ st = os.stat(filepath)
128
+ if st.st_size > MAX_FILE_SIZE:
129
129
  return ProcessingResult(
130
130
  filepath=filepath,
131
131
  success=False,
132
- error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
132
+ error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})",
133
133
  error_kind="file_too_large",
134
134
  )
135
135
  except OSError as e:
@@ -140,6 +140,8 @@ def process_file(
140
140
  error_kind="stat_error",
141
141
  )
142
142
 
143
+ stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size}
144
+
143
145
  try:
144
146
  source = Path(filepath).read_text("utf-8")
145
147
  except UnicodeDecodeError as e:
@@ -157,7 +159,6 @@ def process_file(
157
159
  error_kind="source_read_error",
158
160
  )
159
161
 
160
- stat = file_stat_signature(filepath)
161
162
  module_name = module_name_from_path(root, filepath)
162
163
 
163
164
  units, blocks, segments = extract_units_from_source(
@@ -355,68 +356,44 @@ def _main_impl() -> None:
355
356
  return None, str(e)
356
357
 
357
358
  # Discovery phase
358
- try:
359
- if args.quiet:
360
- for fp in iter_py_files(str(root_path)):
361
- files_found += 1
362
- stat, cached, warn = _get_cached_entry(fp)
363
- if warn:
364
- console.print(warn)
365
- files_skipped += 1
366
- continue
367
- if cached and cached.get("stat") == stat:
368
- cache_hits += 1
369
- all_units.extend(
370
- cast(
371
- list[GroupItem],
372
- cast(object, cached.get("units", [])),
373
- )
359
+ def _discover_files() -> None:
360
+ nonlocal files_found, cache_hits, files_skipped
361
+ for fp in iter_py_files(str(root_path)):
362
+ files_found += 1
363
+ stat, cached, warn = _get_cached_entry(fp)
364
+ if warn:
365
+ console.print(warn)
366
+ files_skipped += 1
367
+ continue
368
+ if cached and cached.get("stat") == stat:
369
+ cache_hits += 1
370
+ all_units.extend(
371
+ cast(
372
+ list[GroupItem],
373
+ cast(object, cached.get("units", [])),
374
374
  )
375
- all_blocks.extend(
376
- cast(
377
- list[GroupItem],
378
- cast(object, cached.get("blocks", [])),
379
- )
375
+ )
376
+ all_blocks.extend(
377
+ cast(
378
+ list[GroupItem],
379
+ cast(object, cached.get("blocks", [])),
380
380
  )
381
- all_segments.extend(
382
- cast(
383
- list[GroupItem],
384
- cast(object, cached.get("segments", [])),
385
- )
381
+ )
382
+ all_segments.extend(
383
+ cast(
384
+ list[GroupItem],
385
+ cast(object, cached.get("segments", [])),
386
386
  )
387
- else:
388
- files_to_process.append(fp)
387
+ )
388
+ else:
389
+ files_to_process.append(fp)
390
+
391
+ try:
392
+ if args.quiet:
393
+ _discover_files()
389
394
  else:
390
395
  with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
391
- for fp in iter_py_files(str(root_path)):
392
- files_found += 1
393
- stat, cached, warn = _get_cached_entry(fp)
394
- if warn:
395
- console.print(warn)
396
- files_skipped += 1
397
- continue
398
- if cached and cached.get("stat") == stat:
399
- cache_hits += 1
400
- all_units.extend(
401
- cast(
402
- list[GroupItem],
403
- cast(object, cached.get("units", [])),
404
- )
405
- )
406
- all_blocks.extend(
407
- cast(
408
- list[GroupItem],
409
- cast(object, cached.get("blocks", [])),
410
- )
411
- )
412
- all_segments.extend(
413
- cast(
414
- list[GroupItem],
415
- cast(object, cached.get("segments", [])),
416
- )
417
- )
418
- else:
419
- files_to_process.append(fp)
396
+ _discover_files()
420
397
  except OSError as e:
421
398
  console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e)))
422
399
  sys.exit(ExitCode.CONTRACT_ERROR)
@@ -16,6 +16,7 @@ from collections.abc import Iterator
16
16
  from contextlib import contextmanager
17
17
  from dataclasses import dataclass
18
18
 
19
+ from .blockhash import stmt_hash
19
20
  from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments
20
21
  from .cfg import CFGBuilder
21
22
  from .errors import ParseError
@@ -250,28 +251,42 @@ def extract_units_from_source(
250
251
  )
251
252
  )
252
253
 
253
- # Block-level units (exclude __init__)
254
- if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
255
- blocks = extract_blocks(
256
- node,
257
- filepath=filepath,
258
- qualname=qualname,
259
- cfg=cfg,
260
- block_size=4,
261
- max_blocks=15,
262
- )
263
- block_units.extend(blocks)
264
-
265
- # Segment-level units (windows within functions, for internal clones)
266
- if loc >= 30 and stmt_count >= 12:
267
- segments = extract_segments(
268
- node,
269
- filepath=filepath,
270
- qualname=qualname,
271
- cfg=cfg,
272
- window_size=6,
273
- max_segments=60,
274
- )
275
- segment_units.extend(segments)
254
+ # Block-level and segment-level units share statement hashes
255
+ needs_blocks = (
256
+ not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10
257
+ )
258
+ needs_segments = loc >= 30 and stmt_count >= 12
259
+
260
+ if needs_blocks or needs_segments:
261
+ body = getattr(node, "body", None)
262
+ hashes: list[str] | None = None
263
+ if isinstance(body, list):
264
+ hashes = [stmt_hash(stmt, cfg) for stmt in body]
265
+
266
+ if needs_blocks:
267
+ block_units.extend(
268
+ extract_blocks(
269
+ node,
270
+ filepath=filepath,
271
+ qualname=qualname,
272
+ cfg=cfg,
273
+ block_size=4,
274
+ max_blocks=15,
275
+ precomputed_hashes=hashes,
276
+ )
277
+ )
278
+
279
+ if needs_segments:
280
+ segment_units.extend(
281
+ extract_segments(
282
+ node,
283
+ filepath=filepath,
284
+ qualname=qualname,
285
+ cfg=cfg,
286
+ window_size=6,
287
+ max_segments=60,
288
+ precomputed_hashes=hashes,
289
+ )
290
+ )
276
291
 
277
292
  return units, block_units, segment_units
@@ -77,8 +77,9 @@ def iter_py_files(
77
77
  if root_str.startswith(sensitive + "/"):
78
78
  raise ValidationError(f"Cannot scan under sensitive directory: {root}")
79
79
 
80
- file_count = 0
81
- for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)):
80
+ # Collect and filter first, then sort — avoids sorting excluded paths
81
+ candidates: list[Path] = []
82
+ for p in rootp.rglob("*.py"):
82
83
  # Verify path is actually under root (prevent symlink attacks)
83
84
  try:
84
85
  p.resolve().relative_to(rootp)
@@ -90,12 +91,15 @@ def iter_py_files(
90
91
  if any(ex in parts for ex in excludes):
91
92
  continue
92
93
 
93
- file_count += 1
94
- if file_count > max_files:
95
- raise ValidationError(
96
- f"File count exceeds limit of {max_files}. "
97
- "Use more specific root or increase limit."
98
- )
94
+ candidates.append(p)
95
+
96
+ if len(candidates) > max_files:
97
+ raise ValidationError(
98
+ f"File count exceeds limit of {max_files}. "
99
+ "Use more specific root or increase limit."
100
+ )
101
+
102
+ for p in sorted(candidates, key=lambda path: str(path)):
99
103
  yield str(p)
100
104
 
101
105
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.4.1
3
+ Version: 1.4.2
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codeclone"
7
- version = "1.4.1"
7
+ version = "1.4.2"
8
8
  description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  license = { text = "MIT" }
@@ -28,10 +28,14 @@ def test_process_file_stat_error(
28
28
  src = tmp_path / "a.py"
29
29
  src.write_text("def f():\n return 1\n", "utf-8")
30
30
 
31
- def _boom(_path: str) -> int:
32
- raise OSError("nope")
31
+ _original_stat = os.stat
33
32
 
34
- monkeypatch.setattr(os.path, "getsize", _boom)
33
+ def _boom(path: str, *args: object, **kwargs: object) -> os.stat_result:
34
+ if str(path) == str(src):
35
+ raise OSError("nope")
36
+ return _original_stat(path, *args, **kwargs) # type: ignore[arg-type]
37
+
38
+ monkeypatch.setattr(os, "stat", _boom)
35
39
  result = process_file(str(src), str(tmp_path), NormalizationConfig(), 1, 1)
36
40
  assert result.success is False
37
41
  assert result.error is not None
@@ -374,6 +374,111 @@ def f():
374
374
  assert segments == []
375
375
 
376
376
 
377
+ def test_extract_generates_segments_without_blocks_when_only_segment_gate_met() -> None:
378
+ lines = ["def f():"]
379
+ for i in range(12):
380
+ lines.append(f" x{i} = {i}")
381
+ lines.append("")
382
+ lines.append("")
383
+ src = "\n".join(lines)
384
+
385
+ units, blocks, segments = extract_units_from_source(
386
+ source=src,
387
+ filepath="x.py",
388
+ module_name="mod",
389
+ cfg=NormalizationConfig(),
390
+ min_loc=1,
391
+ min_stmt=1,
392
+ )
393
+
394
+ assert units
395
+ assert blocks == []
396
+ assert segments
397
+
398
+
399
+ def test_extract_generates_blocks_without_segments_when_only_block_gate_met() -> None:
400
+ lines = ["def f():"]
401
+ for i in range(10):
402
+ lines.append(f" x{i} = {i}")
403
+ lines.append("")
404
+ lines.append("")
405
+ lines.append("")
406
+ lines.append("")
407
+ src = "\n".join(lines)
408
+
409
+ units, blocks, segments = extract_units_from_source(
410
+ source=src,
411
+ filepath="x.py",
412
+ module_name="mod",
413
+ cfg=NormalizationConfig(),
414
+ min_loc=1,
415
+ min_stmt=1,
416
+ )
417
+
418
+ assert units
419
+ assert blocks
420
+ assert segments == []
421
+
422
+
423
+ def test_extract_handles_non_list_function_body_for_hash_reuse(
424
+ monkeypatch: pytest.MonkeyPatch,
425
+ ) -> None:
426
+ lines = ["def f():"]
427
+ for i in range(12):
428
+ lines.append(f" x{i} = {i}")
429
+ lines.append("")
430
+ lines.append("")
431
+ tree = ast.parse("\n".join(lines))
432
+ func = tree.body[0]
433
+ assert isinstance(func, ast.FunctionDef)
434
+ func.body = tuple(func.body) # type: ignore[assignment]
435
+
436
+ captured_hashes: dict[str, object] = {}
437
+
438
+ def _fake_parse(_source: str, _timeout_s: int) -> ast.AST:
439
+ return tree
440
+
441
+ def _fake_fingerprint(
442
+ _node: ast.FunctionDef | ast.AsyncFunctionDef,
443
+ _cfg: NormalizationConfig,
444
+ _qualname: str,
445
+ ) -> str:
446
+ return "f" * 40
447
+
448
+ def _fake_extract_segments(
449
+ _node: ast.FunctionDef | ast.AsyncFunctionDef,
450
+ filepath: str,
451
+ qualname: str,
452
+ cfg: NormalizationConfig,
453
+ window_size: int = 6,
454
+ max_segments: int = 60,
455
+ *,
456
+ precomputed_hashes: list[str] | None = None,
457
+ ) -> list[object]:
458
+ del filepath, qualname, cfg, window_size, max_segments
459
+ captured_hashes["value"] = precomputed_hashes
460
+ return []
461
+
462
+ monkeypatch.setattr(extractor, "_parse_with_limits", _fake_parse)
463
+ monkeypatch.setattr(extractor, "_stmt_count", lambda _node: 12)
464
+ monkeypatch.setattr(extractor, "get_cfg_fingerprint", _fake_fingerprint)
465
+ monkeypatch.setattr(extractor, "extract_segments", _fake_extract_segments)
466
+
467
+ units, blocks, segments = extract_units_from_source(
468
+ source="def f():\n pass\n",
469
+ filepath="x.py",
470
+ module_name="mod",
471
+ cfg=NormalizationConfig(),
472
+ min_loc=1,
473
+ min_stmt=1,
474
+ )
475
+
476
+ assert len(units) == 1
477
+ assert blocks == []
478
+ assert segments == []
479
+ assert captured_hashes["value"] is None
480
+
481
+
377
482
  def test_extract_skips_invalid_positions(monkeypatch: pytest.MonkeyPatch) -> None:
378
483
  tree = ast.parse(
379
484
  """
@@ -30,18 +30,34 @@ def test_process_file_size_limit() -> None:
30
30
 
31
31
  try:
32
32
  cfg = NormalizationConfig()
33
+ real_stat = os.stat(tmp_path)
33
34
 
34
- # Mock os.path.getsize to return huge size
35
- with patch("os.path.getsize", return_value=MAX_FILE_SIZE + 1):
35
+ # Mock os.stat to return huge st_size
36
+ def _huge_stat(path: str, *args: object, **kwargs: object) -> os.stat_result:
37
+ return os.stat_result(
38
+ (
39
+ real_stat.st_mode,
40
+ real_stat.st_ino,
41
+ real_stat.st_dev,
42
+ real_stat.st_nlink,
43
+ real_stat.st_uid,
44
+ real_stat.st_gid,
45
+ MAX_FILE_SIZE + 1, # st_size
46
+ int(real_stat.st_atime),
47
+ int(real_stat.st_mtime),
48
+ int(real_stat.st_ctime),
49
+ )
50
+ )
51
+
52
+ with patch("os.stat", side_effect=_huge_stat):
36
53
  result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0)
37
54
  assert result.success is False
38
55
  assert result.error is not None
39
56
  assert "File too large" in result.error
40
57
 
41
- # Normal size should pass
42
- with patch("os.path.getsize", return_value=10):
43
- result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0)
44
- assert result.success is True
58
+ # Normal size should pass (no mock — real stat)
59
+ result = process_file(tmp_path, os.path.dirname(tmp_path), cfg, 0, 0)
60
+ assert result.success is True
45
61
 
46
62
  finally:
47
63
  os.remove(tmp_path)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes