codeclone 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeclone/__init__.py CHANGED
@@ -6,7 +6,7 @@ Copyright (c) 2026 Den Rozhnovskiy
6
6
  Licensed under the MIT License.
7
7
  """
8
8
 
9
- from importlib.metadata import version, PackageNotFoundError
9
+ from importlib.metadata import PackageNotFoundError, version
10
10
 
11
11
  try:
12
12
  __version__ = version("codeclone")
codeclone/baseline.py CHANGED
@@ -9,32 +9,40 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import json
12
+ from collections.abc import Mapping
12
13
  from pathlib import Path
13
- from typing import Set
14
+ from typing import Any
14
15
 
15
16
 
16
17
  class Baseline:
17
- def __init__(self, path: str):
18
+ __slots__ = ("blocks", "functions", "path", "python_version")
19
+
20
+ def __init__(self, path: str | Path):
18
21
  self.path = Path(path)
19
- self.functions: Set[str] = set()
20
- self.blocks: Set[str] = set()
22
+ self.functions: set[str] = set()
23
+ self.blocks: set[str] = set()
24
+ self.python_version: str | None = None
21
25
 
22
26
  def load(self) -> None:
23
27
  if not self.path.exists():
24
28
  return
25
29
 
26
- data = json.loads(self.path.read_text("utf-8"))
27
- self.functions = set(data.get("functions", []))
28
- self.blocks = set(data.get("blocks", []))
30
+ try:
31
+ data = json.loads(self.path.read_text("utf-8"))
32
+ self.functions = set(data.get("functions", []))
33
+ self.blocks = set(data.get("blocks", []))
34
+ python_version = data.get("python_version")
35
+ self.python_version = (
36
+ python_version if isinstance(python_version, str) else None
37
+ )
38
+ except json.JSONDecodeError as e:
39
+ raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
29
40
 
30
41
  def save(self) -> None:
31
42
  self.path.parent.mkdir(parents=True, exist_ok=True)
32
43
  self.path.write_text(
33
44
  json.dumps(
34
- {
35
- "functions": sorted(self.functions),
36
- "blocks": sorted(self.blocks),
37
- },
45
+ _baseline_payload(self.functions, self.blocks, self.python_version),
38
46
  indent=2,
39
47
  ensure_ascii=False,
40
48
  ),
@@ -42,13 +50,35 @@ class Baseline:
42
50
  )
43
51
 
44
52
  @staticmethod
45
- def from_groups(func_groups: dict, block_groups: dict) -> "Baseline":
46
- bl = Baseline("")
53
+ def from_groups(
54
+ func_groups: Mapping[str, object],
55
+ block_groups: Mapping[str, object],
56
+ path: str | Path = "",
57
+ python_version: str | None = None,
58
+ ) -> Baseline:
59
+ bl = Baseline(path)
47
60
  bl.functions = set(func_groups.keys())
48
61
  bl.blocks = set(block_groups.keys())
62
+ bl.python_version = python_version
49
63
  return bl
50
64
 
51
- def diff(self, func_groups: dict, block_groups: dict) -> tuple[set, set]:
65
+ def diff(
66
+ self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
67
+ ) -> tuple[set[str], set[str]]:
52
68
  new_funcs = set(func_groups.keys()) - self.functions
53
69
  new_blocks = set(block_groups.keys()) - self.blocks
54
70
  return new_funcs, new_blocks
71
+
72
+
73
+ def _baseline_payload(
74
+ functions: set[str],
75
+ blocks: set[str],
76
+ python_version: str | None,
77
+ ) -> dict[str, Any]:
78
+ payload: dict[str, Any] = {
79
+ "functions": sorted(functions),
80
+ "blocks": sorted(blocks),
81
+ }
82
+ if python_version:
83
+ payload["python_version"] = python_version
84
+ return payload
codeclone/blockhash.py CHANGED
@@ -11,7 +11,7 @@ from __future__ import annotations
11
11
  import ast
12
12
  import hashlib
13
13
 
14
- from .normalize import NormalizationConfig, AstNormalizer
14
+ from .normalize import AstNormalizer, NormalizationConfig
15
15
 
16
16
 
17
17
  def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
codeclone/blocks.py CHANGED
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
15
15
  from .normalize import NormalizationConfig
16
16
 
17
17
 
18
- @dataclass(frozen=True)
18
+ @dataclass(frozen=True, slots=True)
19
19
  class BlockUnit:
20
20
  block_hash: str
21
21
  filepath: str
@@ -42,7 +42,8 @@ def extract_blocks(
42
42
 
43
43
  blocks: list[BlockUnit] = []
44
44
  last_start: int | None = None
45
- MIN_LINE_DISTANCE = 5 # suppress overlapping windows
45
+ # Allow some overlap (50%), but at least 3 lines apart
46
+ min_line_distance = max(block_size // 2, 3)
46
47
 
47
48
  for i in range(len(stmt_hashes) - block_size + 1):
48
49
  start = getattr(body[i], "lineno", None)
@@ -50,7 +51,7 @@ def extract_blocks(
50
51
  if not start or not end:
51
52
  continue
52
53
 
53
- if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
54
+ if last_start is not None and start - last_start < min_line_distance:
54
55
  continue
55
56
 
56
57
  bh = "|".join(stmt_hashes[i : i + block_size])
codeclone/cache.py CHANGED
@@ -8,41 +8,178 @@ Licensed under the MIT License.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import hashlib
12
+ import hmac
11
13
  import json
12
14
  import os
15
+ import secrets
16
+ from collections.abc import Mapping
13
17
  from dataclasses import asdict
14
18
  from pathlib import Path
15
- from typing import Optional
19
+ from typing import TYPE_CHECKING, Any, TypedDict, cast
20
+
21
+ if TYPE_CHECKING:
22
+ from .blocks import BlockUnit
23
+ from .extractor import Unit
24
+
25
+ from .errors import CacheError
26
+
27
+
28
class FileStat(TypedDict):
    """File-stat signature stored with each cached file.

    This is the return type of ``file_stat_signature`` and the type of the
    ``stat_sig`` argument of ``Cache.put_file_entry``.
    """

    # Filled from ``os.stat(path).st_mtime_ns`` by ``file_stat_signature``.
    mtime_ns: int
    # NOTE(review): presumably the file size in bytes (``st_size``); the
    # assignment is outside the visible part of the file — confirm.
    size: int
31
+
32
+
33
class UnitDict(TypedDict):
    """Dictionary form of an extracted ``Unit`` as kept in the cache.

    ``Cache.put_file_entry`` produces these by calling
    ``dataclasses.asdict`` on each ``Unit``.  The field meanings are defined
    by ``Unit`` in ``.extractor``, which is not shown here.
    """

    qualname: str
    filepath: str
    start_line: int
    end_line: int
    loc: int
    stmt_count: int
    fingerprint: str
    loc_bucket: str
42
+
43
+
44
class BlockDict(TypedDict):
    """Dictionary form of a ``BlockUnit`` as kept in the cache.

    ``Cache.put_file_entry`` produces these by calling
    ``dataclasses.asdict`` on each ``BlockUnit``.  The field meanings are
    defined by ``BlockUnit`` in ``.blocks``.
    """

    block_hash: str
    filepath: str
    qualname: str
    start_line: int
    end_line: int
    size: int
51
+
52
+
53
class CacheEntry(TypedDict):
    """Everything the cache stores for one analysed file.

    ``Cache.put_file_entry`` writes an entry with exactly these keys, and
    ``Cache.get_file_entry`` returns ``None`` for a stored entry that lacks
    any of them.
    """

    # Stat signature passed to ``put_file_entry`` as ``stat_sig``.
    stat: FileStat
    # ``asdict`` form of the ``Unit`` objects passed to ``put_file_entry``.
    units: list[UnitDict]
    # ``asdict`` form of the ``BlockUnit`` objects passed to ``put_file_entry``.
    blocks: list[BlockDict]
57
+
58
+
59
class CacheData(TypedDict):
    """In-memory shape of the cache held in ``Cache.data``.

    The file on disk additionally carries a ``_signature`` key, which
    ``Cache.load`` strips before the data is stored.
    """

    # Compared against ``Cache.CACHE_VERSION`` on load; a mismatch discards
    # the cache.
    version: str
    # Keyed by the ``filepath`` string given to ``Cache.put_file_entry``.
    files: dict[str, CacheEntry]
16
62
 
17
63
 
18
64
class Cache:
    """Signed JSON cache of per-file analysis results.

    The cache file holds ``{"version": ..., "files": {...}}`` plus an
    HMAC-SHA256 ``_signature`` computed with a secret stored in a
    ``.cache_secret`` file next to the cache.  ``load()`` never raises for a
    cache that cannot be read, parsed or verified: the content is discarded
    and the reason is exposed through ``load_warning``.

    Attributes:
        path: Location of the cache file.
        data: Current cache contents, without the signature.
        secret: Key used to sign and verify the cache file.
        load_warning: Why the last ``load()`` discarded the file, or
            ``None`` if nothing was discarded.
    """

    __slots__ = ("data", "load_warning", "path", "secret")
    CACHE_VERSION = "1.0"

    def __init__(self, path: str | Path):
        """Bind the cache to ``path`` and load (or create) the signing secret.

        The cache file itself is not read here; call ``load()`` for that.
        """
        self.path = Path(path)
        self.data: CacheData = self._empty_data()
        self.secret = self._load_secret()
        self.load_warning: str | None = None

    @classmethod
    def _empty_data(cls) -> CacheData:
        """Return a fresh, empty cache payload for the current version."""
        return {"version": cls.CACHE_VERSION, "files": {}}

    def _discard(self, warning: str) -> None:
        """Reset the cache to empty and record why the file was ignored."""
        self.load_warning = warning
        self.data = self._empty_data()

    def _load_secret(self) -> bytes:
        """Load or create cache signing secret.

        The secret lives in the same directory as the cache file, named
        ``.cache_secret`` (for ``~/.cache/codeclone/cache.json`` that is
        ``~/.cache/codeclone/.cache_secret``).

        Persisting a new secret is best effort.  If the secret file cannot be
        read or written, an in-memory secret is returned; a cache signed with
        it will simply fail verification on a later run and be ignored.
        """
        secret_path = self.path.parent / ".cache_secret"
        if secret_path.exists():
            try:
                return secret_path.read_bytes()
            except OSError:
                # Present but unreadable (permissions, a directory, ...):
                # do not crash the constructor and do not overwrite it.
                return secrets.token_bytes(32)

        secret = secrets.token_bytes(32)
        try:
            self.path.parent.mkdir(parents=True, exist_ok=True)
            # Create the file with owner-only permissions *before* the key is
            # written, so it is never briefly readable by other users.
            secret_path.touch(mode=0o600)
            secret_path.write_bytes(secret)
            # Set restrictive permissions on secret file (Unix only); this
            # also covers a file that appeared with looser permissions
            # between the existence check and the touch.
            if os.name == "posix":
                secret_path.chmod(0o600)
        except OSError:
            # Deliberate best effort: carry on with the in-memory secret.
            pass
        return secret

    def _sign_data(self, data: Mapping[str, Any]) -> str:
        """Create HMAC signature of cache data.

        Returns the hex digest of HMAC-SHA256 over the JSON form of ``data``.
        """
        # Sort keys for deterministic JSON serialization
        data_str = json.dumps(data, sort_keys=True)
        return hmac.new(self.secret, data_str.encode(), hashlib.sha256).hexdigest()

    def load(self) -> None:
        """Read and verify the cache file, replacing ``self.data`` on success.

        A missing file leaves the cache untouched.  Any other problem
        (unreadable file, invalid JSON, wrong top-level type, bad signature,
        version mismatch, malformed ``files``) resets the cache to empty and
        stores an explanation in ``load_warning``; no exception is raised for
        those cases.
        """
        if not self.path.exists():
            return

        try:
            raw = json.loads(self.path.read_text("utf-8"))

            # A cache is always a JSON object; any other top-level value
            # cannot carry a signature, so reject it before calling ``.get``.
            if not isinstance(raw, dict):
                self._discard("Cache format invalid; ignoring cache.")
                return

            stored_sig = raw.get("_signature")

            # Extract data without signature for verification
            data = {k: v for k, v in raw.items() if k != "_signature"}

            # Verify signature in constant time.  Both sides are encoded to
            # bytes because ``compare_digest`` rejects non-ASCII ``str``.
            expected_sig = self._sign_data(data)
            if not isinstance(stored_sig, str) or not hmac.compare_digest(
                stored_sig.encode("utf-8"), expected_sig.encode("utf-8")
            ):
                self._discard("Cache signature mismatch; ignoring cache.")
                return

            if data.get("version") != self.CACHE_VERSION:
                self._discard(
                    "Cache version mismatch "
                    f"(found {data.get('version')}); ignoring cache."
                )
                return

            # Basic structure check
            if not isinstance(data.get("files"), dict):
                self._discard("Cache format invalid; ignoring cache.")
                return

            # The target type is quoted so the cast needs no runtime lookup.
            self.data = cast("CacheData", data)
            self.load_warning = None

        except OSError as e:
            # E.g. permission denied, or the file vanished after ``exists()``.
            self._discard(f"Cache unreadable ({e}); ignoring cache.")
        except (json.JSONDecodeError, ValueError):
            self._discard("Cache corrupted; ignoring cache.")

    def save(self) -> None:
        """Write the cache and its signature to ``self.path``.

        Raises:
            CacheError: if the directory or file cannot be written.
        """
        try:
            self.path.parent.mkdir(parents=True, exist_ok=True)

            # Add signature
            data_with_sig = {**self.data, "_signature": self._sign_data(self.data)}

            self.path.write_text(
                json.dumps(data_with_sig, ensure_ascii=False, indent=2),
                "utf-8",
            )
        except OSError as e:
            raise CacheError(f"Failed to save cache: {e}") from e

    def get_file_entry(self, filepath: str) -> CacheEntry | None:
        """Return the cached entry for ``filepath``, or ``None``.

        ``None`` is returned when there is no entry, when the stored value is
        not a dict, or when it lacks any of ``stat``, ``units``, ``blocks``.
        """
        entry = self.data["files"].get(filepath)

        # ``None`` (missing) is not a dict either, so one check covers both.
        if not isinstance(entry, dict):
            return None

        required = {"stat", "units", "blocks"}
        if not required.issubset(entry.keys()):
            return None

        return entry

    def put_file_entry(
        self,
        filepath: str,
        stat_sig: FileStat,
        units: list[Unit],
        blocks: list[BlockUnit],
    ) -> None:
        """Store (or replace) the entry for ``filepath`` in memory.

        ``units`` and ``blocks`` are dataclass instances and are converted
        with ``dataclasses.asdict``.  Nothing is written to disk until
        ``save()`` is called.
        """
        self.data["files"][filepath] = {
            "stat": stat_sig,
            "units": cast("list[UnitDict]", cast(object, [asdict(u) for u in units])),
            "blocks": cast(
                "list[BlockDict]", cast(object, [asdict(b) for b in blocks])
            ),
        }
43
180
 
44
181
 
45
- def file_stat_signature(path: str) -> dict:
182
+ def file_stat_signature(path: str) -> FileStat:
46
183
  st = os.stat(path)
47
184
  return {
48
185
  "mtime_ns": st.st_mtime_ns,
codeclone/cfg.py CHANGED
@@ -9,48 +9,21 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
- from dataclasses import dataclass, field
13
- from typing import Iterable
12
+ from collections.abc import Iterable
13
+ from typing import Protocol, cast
14
14
 
15
+ from .cfg_model import CFG, Block
15
16
 
16
- # =========================
17
- # Core CFG structures
18
- # =========================
19
-
20
-
21
- @dataclass(eq=False)
22
- class Block:
23
- id: int
24
- statements: list[ast.stmt] = field(default_factory=list)
25
- successors: set["Block"] = field(default_factory=set)
26
- is_terminated: bool = False
27
-
28
- def add_successor(self, block: Block) -> None:
29
- self.successors.add(block)
30
-
31
- def __hash__(self) -> int:
32
- return hash(self.id)
17
+ __all__ = ["CFG", "CFGBuilder"]
33
18
 
34
- def __eq__(self, other: object) -> bool:
35
- return isinstance(other, Block) and self.id == other.id
19
+ TryStar = getattr(ast, "TryStar", ast.Try)
36
20
 
37
21
 
38
- @dataclass
39
- class CFG:
40
- qualname: str
41
- blocks: list[Block] = field(default_factory=list)
42
-
43
- entry: Block = field(init=False)
44
- exit: Block = field(init=False)
45
-
46
- def __post_init__(self) -> None:
47
- self.entry = self.create_block()
48
- self.exit = self.create_block()
49
-
50
- def create_block(self) -> Block:
51
- block = Block(id=len(self.blocks))
52
- self.blocks.append(block)
53
- return block
22
class _TryLike(Protocol):
    """Structural type for the try-statement nodes handled by ``_visit_try``.

    ``CFGBuilder._visit`` casts both ``ast.Try`` nodes and ``TryStar`` nodes
    to this protocol, so one method can process either.  Only the four
    attributes that ``_visit_try`` reads are declared.
    """

    body: list[ast.stmt]
    handlers: list[ast.ExceptHandler]
    orelse: list[ast.stmt]
    finalbody: list[ast.stmt]
54
27
 
55
28
 
56
29
  # =========================
@@ -59,6 +32,8 @@ class CFG:
59
32
 
60
33
 
61
34
  class CFGBuilder:
35
+ __slots__ = ("cfg", "current")
36
+
62
37
  def __init__(self) -> None:
63
38
  self.cfg: CFG
64
39
  self.current: Block
@@ -107,6 +82,20 @@ class CFGBuilder:
107
82
  case ast.For():
108
83
  self._visit_for(stmt)
109
84
 
85
+ case ast.AsyncFor():
86
+ self._visit_for(stmt) # Structure is identical to For
87
+
88
+ case ast.Try():
89
+ self._visit_try(cast(_TryLike, stmt))
90
+ case _ if TryStar is not None and isinstance(stmt, TryStar):
91
+ self._visit_try(cast(_TryLike, stmt))
92
+
93
+ case ast.With() | ast.AsyncWith():
94
+ self._visit_with(stmt)
95
+
96
+ case ast.Match():
97
+ self._visit_match(stmt)
98
+
110
99
  case _:
111
100
  self.current.statements.append(stmt)
112
101
 
@@ -153,7 +142,7 @@ class CFGBuilder:
153
142
 
154
143
  self.current = after_block
155
144
 
156
- def _visit_for(self, stmt: ast.For) -> None:
145
+ def _visit_for(self, stmt: ast.For | ast.AsyncFor) -> None:
157
146
  iter_block = self.cfg.create_block()
158
147
  body_block = self.cfg.create_block()
159
148
  after_block = self.cfg.create_block()
@@ -171,3 +160,104 @@ class CFGBuilder:
171
160
  self.current.add_successor(iter_block)
172
161
 
173
162
  self.current = after_block
163
+
164
def _visit_with(self, stmt: ast.With | ast.AsyncWith) -> None:
    """Model a ``with`` / ``async with`` statement as straight-line flow.

    The context-manager expressions are recorded in the block that is
    current on entry, the body gets a block of its own, and building
    continues in a fresh block that follows the body.
    """
    # Allocation order (body first, then the follow-up block) determines the
    # block ids, so it is kept as is.
    inner = self.cfg.create_block()
    follow = self.cfg.create_block()

    # Each context expression is wrapped in ``ast.Expr`` so it can sit in the
    # block's statement list like any other statement.
    header = self.current
    header.statements.extend(
        ast.Expr(value=item.context_expr) for item in stmt.items
    )
    header.add_successor(inner)

    self.current = inner
    self._visit_statements(stmt.body)

    # Visiting the body may have moved ``self.current`` to another block.
    tail = self.current
    if not tail.is_terminated:
        tail.add_successor(follow)

    self.current = follow
190
+
191
def _visit_try(self, stmt: _TryLike) -> None:
    """Add the blocks and edges for a ``try`` statement to the CFG.

    Blocks are created in a fixed order: try entry, one per handler, an
    optional ``else`` block, then a final block.  The final block is created
    even when there is no ``finally`` clause; it is where the normal,
    handler and ``else`` paths meet and where building continues afterwards.
    """
    try_entry = self.cfg.create_block()
    self.current.add_successor(try_entry)
    self.current = try_entry

    handlers_blocks = [self.cfg.create_block() for _ in stmt.handlers]
    else_block = self.cfg.create_block() if stmt.orelse else None
    final_block = self.cfg.create_block()

    # Process each statement in try body
    # Link each to exception handlers
    for stmt_node in stmt.body:
        # Nothing after a terminated block is visited.
        if self.current.is_terminated:
            break

        # Current statement could raise exception
        # The edges are added from whichever block is current *before* the
        # statement is visited.
        for h_block in handlers_blocks:
            self.current.add_successor(h_block)

        self._visit(stmt_node)

    # Normal exit from try
    if not self.current.is_terminated:
        if else_block:
            self.current.add_successor(else_block)
        else:
            self.current.add_successor(final_block)

    # Process handlers
    for handler, h_block in zip(stmt.handlers, handlers_blocks, strict=True):
        self.current = h_block
        # A bare ``except:`` has no type expression, so nothing is recorded.
        if handler.type:
            self.current.statements.append(ast.Expr(value=handler.type))

        self._visit_statements(handler.body)
        if not self.current.is_terminated:
            self.current.add_successor(final_block)

    # Process else
    if else_block:
        self.current = else_block
        self._visit_statements(stmt.orelse)
        if not self.current.is_terminated:
            self.current.add_successor(final_block)

    # Process finally
    # ``final_block`` becomes current whether or not a ``finally`` body exists.
    self.current = final_block
    if stmt.finalbody:
        self._visit_statements(stmt.finalbody)
240
+
241
def _visit_match(self, stmt: ast.Match) -> None:
    """Add the blocks and edges for a ``match`` statement to the CFG.

    The subject expression is recorded in the block that is current on
    entry.  Every case gets its own successor block, which starts with a
    marker statement describing the case pattern.  Cases that do not
    terminate flow into one shared follow-up block, where building continues.
    """
    dispatch = self.current
    dispatch.statements.append(ast.Expr(value=stmt.subject))

    # Created before any case block so the block ids keep their order.
    join_block = self.cfg.create_block()

    for arm in stmt.cases:
        arm_block = self.cfg.create_block()
        dispatch.add_successor(arm_block)

        # The pattern is recorded as a string constant wrapped in ``ast.Expr``.
        pattern_text = ast.dump(arm.pattern, annotate_fields=False)
        marker = ast.Constant(value=f"PATTERN:{pattern_text}")
        arm_block.statements.append(ast.Expr(value=marker))

        self.current = arm_block
        self._visit_statements(arm.body)

        # Visiting the body may have moved ``self.current`` to another block.
        arm_tail = self.current
        if not arm_tail.is_terminated:
            arm_tail.add_successor(join_block)

    self.current = join_block
codeclone/cfg_model.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import ast
12
+ from dataclasses import dataclass, field
13
+
14
+
15
+ @dataclass(eq=False, slots=True)
16
+ class Block:
17
+ id: int
18
+ statements: list[ast.stmt] = field(default_factory=list)
19
+ successors: set[Block] = field(default_factory=set)
20
+ is_terminated: bool = False
21
+
22
+ def add_successor(self, block: Block) -> None:
23
+ self.successors.add(block)
24
+
25
+ def __hash__(self) -> int:
26
+ return hash(self.id)
27
+
28
+ def __eq__(self, other: object) -> bool:
29
+ return isinstance(other, Block) and self.id == other.id
30
+
31
+
32
@dataclass(slots=True)
class CFG:
    """Control-flow graph of one function, identified by ``qualname``.

    Construction immediately creates two blocks, ``entry`` and then
    ``exit``.  Every block made through ``create_block`` is appended to
    ``blocks`` and takes its position in that list as its ``id``.
    """

    qualname: str
    blocks: list[Block] = field(default_factory=list)

    entry: Block = field(init=False)
    exit: Block = field(init=False)

    def __post_init__(self) -> None:
        # The right-hand side is evaluated left to right, so ``entry`` is
        # still created before ``exit``.
        self.entry, self.exit = self.create_block(), self.create_block()

    def create_block(self) -> Block:
        """Create a block, register it in ``blocks`` and return it."""
        new_block = Block(id=len(self.blocks))
        self.blocks.append(new_block)
        return new_block
+ return block