codeclone 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeclone/__init__.py CHANGED
@@ -6,7 +6,7 @@ Copyright (c) 2026 Den Rozhnovskiy
6
6
  Licensed under the MIT License.
7
7
  """
8
8
 
9
- from importlib.metadata import version, PackageNotFoundError
9
+ from importlib.metadata import PackageNotFoundError, version
10
10
 
11
11
  try:
12
12
  __version__ = version("codeclone")
codeclone/baseline.py CHANGED
@@ -9,14 +9,19 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import json
12
+ from collections.abc import Mapping
12
13
  from pathlib import Path
14
+ from typing import Any
13
15
 
14
16
 
15
17
  class Baseline:
18
+ __slots__ = ("blocks", "functions", "path", "python_version")
19
+
16
20
  def __init__(self, path: str | Path):
17
21
  self.path = Path(path)
18
22
  self.functions: set[str] = set()
19
23
  self.blocks: set[str] = set()
24
+ self.python_version: str | None = None
20
25
 
21
26
  def load(self) -> None:
22
27
  if not self.path.exists():
@@ -26,6 +31,10 @@ class Baseline:
26
31
  data = json.loads(self.path.read_text("utf-8"))
27
32
  self.functions = set(data.get("functions", []))
28
33
  self.blocks = set(data.get("blocks", []))
34
+ python_version = data.get("python_version")
35
+ self.python_version = (
36
+ python_version if isinstance(python_version, str) else None
37
+ )
29
38
  except json.JSONDecodeError as e:
30
39
  raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
31
40
 
@@ -33,10 +42,7 @@ class Baseline:
33
42
  self.path.parent.mkdir(parents=True, exist_ok=True)
34
43
  self.path.write_text(
35
44
  json.dumps(
36
- {
37
- "functions": sorted(self.functions),
38
- "blocks": sorted(self.blocks),
39
- },
45
+ _baseline_payload(self.functions, self.blocks, self.python_version),
40
46
  indent=2,
41
47
  ensure_ascii=False,
42
48
  ),
@@ -45,14 +51,34 @@ class Baseline:
45
51
 
46
52
  @staticmethod
47
53
  def from_groups(
48
- func_groups: dict, block_groups: dict, path: str | Path = ""
49
- ) -> "Baseline":
54
+ func_groups: Mapping[str, object],
55
+ block_groups: Mapping[str, object],
56
+ path: str | Path = "",
57
+ python_version: str | None = None,
58
+ ) -> Baseline:
50
59
  bl = Baseline(path)
51
60
  bl.functions = set(func_groups.keys())
52
61
  bl.blocks = set(block_groups.keys())
62
+ bl.python_version = python_version
53
63
  return bl
54
64
 
55
- def diff(self, func_groups: dict, block_groups: dict) -> tuple[set, set]:
65
+ def diff(
66
+ self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
67
+ ) -> tuple[set[str], set[str]]:
56
68
  new_funcs = set(func_groups.keys()) - self.functions
57
69
  new_blocks = set(block_groups.keys()) - self.blocks
58
70
  return new_funcs, new_blocks
71
+
72
+
73
+ def _baseline_payload(
74
+ functions: set[str],
75
+ blocks: set[str],
76
+ python_version: str | None,
77
+ ) -> dict[str, Any]:
78
+ payload: dict[str, Any] = {
79
+ "functions": sorted(functions),
80
+ "blocks": sorted(blocks),
81
+ }
82
+ if python_version:
83
+ payload["python_version"] = python_version
84
+ return payload
codeclone/blockhash.py CHANGED
@@ -11,7 +11,7 @@ from __future__ import annotations
11
11
  import ast
12
12
  import hashlib
13
13
 
14
- from .normalize import NormalizationConfig, AstNormalizer
14
+ from .normalize import AstNormalizer, NormalizationConfig
15
15
 
16
16
 
17
17
  def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
codeclone/blocks.py CHANGED
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
15
15
  from .normalize import NormalizationConfig
16
16
 
17
17
 
18
- @dataclass(frozen=True)
18
+ @dataclass(frozen=True, slots=True)
19
19
  class BlockUnit:
20
20
  block_hash: str
21
21
  filepath: str
@@ -42,7 +42,8 @@ def extract_blocks(
42
42
 
43
43
  blocks: list[BlockUnit] = []
44
44
  last_start: int | None = None
45
- MIN_LINE_DISTANCE = 5 # suppress overlapping windows
45
+ # Allow some overlap (50%), but at least 3 lines apart
46
+ min_line_distance = max(block_size // 2, 3)
46
47
 
47
48
  for i in range(len(stmt_hashes) - block_size + 1):
48
49
  start = getattr(body[i], "lineno", None)
@@ -50,7 +51,7 @@ def extract_blocks(
50
51
  if not start or not end:
51
52
  continue
52
53
 
53
- if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
54
+ if last_start is not None and start - last_start < min_line_distance:
54
55
  continue
55
56
 
56
57
  bh = "|".join(stmt_hashes[i : i + block_size])
codeclone/cache.py CHANGED
@@ -8,47 +8,178 @@ Licensed under the MIT License.
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
+ import hashlib
12
+ import hmac
11
13
  import json
12
14
  import os
15
+ import secrets
16
+ from collections.abc import Mapping
13
17
  from dataclasses import asdict
14
18
  from pathlib import Path
15
- from typing import Any, Optional
19
+ from typing import TYPE_CHECKING, Any, TypedDict, cast
20
+
21
+ if TYPE_CHECKING:
22
+ from .blocks import BlockUnit
23
+ from .extractor import Unit
24
+
25
+ from .errors import CacheError
26
+
27
+
28
+ class FileStat(TypedDict):
29
+ mtime_ns: int
30
+ size: int
31
+
32
+
33
+ class UnitDict(TypedDict):
34
+ qualname: str
35
+ filepath: str
36
+ start_line: int
37
+ end_line: int
38
+ loc: int
39
+ stmt_count: int
40
+ fingerprint: str
41
+ loc_bucket: str
42
+
43
+
44
+ class BlockDict(TypedDict):
45
+ block_hash: str
46
+ filepath: str
47
+ qualname: str
48
+ start_line: int
49
+ end_line: int
50
+ size: int
51
+
52
+
53
+ class CacheEntry(TypedDict):
54
+ stat: FileStat
55
+ units: list[UnitDict]
56
+ blocks: list[BlockDict]
57
+
58
+
59
+ class CacheData(TypedDict):
60
+ version: str
61
+ files: dict[str, CacheEntry]
16
62
 
17
63
 
18
64
  class Cache:
65
+ __slots__ = ("data", "load_warning", "path", "secret")
66
+ CACHE_VERSION = "1.0"
67
+
19
68
  def __init__(self, path: str | Path):
20
69
  self.path = Path(path)
21
- self.data: dict[str, Any] = {"files": {}}
70
+ self.data: CacheData = {"version": self.CACHE_VERSION, "files": {}}
71
+ self.secret = self._load_secret()
72
+ self.load_warning: str | None = None
22
73
 
23
- def load(self) -> None:
24
- if self.path.exists():
74
+ def _load_secret(self) -> bytes:
75
+ """Load or create cache signing secret."""
76
+ # Store secret in the same directory as the cache file, named .cache_secret
77
+ # If cache is at ~/.cache/codeclone/cache.json, secret is
78
+ # ~/.cache/codeclone/.cache_secret
79
+ secret_path = self.path.parent / ".cache_secret"
80
+ if secret_path.exists():
81
+ return secret_path.read_bytes()
82
+ else:
83
+ secret = secrets.token_bytes(32)
25
84
  try:
26
- self.data = json.loads(self.path.read_text("utf-8"))
27
- except json.JSONDecodeError:
28
- # If cache is corrupted, start fresh
29
- self.data = {"files": {}}
85
+ self.path.parent.mkdir(parents=True, exist_ok=True)
86
+ secret_path.write_bytes(secret)
87
+ # Set restrictive permissions on secret file (Unix only)
88
+ if os.name == "posix":
89
+ secret_path.chmod(0o600)
90
+ except OSError:
91
+ pass
92
+ return secret
93
+
94
+ def _sign_data(self, data: Mapping[str, Any]) -> str:
95
+ """Create HMAC signature of cache data."""
96
+ # Sort keys for deterministic JSON serialization
97
+ data_str = json.dumps(data, sort_keys=True)
98
+ return hmac.new(self.secret, data_str.encode(), hashlib.sha256).hexdigest()
99
+
100
+ def load(self) -> None:
101
+ if not self.path.exists():
102
+ return
103
+
104
+ try:
105
+ raw = json.loads(self.path.read_text("utf-8"))
106
+ stored_sig = raw.get("_signature")
107
+
108
+ # Extract data without signature for verification
109
+ data = {k: v for k, v in raw.items() if k != "_signature"}
110
+
111
+ # Verify signature
112
+ expected_sig = self._sign_data(data)
113
+ if stored_sig != expected_sig:
114
+ self.load_warning = "Cache signature mismatch; ignoring cache."
115
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
116
+ return
117
+
118
+ if data.get("version") != self.CACHE_VERSION:
119
+ self.load_warning = (
120
+ "Cache version mismatch "
121
+ f"(found {data.get('version')}); ignoring cache."
122
+ )
123
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
124
+ return
125
+
126
+ # Basic structure check
127
+ if not isinstance(data.get("files"), dict):
128
+ self.load_warning = "Cache format invalid; ignoring cache."
129
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
130
+ return
131
+
132
+ self.data = cast(CacheData, data)
133
+ self.load_warning = None
134
+
135
+ except (json.JSONDecodeError, ValueError):
136
+ self.load_warning = "Cache corrupted; ignoring cache."
137
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
30
138
 
31
139
  def save(self) -> None:
32
- self.path.parent.mkdir(parents=True, exist_ok=True)
33
- self.path.write_text(
34
- json.dumps(self.data, ensure_ascii=False, indent=2),
35
- "utf-8",
36
- )
140
+ try:
141
+ self.path.parent.mkdir(parents=True, exist_ok=True)
142
+
143
+ # Add signature
144
+ data_with_sig = {**self.data, "_signature": self._sign_data(self.data)}
145
+
146
+ self.path.write_text(
147
+ json.dumps(data_with_sig, ensure_ascii=False, indent=2),
148
+ "utf-8",
149
+ )
150
+ except OSError as e:
151
+ raise CacheError(f"Failed to save cache: {e}") from e
152
+
153
+ def get_file_entry(self, filepath: str) -> CacheEntry | None:
154
+ entry = self.data["files"].get(filepath)
155
+
156
+ if entry is None:
157
+ return None
158
+
159
+ if not isinstance(entry, dict):
160
+ return None
161
+
162
+ required = {"stat", "units", "blocks"}
163
+ if not required.issubset(entry.keys()):
164
+ return None
37
165
 
38
- def get_file_entry(self, filepath: str) -> Optional[dict[str, Any]]:
39
- return self.data.get("files", {}).get(filepath)
166
+ return entry
40
167
 
41
168
  def put_file_entry(
42
- self, filepath: str, stat_sig: dict[str, Any], units: list, blocks: list
169
+ self,
170
+ filepath: str,
171
+ stat_sig: FileStat,
172
+ units: list[Unit],
173
+ blocks: list[BlockUnit],
43
174
  ) -> None:
44
- self.data.setdefault("files", {})[filepath] = {
175
+ self.data["files"][filepath] = {
45
176
  "stat": stat_sig,
46
- "units": [asdict(u) for u in units],
47
- "blocks": [asdict(b) for b in blocks],
177
+ "units": cast(list[UnitDict], cast(object, [asdict(u) for u in units])),
178
+ "blocks": cast(list[BlockDict], cast(object, [asdict(b) for b in blocks])),
48
179
  }
49
180
 
50
181
 
51
- def file_stat_signature(path: str) -> dict:
182
+ def file_stat_signature(path: str) -> FileStat:
52
183
  st = os.stat(path)
53
184
  return {
54
185
  "mtime_ns": st.st_mtime_ns,
codeclone/cfg.py CHANGED
@@ -9,48 +9,21 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
- from dataclasses import dataclass, field
13
- from typing import Iterable
12
+ from collections.abc import Iterable
13
+ from typing import Protocol, cast
14
14
 
15
+ from .cfg_model import CFG, Block
15
16
 
16
- # =========================
17
- # Core CFG structures
18
- # =========================
19
-
20
-
21
- @dataclass(eq=False)
22
- class Block:
23
- id: int
24
- statements: list[ast.stmt] = field(default_factory=list)
25
- successors: set["Block"] = field(default_factory=set)
26
- is_terminated: bool = False
27
-
28
- def add_successor(self, block: Block) -> None:
29
- self.successors.add(block)
30
-
31
- def __hash__(self) -> int:
32
- return hash(self.id)
17
+ __all__ = ["CFG", "CFGBuilder"]
33
18
 
34
- def __eq__(self, other: object) -> bool:
35
- return isinstance(other, Block) and self.id == other.id
19
+ TryStar = getattr(ast, "TryStar", ast.Try)
36
20
 
37
21
 
38
- @dataclass
39
- class CFG:
40
- qualname: str
41
- blocks: list[Block] = field(default_factory=list)
42
-
43
- entry: Block = field(init=False)
44
- exit: Block = field(init=False)
45
-
46
- def __post_init__(self) -> None:
47
- self.entry = self.create_block()
48
- self.exit = self.create_block()
49
-
50
- def create_block(self) -> Block:
51
- block = Block(id=len(self.blocks))
52
- self.blocks.append(block)
53
- return block
22
+ class _TryLike(Protocol):
23
+ body: list[ast.stmt]
24
+ handlers: list[ast.ExceptHandler]
25
+ orelse: list[ast.stmt]
26
+ finalbody: list[ast.stmt]
54
27
 
55
28
 
56
29
  # =========================
@@ -59,6 +32,8 @@ class CFG:
59
32
 
60
33
 
61
34
  class CFGBuilder:
35
+ __slots__ = ("cfg", "current")
36
+
62
37
  def __init__(self) -> None:
63
38
  self.cfg: CFG
64
39
  self.current: Block
@@ -110,8 +85,10 @@ class CFGBuilder:
110
85
  case ast.AsyncFor():
111
86
  self._visit_for(stmt) # Structure is identical to For
112
87
 
113
- case ast.Try() | ast.TryStar():
114
- self._visit_try(stmt)
88
+ case ast.Try():
89
+ self._visit_try(cast(_TryLike, stmt))
90
+ case _ if TryStar is not None and isinstance(stmt, TryStar):
91
+ self._visit_try(cast(_TryLike, stmt))
115
92
 
116
93
  case ast.With() | ast.AsyncWith():
117
94
  self._visit_with(stmt)
@@ -185,7 +162,8 @@ class CFGBuilder:
185
162
  self.current = after_block
186
163
 
187
164
  def _visit_with(self, stmt: ast.With | ast.AsyncWith) -> None:
188
- # Treat WITH as linear flow (enter -> body -> exit), but preserve block structure
165
+ # Treat WITH as linear flow (enter -> body -> exit), but preserve
166
+ # block structure
189
167
  # We record the context manager expression in the current block
190
168
  # Then we enter a new block for the body (to separate it structurally)
191
169
  # Then we enter a new block for 'after' (exit)
@@ -210,126 +188,73 @@ class CFGBuilder:
210
188
 
211
189
  self.current = after_block
212
190
 
213
- def _visit_try(self, stmt: ast.Try | ast.TryStar) -> None:
214
- # Simplified Try CFG:
215
- # Try Body -> [Handlers...] -> Finally/After
216
- # Try Body -> Else -> Finally/After
217
-
218
- try_block = self.cfg.create_block()
219
- self.current.add_successor(try_block)
220
-
221
- # We don't know WHERE in the try block exception happens, so we assume
222
- # any point in try block *could* jump to handlers.
223
- # But for structural hashing, we just process the body.
224
- # Ideally, we should link the try_block (or its end) to handlers?
225
- # A simple approximation:
226
- # 1. Process body.
227
- # 2. Link entry (or end of body) to handlers?
228
- # Let's do: Entry -> BodyBlock.
229
- # Entry -> HandlerBlocks (to represent potential jump).
230
-
231
- # Actually, let's keep it linear but branched.
232
- # Current -> TryBody
233
- # Current -> Handlers (Abstractly representing the jump)
191
+ def _visit_try(self, stmt: _TryLike) -> None:
192
+ try_entry = self.cfg.create_block()
193
+ self.current.add_successor(try_entry)
194
+ self.current = try_entry
234
195
 
235
196
  handlers_blocks = [self.cfg.create_block() for _ in stmt.handlers]
236
197
  else_block = self.cfg.create_block() if stmt.orelse else None
237
- final_block = self.cfg.create_block() # This is finally or after
198
+ final_block = self.cfg.create_block()
238
199
 
239
- # Link current to TryBody
240
- self.current = try_block
241
- self._visit_statements(stmt.body)
200
+ # Process each statement in try body
201
+ # Link each to exception handlers
202
+ for stmt_node in stmt.body:
203
+ if self.current.is_terminated:
204
+ break
205
+
206
+ # Current statement could raise exception
207
+ for h_block in handlers_blocks:
208
+ self.current.add_successor(h_block)
209
+
210
+ self._visit(stmt_node)
242
211
 
243
- # If try body finishes successfully:
212
+ # Normal exit from try
244
213
  if not self.current.is_terminated:
245
214
  if else_block:
246
215
  self.current.add_successor(else_block)
247
216
  else:
248
217
  self.current.add_successor(final_block)
249
218
 
250
- # Handle Else
251
- if else_block:
252
- self.current = else_block
253
- self._visit_statements(stmt.orelse)
254
- if not self.current.is_terminated:
255
- self.current.add_successor(final_block)
256
-
257
- # Handle Handlers
258
- # We assume control flow *could* jump from start of Try to any handler
259
- # (Technically from inside try, but we model structural containment)
260
- # To make fingerprints stable, we just need to ensure handlers are visited
261
- # and linked.
262
-
263
- # We link the *original* predecessor (before try) or the try_block start to handlers?
264
- # Let's link the `try_block` (as a container concept) to handlers.
265
- # But `try_block` was mutated by `_visit_statements`.
266
- # Let's use the `try_block` (start of try) to link to handlers.
267
- for h_block in handlers_blocks:
268
- try_block.add_successor(h_block)
269
-
270
- for handler, h_block in zip(stmt.handlers, handlers_blocks):
219
+ # Process handlers
220
+ for handler, h_block in zip(stmt.handlers, handlers_blocks, strict=True):
271
221
  self.current = h_block
272
- # Record exception type
273
222
  if handler.type:
274
223
  self.current.statements.append(ast.Expr(value=handler.type))
224
+
275
225
  self._visit_statements(handler.body)
276
226
  if not self.current.is_terminated:
277
227
  self.current.add_successor(final_block)
278
228
 
279
- # Finally logic:
280
- # If there is a finally block, `final_block` IS the finally block.
281
- # We visit it. Then we create a new `after_finally` block?
282
- # Or `final_block` is the start of finally.
229
+ # Process else
230
+ if else_block:
231
+ self.current = else_block
232
+ self._visit_statements(stmt.orelse)
233
+ if not self.current.is_terminated:
234
+ self.current.add_successor(final_block)
283
235
 
236
+ # Process finally
237
+ self.current = final_block
284
238
  if stmt.finalbody:
285
- self.current = final_block
286
239
  self._visit_statements(stmt.finalbody)
287
- # And then continue to next code?
288
- # Yes, finally flows to next statement.
289
- # Unless terminated.
290
-
291
- # If no finally, `final_block` is just the merge point (after).
292
- self.current = final_block
293
240
 
294
241
  def _visit_match(self, stmt: ast.Match) -> None:
295
- # Match subject -> Cases -> After
296
-
297
242
  self.current.statements.append(ast.Expr(value=stmt.subject))
298
243
 
299
- after_block = self.cfg.create_block()
300
-
301
- for case_ in stmt.cases:
302
- case_block = self.cfg.create_block()
303
- self.current.add_successor(case_block)
304
-
305
- # Save current context to restore for next case branching?
306
- # No, 'current' is the match subject block. It branches to ALL cases.
307
-
308
- # Visit Case
309
- # We must set self.current to case_block for visiting body
310
- # But we lose reference to 'match subject block' to link next case!
311
- # So we need a variable `subject_block`.
312
- pass
313
-
314
- # Re-implementing loop correctly
315
244
  subject_block = self.current
245
+ after_block = self.cfg.create_block()
316
246
 
317
247
  for case_ in stmt.cases:
318
248
  case_block = self.cfg.create_block()
319
249
  subject_block.add_successor(case_block)
320
250
 
321
251
  self.current = case_block
322
- # We could record the pattern here?
323
- # patterns are complex AST nodes. For now, let's skip pattern structure hash
324
- # and just hash the body. Or dump pattern as statement?
325
- # Pattern is not a statement.
326
- # Let's ignore pattern details for V1, or try to normalize it.
327
- # If we ignore pattern, then `case []:` and `case {}:` look same.
328
- # Ideally: `self.current.statements.append(case_.pattern)` but pattern is not stmt.
329
- # We can wrap in Expr? `ast.Expr(value=case_.pattern)`?
330
- # Pattern is NOT an Expr subclass in 3.10. It's `ast.pattern`.
331
- # So we cannot append it to `statements` list which expects `ast.stmt`.
332
- # We will ignore pattern structure for now (it's structural flow we care about).
252
+
253
+ # Record pattern structure
254
+ pattern_repr = ast.dump(case_.pattern, annotate_fields=False)
255
+ self.current.statements.append(
256
+ ast.Expr(value=ast.Constant(value=f"PATTERN:{pattern_repr}"))
257
+ )
333
258
 
334
259
  self._visit_statements(case_.body)
335
260
  if not self.current.is_terminated:
codeclone/cfg_model.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import ast
12
+ from dataclasses import dataclass, field
13
+
14
+
15
+ @dataclass(eq=False, slots=True)
16
+ class Block:
17
+ id: int
18
+ statements: list[ast.stmt] = field(default_factory=list)
19
+ successors: set[Block] = field(default_factory=set)
20
+ is_terminated: bool = False
21
+
22
+ def add_successor(self, block: Block) -> None:
23
+ self.successors.add(block)
24
+
25
+ def __hash__(self) -> int:
26
+ return hash(self.id)
27
+
28
+ def __eq__(self, other: object) -> bool:
29
+ return isinstance(other, Block) and self.id == other.id
30
+
31
+
32
+ @dataclass(slots=True)
33
+ class CFG:
34
+ qualname: str
35
+ blocks: list[Block] = field(default_factory=list)
36
+
37
+ entry: Block = field(init=False)
38
+ exit: Block = field(init=False)
39
+
40
+ def __post_init__(self) -> None:
41
+ self.entry = self.create_block()
42
+ self.exit = self.create_block()
43
+
44
+ def create_block(self) -> Block:
45
+ block = Block(id=len(self.blocks))
46
+ self.blocks.append(block)
47
+ return block