codeclone 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {codeclone-1.2.0 → codeclone-1.2.1}/PKG-INFO +53 -35
  2. {codeclone-1.2.0 → codeclone-1.2.1}/README.md +49 -32
  3. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/__init__.py +1 -1
  4. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/baseline.py +33 -7
  5. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/blockhash.py +1 -1
  6. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/blocks.py +4 -3
  7. codeclone-1.2.1/codeclone/cache.py +187 -0
  8. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/cfg.py +53 -128
  9. codeclone-1.2.1/codeclone/cfg_model.py +47 -0
  10. codeclone-1.2.1/codeclone/cli.py +603 -0
  11. codeclone-1.2.1/codeclone/errors.py +27 -0
  12. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/extractor.py +101 -24
  13. codeclone-1.2.1/codeclone/html_report.py +492 -0
  14. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/normalize.py +21 -14
  15. codeclone-1.2.1/codeclone/py.typed +0 -0
  16. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/report.py +23 -12
  17. codeclone-1.2.1/codeclone/scanner.py +111 -0
  18. codeclone-1.2.1/codeclone/templates.py +1262 -0
  19. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/PKG-INFO +53 -35
  20. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/SOURCES.txt +15 -1
  21. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/requires.txt +2 -0
  22. {codeclone-1.2.0 → codeclone-1.2.1}/pyproject.toml +36 -4
  23. {codeclone-1.2.0 → codeclone-1.2.1}/tests/test_baseline.py +29 -10
  24. codeclone-1.2.1/tests/test_blockhash.py +11 -0
  25. codeclone-1.2.1/tests/test_blocks.py +107 -0
  26. codeclone-1.2.1/tests/test_cache.py +198 -0
  27. codeclone-1.2.1/tests/test_cfg.py +417 -0
  28. codeclone-1.2.1/tests/test_cfg_model.py +18 -0
  29. codeclone-1.2.1/tests/test_cli_inprocess.py +812 -0
  30. codeclone-1.2.1/tests/test_cli_main_guard.py +17 -0
  31. {codeclone-1.2.0 → codeclone-1.2.1}/tests/test_cli_smoke.py +9 -6
  32. codeclone-1.2.1/tests/test_cli_unit.py +69 -0
  33. codeclone-1.2.1/tests/test_extractor.py +212 -0
  34. codeclone-1.2.1/tests/test_fingerprint.py +15 -0
  35. codeclone-1.2.1/tests/test_html_report.py +216 -0
  36. codeclone-1.2.1/tests/test_init.py +26 -0
  37. codeclone-1.2.1/tests/test_normalize.py +147 -0
  38. codeclone-1.2.1/tests/test_report.py +67 -0
  39. codeclone-1.2.1/tests/test_scanner_extra.py +165 -0
  40. codeclone-1.2.1/tests/test_security.py +44 -0
  41. codeclone-1.2.0/codeclone/cache.py +0 -56
  42. codeclone-1.2.0/codeclone/cli.py +0 -409
  43. codeclone-1.2.0/codeclone/html_report.py +0 -936
  44. codeclone-1.2.0/codeclone/scanner.py +0 -48
  45. codeclone-1.2.0/tests/test_blocks.py +0 -32
  46. codeclone-1.2.0/tests/test_cfg.py +0 -176
  47. codeclone-1.2.0/tests/test_extractor.py +0 -49
  48. codeclone-1.2.0/tests/test_html_report.py +0 -44
  49. codeclone-1.2.0/tests/test_normalize.py +0 -22
  50. codeclone-1.2.0/tests/test_report.py +0 -24
  51. {codeclone-1.2.0 → codeclone-1.2.1}/LICENSE +0 -0
  52. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/fingerprint.py +0 -0
  53. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/dependency_links.txt +0 -0
  54. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/entry_points.txt +0 -0
  55. {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/top_level.txt +0 -0
  56. {codeclone-1.2.0 → codeclone-1.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -10,11 +10,10 @@ Project-URL: Repository, https://github.com/orenlab/codeclone
10
10
  Project-URL: Issues, https://github.com/orenlab/codeclone/issues
11
11
  Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
12
12
  Project-URL: Documentation, https://github.com/orenlab/codeclone/tree/main/docs
13
- Keywords: python,ast,code-clone,duplication,static-analysis,ci,architecture
13
+ Keywords: python,ast,cfg,code-clone,duplication,static-analysis,architecture,control-flow,ci
14
14
  Classifier: Development Status :: 5 - Production/Stable
15
15
  Classifier: Intended Audience :: Developers
16
16
  Classifier: Topic :: Software Development :: Quality Assurance
17
- Classifier: Topic :: Software Development :: Code Generators
18
17
  Classifier: Topic :: Software Development :: Testing
19
18
  Classifier: Typing :: Typed
20
19
  Classifier: License :: OSI Approved :: MIT License
@@ -32,19 +31,22 @@ Requires-Dist: pygments>=2.19.2
32
31
  Requires-Dist: rich>=14.3.2
33
32
  Provides-Extra: dev
34
33
  Requires-Dist: pytest>=9.0.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=6.1.0; extra == "dev"
35
35
  Requires-Dist: build>=1.2.0; extra == "dev"
36
36
  Requires-Dist: twine>=5.0.0; extra == "dev"
37
37
  Requires-Dist: mypy>=1.19.1; extra == "dev"
38
+ Requires-Dist: ruff>=0.12.0; extra == "dev"
38
39
  Dynamic: license-file
39
40
 
40
41
  # CodeClone
41
42
 
42
43
  [![PyPI](https://img.shields.io/pypi/v/codeclone.svg)](https://pypi.org/project/codeclone/)
43
44
  [![Downloads](https://img.shields.io/pypi/dm/codeclone.svg)](https://pypi.org/project/codeclone/)
45
+ [![tests](https://github.com/orenlab/codeclone/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
44
46
  [![Python](https://img.shields.io/pypi/pyversions/codeclone.svg)](https://pypi.org/project/codeclone/)
45
47
  [![License](https://img.shields.io/pypi/l/codeclone.svg)](LICENSE)
46
48
 
47
- **CodeClone** is a Python code clone detector based on **normalized AST and control-flow graphs (CFG)**.
49
+ **CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
48
50
  It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
49
51
 
50
52
  CodeClone is designed to help teams:
@@ -53,15 +55,16 @@ CodeClone is designed to help teams:
53
55
  - identify architectural hotspots,
54
56
  - prevent *new* duplication via CI and pre-commit hooks.
55
57
 
56
- Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against renaming,
57
- formatting, and minor refactoring.
58
+ Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
59
+ renaming, formatting, and minor refactoring.
58
60
 
59
61
  ---
60
62
 
61
63
  ## Why CodeClone?
62
64
 
63
65
  Most existing tools detect *textual* duplication.
64
- CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or architectural drift.
66
+ CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
67
+ architectural drift.
65
68
 
66
69
  Typical use cases:
67
70
 
@@ -79,11 +82,11 @@ Typical use cases:
79
82
  - Detects functions and methods with identical **control-flow structure**.
80
83
  - Based on **Control Flow Graph (CFG)** fingerprinting.
81
84
  - Robust to:
82
- - variable renaming,
83
- - constant changes,
84
- - attribute renaming,
85
- - formatting differences,
86
- - docstrings and type annotations.
85
+ - variable renaming,
86
+ - constant changes,
87
+ - attribute renaming,
88
+ - formatting differences,
89
+ - docstrings and type annotations.
87
90
  - Ideal for spotting architectural duplication across layers.
88
91
 
89
92
  ### Block-level clone detection (Type-3-lite)
@@ -91,29 +94,29 @@ Typical use cases:
91
94
  - Detects repeated **statement blocks** inside larger functions.
92
95
  - Uses sliding windows over CFG-normalized statement sequences.
93
96
  - Targets:
94
- - validation blocks,
95
- - guard clauses,
96
- - repeated orchestration logic.
97
+ - validation blocks,
98
+ - guard clauses,
99
+ - repeated orchestration logic.
97
100
  - Carefully filtered to reduce noise:
98
- - no overlapping windows,
99
- - no clones inside the same function,
100
- - no `__init__` noise,
101
- - size and statement-count thresholds.
101
+ - no overlapping windows,
102
+ - no clones inside the same function,
103
+ - no `__init__` noise,
104
+ - size and statement-count thresholds.
102
105
 
103
106
  ### Control-Flow Awareness (CFG v1)
104
107
 
105
108
  - Each function is converted into a **Control Flow Graph**.
106
109
  - CFG nodes contain normalized AST statements.
107
110
  - CFG edges represent structural control flow:
108
- - `if` / `else`
109
- - `for` / `async for` / `while`
110
- - `try` / `except` / `finally`
111
- - `with` / `async with`
112
- - `match` / `case` (Python 3.10+)
111
+ - `if` / `else`
112
+ - `for` / `async for` / `while`
113
+ - `try` / `except` / `finally`
114
+ - `with` / `async with`
115
+ - `match` / `case` (Python 3.10+)
113
116
  - Current CFG semantics (v1):
114
- - `break` and `continue` are treated as statements (no jump targets),
115
- - after-blocks are explicit and always present,
116
- - focus is on **structural similarity**, not precise runtime semantics.
117
+ - `break` and `continue` are treated as statements (no jump targets),
118
+ - after-blocks are explicit and always present,
119
+ - focus is on **structural similarity**, not precise runtime semantics.
117
120
 
118
121
  This design keeps clone detection **stable, deterministic, and low-noise**.
119
122
 
@@ -122,6 +125,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
122
125
  - AST + CFG normalization instead of token matching.
123
126
  - Conservative defaults tuned for real-world Python projects.
124
127
  - Explicit thresholds for size and statement count.
128
+ - No probabilistic scoring or heuristic similarity thresholds.
125
129
  - Focus on *architectural duplication*, not micro-similarities.
126
130
 
127
131
  ### CI-friendly baseline mode
@@ -188,14 +192,26 @@ Commit the generated baseline file to the repository.
188
192
  ### 2. Use in CI
189
193
 
190
194
  ```bash
191
- codeclone . --fail-on-new
195
+ codeclone . --fail-on-new --no-progress
192
196
  ```
193
197
 
194
198
  Behavior:
195
199
 
196
- - existing clones are allowed,
197
- - build fails if *new* clones appear,
198
- - refactoring that removes duplication is always allowed.
200
+ - existing clones are allowed,
201
+ - the build fails if *new* clones appear,
202
+ - refactoring that removes duplication is always allowed.
203
+
204
+ `--fail-on-new` exits with a non-zero code when new clones are detected.
205
+
206
+ ### Python Version Consistency for Baseline Checks
207
+
208
+ Due to inherent differences in Python’s AST between interpreter versions, baseline
209
+ generation and verification must be performed using the same Python version.
210
+
211
+ This ensures deterministic and reproducible clone detection results.
212
+
213
+ CI checks therefore pin baseline verification to a single Python version, while the
214
+ test matrix continues to validate compatibility across Python 3.10–3.14.
199
215
 
200
216
  ---
201
217
 
@@ -203,14 +219,14 @@ Behavior:
203
219
 
204
220
  ```yaml
205
221
  repos:
206
- - repo: local
222
+ - repo: local
207
223
  hooks:
208
- - id: codeclone
224
+ - id: codeclone
209
225
  name: CodeClone
210
226
  entry: codeclone
211
227
  language: python
212
- args: [".", "--fail-on-new"]
213
- types: [python]
228
+ args: [ ".", "--fail-on-new" ]
229
+ types: [ python ]
214
230
  ```
215
231
 
216
232
  ---
@@ -243,6 +259,7 @@ repos:
243
259
  6. Apply conservative filters to suppress noise.
244
260
 
245
261
  See the architectural overview:
262
+
246
263
  - [docs/architecture.md](docs/architecture.md)
247
264
 
248
265
  ---
@@ -255,6 +272,7 @@ to improve structural clone detection robustness.
255
272
  The CFG is a **structural abstraction**, not a runtime execution model.
256
273
 
257
274
  See full design and semantics:
275
+
258
276
  - [docs/cfg.md](docs/cfg.md)
259
277
 
260
278
  ---
@@ -2,10 +2,11 @@
2
2
 
3
3
  [![PyPI](https://img.shields.io/pypi/v/codeclone.svg)](https://pypi.org/project/codeclone/)
4
4
  [![Downloads](https://img.shields.io/pypi/dm/codeclone.svg)](https://pypi.org/project/codeclone/)
5
+ [![tests](https://github.com/orenlab/codeclone/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
5
6
  [![Python](https://img.shields.io/pypi/pyversions/codeclone.svg)](https://pypi.org/project/codeclone/)
6
7
  [![License](https://img.shields.io/pypi/l/codeclone.svg)](LICENSE)
7
8
 
8
- **CodeClone** is a Python code clone detector based on **normalized AST and control-flow graphs (CFG)**.
9
+ **CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
9
10
  It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
10
11
 
11
12
  CodeClone is designed to help teams:
@@ -14,15 +15,16 @@ CodeClone is designed to help teams:
14
15
  - identify architectural hotspots,
15
16
  - prevent *new* duplication via CI and pre-commit hooks.
16
17
 
17
- Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against renaming,
18
- formatting, and minor refactoring.
18
+ Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
19
+ renaming, formatting, and minor refactoring.
19
20
 
20
21
  ---
21
22
 
22
23
  ## Why CodeClone?
23
24
 
24
25
  Most existing tools detect *textual* duplication.
25
- CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or architectural drift.
26
+ CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
27
+ architectural drift.
26
28
 
27
29
  Typical use cases:
28
30
 
@@ -40,11 +42,11 @@ Typical use cases:
40
42
  - Detects functions and methods with identical **control-flow structure**.
41
43
  - Based on **Control Flow Graph (CFG)** fingerprinting.
42
44
  - Robust to:
43
- - variable renaming,
44
- - constant changes,
45
- - attribute renaming,
46
- - formatting differences,
47
- - docstrings and type annotations.
45
+ - variable renaming,
46
+ - constant changes,
47
+ - attribute renaming,
48
+ - formatting differences,
49
+ - docstrings and type annotations.
48
50
  - Ideal for spotting architectural duplication across layers.
49
51
 
50
52
  ### Block-level clone detection (Type-3-lite)
@@ -52,29 +54,29 @@ Typical use cases:
52
54
  - Detects repeated **statement blocks** inside larger functions.
53
55
  - Uses sliding windows over CFG-normalized statement sequences.
54
56
  - Targets:
55
- - validation blocks,
56
- - guard clauses,
57
- - repeated orchestration logic.
57
+ - validation blocks,
58
+ - guard clauses,
59
+ - repeated orchestration logic.
58
60
  - Carefully filtered to reduce noise:
59
- - no overlapping windows,
60
- - no clones inside the same function,
61
- - no `__init__` noise,
62
- - size and statement-count thresholds.
61
+ - no overlapping windows,
62
+ - no clones inside the same function,
63
+ - no `__init__` noise,
64
+ - size and statement-count thresholds.
63
65
 
64
66
  ### Control-Flow Awareness (CFG v1)
65
67
 
66
68
  - Each function is converted into a **Control Flow Graph**.
67
69
  - CFG nodes contain normalized AST statements.
68
70
  - CFG edges represent structural control flow:
69
- - `if` / `else`
70
- - `for` / `async for` / `while`
71
- - `try` / `except` / `finally`
72
- - `with` / `async with`
73
- - `match` / `case` (Python 3.10+)
71
+ - `if` / `else`
72
+ - `for` / `async for` / `while`
73
+ - `try` / `except` / `finally`
74
+ - `with` / `async with`
75
+ - `match` / `case` (Python 3.10+)
74
76
  - Current CFG semantics (v1):
75
- - `break` and `continue` are treated as statements (no jump targets),
76
- - after-blocks are explicit and always present,
77
- - focus is on **structural similarity**, not precise runtime semantics.
77
+ - `break` and `continue` are treated as statements (no jump targets),
78
+ - after-blocks are explicit and always present,
79
+ - focus is on **structural similarity**, not precise runtime semantics.
78
80
 
79
81
  This design keeps clone detection **stable, deterministic, and low-noise**.
80
82
 
@@ -83,6 +85,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
83
85
  - AST + CFG normalization instead of token matching.
84
86
  - Conservative defaults tuned for real-world Python projects.
85
87
  - Explicit thresholds for size and statement count.
88
+ - No probabilistic scoring or heuristic similarity thresholds.
86
89
  - Focus on *architectural duplication*, not micro-similarities.
87
90
 
88
91
  ### CI-friendly baseline mode
@@ -149,14 +152,26 @@ Commit the generated baseline file to the repository.
149
152
  ### 2. Use in CI
150
153
 
151
154
  ```bash
152
- codeclone . --fail-on-new
155
+ codeclone . --fail-on-new --no-progress
153
156
  ```
154
157
 
155
158
  Behavior:
156
159
 
157
- - existing clones are allowed,
158
- - build fails if *new* clones appear,
159
- - refactoring that removes duplication is always allowed.
160
+ - existing clones are allowed,
161
+ - the build fails if *new* clones appear,
162
+ - refactoring that removes duplication is always allowed.
163
+
164
+ `--fail-on-new` exits with a non-zero code when new clones are detected.
165
+
166
+ ### Python Version Consistency for Baseline Checks
167
+
168
+ Due to inherent differences in Python’s AST between interpreter versions, baseline
169
+ generation and verification must be performed using the same Python version.
170
+
171
+ This ensures deterministic and reproducible clone detection results.
172
+
173
+ CI checks therefore pin baseline verification to a single Python version, while the
174
+ test matrix continues to validate compatibility across Python 3.10–3.14.
160
175
 
161
176
  ---
162
177
 
@@ -164,14 +179,14 @@ Behavior:
164
179
 
165
180
  ```yaml
166
181
  repos:
167
- - repo: local
182
+ - repo: local
168
183
  hooks:
169
- - id: codeclone
184
+ - id: codeclone
170
185
  name: CodeClone
171
186
  entry: codeclone
172
187
  language: python
173
- args: [".", "--fail-on-new"]
174
- types: [python]
188
+ args: [ ".", "--fail-on-new" ]
189
+ types: [ python ]
175
190
  ```
176
191
 
177
192
  ---
@@ -204,6 +219,7 @@ repos:
204
219
  6. Apply conservative filters to suppress noise.
205
220
 
206
221
  See the architectural overview:
222
+
207
223
  - [docs/architecture.md](docs/architecture.md)
208
224
 
209
225
  ---
@@ -216,6 +232,7 @@ to improve structural clone detection robustness.
216
232
  The CFG is a **structural abstraction**, not a runtime execution model.
217
233
 
218
234
  See full design and semantics:
235
+
219
236
  - [docs/cfg.md](docs/cfg.md)
220
237
 
221
238
  ---
@@ -6,7 +6,7 @@ Copyright (c) 2026 Den Rozhnovskiy
6
6
  Licensed under the MIT License.
7
7
  """
8
8
 
9
- from importlib.metadata import version, PackageNotFoundError
9
+ from importlib.metadata import PackageNotFoundError, version
10
10
 
11
11
  try:
12
12
  __version__ = version("codeclone")
@@ -9,14 +9,19 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import json
12
+ from collections.abc import Mapping
12
13
  from pathlib import Path
14
+ from typing import Any
13
15
 
14
16
 
15
17
  class Baseline:
18
+ __slots__ = ("blocks", "functions", "path", "python_version")
19
+
16
20
  def __init__(self, path: str | Path):
17
21
  self.path = Path(path)
18
22
  self.functions: set[str] = set()
19
23
  self.blocks: set[str] = set()
24
+ self.python_version: str | None = None
20
25
 
21
26
  def load(self) -> None:
22
27
  if not self.path.exists():
@@ -26,6 +31,10 @@ class Baseline:
26
31
  data = json.loads(self.path.read_text("utf-8"))
27
32
  self.functions = set(data.get("functions", []))
28
33
  self.blocks = set(data.get("blocks", []))
34
+ python_version = data.get("python_version")
35
+ self.python_version = (
36
+ python_version if isinstance(python_version, str) else None
37
+ )
29
38
  except json.JSONDecodeError as e:
30
39
  raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
31
40
 
@@ -33,10 +42,7 @@ class Baseline:
33
42
  self.path.parent.mkdir(parents=True, exist_ok=True)
34
43
  self.path.write_text(
35
44
  json.dumps(
36
- {
37
- "functions": sorted(self.functions),
38
- "blocks": sorted(self.blocks),
39
- },
45
+ _baseline_payload(self.functions, self.blocks, self.python_version),
40
46
  indent=2,
41
47
  ensure_ascii=False,
42
48
  ),
@@ -45,14 +51,34 @@ class Baseline:
45
51
 
46
52
  @staticmethod
47
53
  def from_groups(
48
- func_groups: dict, block_groups: dict, path: str | Path = ""
49
- ) -> "Baseline":
54
+ func_groups: Mapping[str, object],
55
+ block_groups: Mapping[str, object],
56
+ path: str | Path = "",
57
+ python_version: str | None = None,
58
+ ) -> Baseline:
50
59
  bl = Baseline(path)
51
60
  bl.functions = set(func_groups.keys())
52
61
  bl.blocks = set(block_groups.keys())
62
+ bl.python_version = python_version
53
63
  return bl
54
64
 
55
- def diff(self, func_groups: dict, block_groups: dict) -> tuple[set, set]:
65
+ def diff(
66
+ self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
67
+ ) -> tuple[set[str], set[str]]:
56
68
  new_funcs = set(func_groups.keys()) - self.functions
57
69
  new_blocks = set(block_groups.keys()) - self.blocks
58
70
  return new_funcs, new_blocks
71
+
72
+
73
+ def _baseline_payload(
74
+ functions: set[str],
75
+ blocks: set[str],
76
+ python_version: str | None,
77
+ ) -> dict[str, Any]:
78
+ payload: dict[str, Any] = {
79
+ "functions": sorted(functions),
80
+ "blocks": sorted(blocks),
81
+ }
82
+ if python_version:
83
+ payload["python_version"] = python_version
84
+ return payload
@@ -11,7 +11,7 @@ from __future__ import annotations
11
11
  import ast
12
12
  import hashlib
13
13
 
14
- from .normalize import NormalizationConfig, AstNormalizer
14
+ from .normalize import AstNormalizer, NormalizationConfig
15
15
 
16
16
 
17
17
  def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
15
15
  from .normalize import NormalizationConfig
16
16
 
17
17
 
18
- @dataclass(frozen=True)
18
+ @dataclass(frozen=True, slots=True)
19
19
  class BlockUnit:
20
20
  block_hash: str
21
21
  filepath: str
@@ -42,7 +42,8 @@ def extract_blocks(
42
42
 
43
43
  blocks: list[BlockUnit] = []
44
44
  last_start: int | None = None
45
- MIN_LINE_DISTANCE = 5 # suppress overlapping windows
45
+ # Allow some overlap (50%), but at least 3 lines apart
46
+ min_line_distance = max(block_size // 2, 3)
46
47
 
47
48
  for i in range(len(stmt_hashes) - block_size + 1):
48
49
  start = getattr(body[i], "lineno", None)
@@ -50,7 +51,7 @@ def extract_blocks(
50
51
  if not start or not end:
51
52
  continue
52
53
 
53
- if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
54
+ if last_start is not None and start - last_start < min_line_distance:
54
55
  continue
55
56
 
56
57
  bh = "|".join(stmt_hashes[i : i + block_size])
@@ -0,0 +1,187 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import hmac
13
+ import json
14
+ import os
15
+ import secrets
16
+ from collections.abc import Mapping
17
+ from dataclasses import asdict
18
+ from pathlib import Path
19
+ from typing import TYPE_CHECKING, Any, TypedDict, cast
20
+
21
+ if TYPE_CHECKING:
22
+ from .blocks import BlockUnit
23
+ from .extractor import Unit
24
+
25
+ from .errors import CacheError
26
+
27
+
28
+ class FileStat(TypedDict):
29
+ mtime_ns: int
30
+ size: int
31
+
32
+
33
+ class UnitDict(TypedDict):
34
+ qualname: str
35
+ filepath: str
36
+ start_line: int
37
+ end_line: int
38
+ loc: int
39
+ stmt_count: int
40
+ fingerprint: str
41
+ loc_bucket: str
42
+
43
+
44
+ class BlockDict(TypedDict):
45
+ block_hash: str
46
+ filepath: str
47
+ qualname: str
48
+ start_line: int
49
+ end_line: int
50
+ size: int
51
+
52
+
53
+ class CacheEntry(TypedDict):
54
+ stat: FileStat
55
+ units: list[UnitDict]
56
+ blocks: list[BlockDict]
57
+
58
+
59
+ class CacheData(TypedDict):
60
+ version: str
61
+ files: dict[str, CacheEntry]
62
+
63
+
64
+ class Cache:
65
+ __slots__ = ("data", "load_warning", "path", "secret")
66
+ CACHE_VERSION = "1.0"
67
+
68
+ def __init__(self, path: str | Path):
69
+ self.path = Path(path)
70
+ self.data: CacheData = {"version": self.CACHE_VERSION, "files": {}}
71
+ self.secret = self._load_secret()
72
+ self.load_warning: str | None = None
73
+
74
+ def _load_secret(self) -> bytes:
75
+ """Load or create cache signing secret."""
76
+ # Store secret in the same directory as the cache file, named .cache_secret
77
+ # If cache is at ~/.cache/codeclone/cache.json, secret is
78
+ # ~/.cache/codeclone/.cache_secret
79
+ secret_path = self.path.parent / ".cache_secret"
80
+ if secret_path.exists():
81
+ return secret_path.read_bytes()
82
+ else:
83
+ secret = secrets.token_bytes(32)
84
+ try:
85
+ self.path.parent.mkdir(parents=True, exist_ok=True)
86
+ secret_path.write_bytes(secret)
87
+ # Set restrictive permissions on secret file (Unix only)
88
+ if os.name == "posix":
89
+ secret_path.chmod(0o600)
90
+ except OSError:
91
+ pass
92
+ return secret
93
+
94
+ def _sign_data(self, data: Mapping[str, Any]) -> str:
95
+ """Create HMAC signature of cache data."""
96
+ # Sort keys for deterministic JSON serialization
97
+ data_str = json.dumps(data, sort_keys=True)
98
+ return hmac.new(self.secret, data_str.encode(), hashlib.sha256).hexdigest()
99
+
100
+ def load(self) -> None:
101
+ if not self.path.exists():
102
+ return
103
+
104
+ try:
105
+ raw = json.loads(self.path.read_text("utf-8"))
106
+ stored_sig = raw.get("_signature")
107
+
108
+ # Extract data without signature for verification
109
+ data = {k: v for k, v in raw.items() if k != "_signature"}
110
+
111
+ # Verify signature
112
+ expected_sig = self._sign_data(data)
113
+ if stored_sig != expected_sig:
114
+ self.load_warning = "Cache signature mismatch; ignoring cache."
115
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
116
+ return
117
+
118
+ if data.get("version") != self.CACHE_VERSION:
119
+ self.load_warning = (
120
+ "Cache version mismatch "
121
+ f"(found {data.get('version')}); ignoring cache."
122
+ )
123
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
124
+ return
125
+
126
+ # Basic structure check
127
+ if not isinstance(data.get("files"), dict):
128
+ self.load_warning = "Cache format invalid; ignoring cache."
129
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
130
+ return
131
+
132
+ self.data = cast(CacheData, data)
133
+ self.load_warning = None
134
+
135
+ except (json.JSONDecodeError, ValueError):
136
+ self.load_warning = "Cache corrupted; ignoring cache."
137
+ self.data = {"version": self.CACHE_VERSION, "files": {}}
138
+
139
+ def save(self) -> None:
140
+ try:
141
+ self.path.parent.mkdir(parents=True, exist_ok=True)
142
+
143
+ # Add signature
144
+ data_with_sig = {**self.data, "_signature": self._sign_data(self.data)}
145
+
146
+ self.path.write_text(
147
+ json.dumps(data_with_sig, ensure_ascii=False, indent=2),
148
+ "utf-8",
149
+ )
150
+ except OSError as e:
151
+ raise CacheError(f"Failed to save cache: {e}") from e
152
+
153
+ def get_file_entry(self, filepath: str) -> CacheEntry | None:
154
+ entry = self.data["files"].get(filepath)
155
+
156
+ if entry is None:
157
+ return None
158
+
159
+ if not isinstance(entry, dict):
160
+ return None
161
+
162
+ required = {"stat", "units", "blocks"}
163
+ if not required.issubset(entry.keys()):
164
+ return None
165
+
166
+ return entry
167
+
168
+ def put_file_entry(
169
+ self,
170
+ filepath: str,
171
+ stat_sig: FileStat,
172
+ units: list[Unit],
173
+ blocks: list[BlockUnit],
174
+ ) -> None:
175
+ self.data["files"][filepath] = {
176
+ "stat": stat_sig,
177
+ "units": cast(list[UnitDict], cast(object, [asdict(u) for u in units])),
178
+ "blocks": cast(list[BlockDict], cast(object, [asdict(b) for b in blocks])),
179
+ }
180
+
181
+
182
+ def file_stat_signature(path: str) -> FileStat:
183
+ st = os.stat(path)
184
+ return {
185
+ "mtime_ns": st.st_mtime_ns,
186
+ "size": st.st_size,
187
+ }