codeclone 1.0.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. codeclone-1.2.0/PKG-INFO +264 -0
  2. codeclone-1.2.0/README.md +225 -0
  3. codeclone-1.2.0/codeclone/__init__.py +16 -0
  4. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/baseline.py +21 -9
  5. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/blockhash.py +10 -1
  6. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/blocks.py +26 -16
  7. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/cache.py +20 -6
  8. codeclone-1.2.0/codeclone/cfg.py +338 -0
  9. codeclone-1.2.0/codeclone/cli.py +409 -0
  10. codeclone-1.2.0/codeclone/extractor.py +169 -0
  11. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/fingerprint.py +11 -1
  12. codeclone-1.2.0/codeclone/html_report.py +936 -0
  13. codeclone-1.2.0/codeclone/normalize.py +130 -0
  14. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/report.py +29 -13
  15. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone/scanner.py +24 -4
  16. codeclone-1.2.0/codeclone.egg-info/PKG-INFO +264 -0
  17. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone.egg-info/SOURCES.txt +4 -0
  18. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone.egg-info/requires.txt +3 -0
  19. codeclone-1.2.0/pyproject.toml +77 -0
  20. codeclone-1.2.0/tests/test_baseline.py +62 -0
  21. {codeclone-1.0.0 → codeclone-1.2.0}/tests/test_blocks.py +1 -1
  22. codeclone-1.2.0/tests/test_cfg.py +176 -0
  23. codeclone-1.2.0/tests/test_cli_smoke.py +107 -0
  24. {codeclone-1.0.0 → codeclone-1.2.0}/tests/test_extractor.py +1 -1
  25. codeclone-1.2.0/tests/test_html_report.py +44 -0
  26. {codeclone-1.0.0 → codeclone-1.2.0}/tests/test_report.py +1 -1
  27. codeclone-1.0.0/PKG-INFO +0 -211
  28. codeclone-1.0.0/README.md +0 -178
  29. codeclone-1.0.0/codeclone/__init__.py +0 -0
  30. codeclone-1.0.0/codeclone/cli.py +0 -145
  31. codeclone-1.0.0/codeclone/extractor.py +0 -109
  32. codeclone-1.0.0/codeclone/normalize.py +0 -83
  33. codeclone-1.0.0/codeclone.egg-info/PKG-INFO +0 -211
  34. codeclone-1.0.0/pyproject.toml +0 -70
  35. codeclone-1.0.0/tests/test_baseline.py +0 -15
  36. codeclone-1.0.0/tests/test_cli_smoke.py +0 -24
  37. {codeclone-1.0.0 → codeclone-1.2.0}/LICENSE +0 -0
  38. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone.egg-info/dependency_links.txt +0 -0
  39. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone.egg-info/entry_points.txt +0 -0
  40. {codeclone-1.0.0 → codeclone-1.2.0}/codeclone.egg-info/top_level.txt +0 -0
  41. {codeclone-1.0.0 → codeclone-1.2.0}/setup.cfg +0 -0
  42. {codeclone-1.0.0 → codeclone-1.2.0}/tests/test_normalize.py +0 -0
@@ -0,0 +1,264 @@
1
+ Metadata-Version: 2.4
2
+ Name: codeclone
3
+ Version: 1.2.0
4
+ Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
+ Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
+ Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/orenlab/codeclone
9
+ Project-URL: Repository, https://github.com/orenlab/codeclone
10
+ Project-URL: Issues, https://github.com/orenlab/codeclone/issues
11
+ Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
12
+ Project-URL: Documentation, https://github.com/orenlab/codeclone/tree/main/docs
13
+ Keywords: python,ast,code-clone,duplication,static-analysis,ci,architecture
14
+ Classifier: Development Status :: 5 - Production/Stable
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Topic :: Software Development :: Quality Assurance
17
+ Classifier: Topic :: Software Development :: Code Generators
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Classifier: Typing :: Typed
20
+ Classifier: License :: OSI Approved :: MIT License
21
+ Classifier: Programming Language :: Python :: 3
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
26
+ Classifier: Programming Language :: Python :: 3.14
27
+ Classifier: Operating System :: OS Independent
28
+ Requires-Python: >=3.10
29
+ Description-Content-Type: text/markdown
30
+ License-File: LICENSE
31
+ Requires-Dist: pygments>=2.19.2
32
+ Requires-Dist: rich>=14.3.2
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=9.0.0; extra == "dev"
35
+ Requires-Dist: build>=1.2.0; extra == "dev"
36
+ Requires-Dist: twine>=5.0.0; extra == "dev"
37
+ Requires-Dist: mypy>=1.19.1; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+ # CodeClone
41
+
42
+ [![PyPI](https://img.shields.io/pypi/v/codeclone.svg)](https://pypi.org/project/codeclone/)
43
+ [![Downloads](https://img.shields.io/pypi/dm/codeclone.svg)](https://pypi.org/project/codeclone/)
44
+ [![Python](https://img.shields.io/pypi/pyversions/codeclone.svg)](https://pypi.org/project/codeclone/)
45
+ [![License](https://img.shields.io/pypi/l/codeclone.svg)](LICENSE)
46
+
47
+ **CodeClone** is a Python code clone detector based on **normalized AST and control-flow graphs (CFG)**.
48
+ It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
49
+
50
+ CodeClone is designed to help teams:
51
+
52
+ - discover **structural and control-flow duplication**,
53
+ - identify architectural hotspots,
54
+ - prevent *new* duplication via CI and pre-commit hooks.
55
+
56
+ Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against renaming,
57
+ formatting, and minor refactoring.
58
+
59
+ ---
60
+
61
+ ## Why CodeClone?
62
+
63
+ Most existing tools detect *textual* duplication.
64
+ CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or architectural drift.
65
+
66
+ Typical use cases:
67
+
68
+ - duplicated service or orchestration logic across layers (API ↔ application),
69
+ - repeated validation or guard blocks,
70
+ - copy-pasted request / handler flows,
71
+ - duplicated control-flow logic in routers, handlers, or services.
72
+
73
+ ---
74
+
75
+ ## Features
76
+
77
+ ### Function-level clone detection (Type-2, CFG-based)
78
+
79
+ - Detects functions and methods with identical **control-flow structure**.
80
+ - Based on **Control Flow Graph (CFG)** fingerprinting.
81
+ - Robust to:
82
+   - variable renaming,
83
+   - constant changes,
84
+   - attribute renaming,
85
+   - formatting differences,
86
+   - docstrings and type annotations.
87
+ - Ideal for spotting architectural duplication across layers.
88
+
89
+ ### Block-level clone detection (Type-3-lite)
90
+
91
+ - Detects repeated **statement blocks** inside larger functions.
92
+ - Uses sliding windows over CFG-normalized statement sequences.
93
+ - Targets:
94
+   - validation blocks,
95
+   - guard clauses,
96
+   - repeated orchestration logic.
97
+ - Carefully filtered to reduce noise:
98
+   - no overlapping windows,
99
+   - no clones inside the same function,
100
+   - no `__init__` noise,
101
+   - size and statement-count thresholds.
102
+
103
+ ### Control-Flow Awareness (CFG v1)
104
+
105
+ - Each function is converted into a **Control Flow Graph**.
106
+ - CFG nodes contain normalized AST statements.
107
+ - CFG edges represent structural control flow:
108
+   - `if` / `else`
109
+   - `for` / `async for` / `while`
110
+   - `try` / `except` / `finally`
111
+   - `with` / `async with`
112
+   - `match` / `case` (Python 3.10+)
113
+ - Current CFG semantics (v1):
114
+   - `break` and `continue` are treated as statements (no jump targets),
115
+   - after-blocks are explicit and always present,
116
+   - focus is on **structural similarity**, not precise runtime semantics.
117
+
118
+ This design keeps clone detection **stable, deterministic, and low-noise**.
119
+
120
+ ### Low-noise by design
121
+
122
+ - AST + CFG normalization instead of token matching.
123
+ - Conservative defaults tuned for real-world Python projects.
124
+ - Explicit thresholds for size and statement count.
125
+ - Focus on *architectural duplication*, not micro-similarities.
126
+
127
+ ### CI-friendly baseline mode
128
+
129
+ - Establish a baseline of existing clones.
130
+ - Fail CI **only when new clones are introduced**.
131
+ - Safe for legacy codebases and incremental refactoring.
132
+
133
+ ---
134
+
135
+ ## Installation
136
+
137
+ ```bash
138
+ pip install codeclone
139
+ ```
140
+
141
+ Python **3.10+** is required.
142
+
143
+ ---
144
+
145
+ ## Quick Start
146
+
147
+ Run on a project:
148
+
149
+ ```bash
150
+ codeclone .
151
+ ```
152
+
153
+ This will:
154
+
155
+ - scan Python files,
156
+ - build CFGs for functions,
157
+ - detect function-level and block-level clones,
158
+ - print a summary to stdout.
159
+
160
+ Generate reports:
161
+
162
+ ```bash
163
+ codeclone . \
164
+ --json .cache/codeclone/report.json \
165
+ --text .cache/codeclone/report.txt
166
+ ```
167
+
168
+ Generate an HTML report:
169
+
170
+ ```bash
171
+ codeclone . --html .cache/codeclone/report.html
172
+ ```
173
+
174
+ ---
175
+
176
+ ## Baseline Workflow (Recommended)
177
+
178
+ ### 1. Create a baseline
179
+
180
+ Run once on your current codebase:
181
+
182
+ ```bash
183
+ codeclone . --update-baseline
184
+ ```
185
+
186
+ Commit the generated baseline file to the repository.
187
+
188
+ ### 2. Use in CI
189
+
190
+ ```bash
191
+ codeclone . --fail-on-new
192
+ ```
193
+
194
+ Behavior:
195
+
196
+ - ✅ existing clones are allowed,
197
+ - ❌ build fails if *new* clones appear,
198
+ - ✅ refactoring that removes duplication is always allowed.
199
+
200
+ ---
201
+
202
+ ## Using with pre-commit
203
+
204
+ ```yaml
205
+ repos:
205
+   - repo: local
206
+     hooks:
207
+       - id: codeclone
208
+         name: CodeClone
209
+         entry: codeclone
210
+         language: python
211
+         args: [".", "--fail-on-new"]
212
+         types: [python]
214
+ ```
215
+
216
+ ---
217
+
218
+ ## What CodeClone Is (and Is Not)
219
+
220
+ ### CodeClone **is**
221
+
222
+ - an architectural analysis tool,
223
+ - a duplication radar,
224
+ - a CI guard against copy-paste,
225
+ - a control-flow-aware clone detector.
226
+
227
+ ### CodeClone **is not**
228
+
229
+ - a linter,
230
+ - a formatter,
231
+ - a semantic equivalence prover,
232
+ - a runtime analyzer.
233
+
234
+ ---
235
+
236
+ ## How It Works (High Level)
237
+
238
+ 1. Parse Python source into AST.
239
+ 2. Normalize AST (names, constants, attributes, annotations).
240
+ 3. Build a **Control Flow Graph (CFG)** per function.
241
+ 4. Compute stable CFG fingerprints.
242
+ 5. Detect function-level and block-level clones.
243
+ 6. Apply conservative filters to suppress noise.
244
+
245
+ See the architectural overview:
246
+ - [docs/architecture.md](docs/architecture.md)
247
+
248
+ ---
249
+
250
+ ## Control Flow Graph (CFG)
251
+
252
+ Starting from **version 1.1.0**, CodeClone uses a **Control Flow Graph (CFG)**
253
+ to improve structural clone detection robustness.
254
+
255
+ The CFG is a **structural abstraction**, not a runtime execution model.
256
+
257
+ See full design and semantics:
258
+ - [docs/cfg.md](docs/cfg.md)
259
+
260
+ ---
261
+
262
+ ## License
263
+
264
+ MIT License
@@ -0,0 +1,225 @@
1
+ # CodeClone
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/codeclone.svg)](https://pypi.org/project/codeclone/)
4
+ [![Downloads](https://img.shields.io/pypi/dm/codeclone.svg)](https://pypi.org/project/codeclone/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/codeclone.svg)](https://pypi.org/project/codeclone/)
6
+ [![License](https://img.shields.io/pypi/l/codeclone.svg)](LICENSE)
7
+
8
+ **CodeClone** is a Python code clone detector based on **normalized AST and control-flow graphs (CFG)**.
9
+ It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
10
+
11
+ CodeClone is designed to help teams:
12
+
13
+ - discover **structural and control-flow duplication**,
14
+ - identify architectural hotspots,
15
+ - prevent *new* duplication via CI and pre-commit hooks.
16
+
17
+ Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against renaming,
18
+ formatting, and minor refactoring.
19
+
20
+ ---
21
+
22
+ ## Why CodeClone?
23
+
24
+ Most existing tools detect *textual* duplication.
25
+ CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or architectural drift.
26
+
27
+ Typical use cases:
28
+
29
+ - duplicated service or orchestration logic across layers (API ↔ application),
30
+ - repeated validation or guard blocks,
31
+ - copy-pasted request / handler flows,
32
+ - duplicated control-flow logic in routers, handlers, or services.
33
+
34
+ ---
35
+
36
+ ## Features
37
+
38
+ ### Function-level clone detection (Type-2, CFG-based)
39
+
40
+ - Detects functions and methods with identical **control-flow structure**.
41
+ - Based on **Control Flow Graph (CFG)** fingerprinting.
42
+ - Robust to:
43
+   - variable renaming,
44
+   - constant changes,
45
+   - attribute renaming,
46
+   - formatting differences,
47
+   - docstrings and type annotations.
48
+ - Ideal for spotting architectural duplication across layers.
49
+
50
+ ### Block-level clone detection (Type-3-lite)
51
+
52
+ - Detects repeated **statement blocks** inside larger functions.
53
+ - Uses sliding windows over CFG-normalized statement sequences.
54
+ - Targets:
55
+   - validation blocks,
56
+   - guard clauses,
57
+   - repeated orchestration logic.
58
+ - Carefully filtered to reduce noise:
59
+   - no overlapping windows,
60
+   - no clones inside the same function,
61
+   - no `__init__` noise,
62
+   - size and statement-count thresholds.
63
+
64
+ ### Control-Flow Awareness (CFG v1)
65
+
66
+ - Each function is converted into a **Control Flow Graph**.
67
+ - CFG nodes contain normalized AST statements.
68
+ - CFG edges represent structural control flow:
69
+   - `if` / `else`
70
+   - `for` / `async for` / `while`
71
+   - `try` / `except` / `finally`
72
+   - `with` / `async with`
73
+   - `match` / `case` (Python 3.10+)
74
+ - Current CFG semantics (v1):
75
+   - `break` and `continue` are treated as statements (no jump targets),
76
+   - after-blocks are explicit and always present,
77
+   - focus is on **structural similarity**, not precise runtime semantics.
78
+
79
+ This design keeps clone detection **stable, deterministic, and low-noise**.
80
+
81
+ ### Low-noise by design
82
+
83
+ - AST + CFG normalization instead of token matching.
84
+ - Conservative defaults tuned for real-world Python projects.
85
+ - Explicit thresholds for size and statement count.
86
+ - Focus on *architectural duplication*, not micro-similarities.
87
+
88
+ ### CI-friendly baseline mode
89
+
90
+ - Establish a baseline of existing clones.
91
+ - Fail CI **only when new clones are introduced**.
92
+ - Safe for legacy codebases and incremental refactoring.
93
+
94
+ ---
95
+
96
+ ## Installation
97
+
98
+ ```bash
99
+ pip install codeclone
100
+ ```
101
+
102
+ Python **3.10+** is required.
103
+
104
+ ---
105
+
106
+ ## Quick Start
107
+
108
+ Run on a project:
109
+
110
+ ```bash
111
+ codeclone .
112
+ ```
113
+
114
+ This will:
115
+
116
+ - scan Python files,
117
+ - build CFGs for functions,
118
+ - detect function-level and block-level clones,
119
+ - print a summary to stdout.
120
+
121
+ Generate reports:
122
+
123
+ ```bash
124
+ codeclone . \
125
+ --json .cache/codeclone/report.json \
126
+ --text .cache/codeclone/report.txt
127
+ ```
128
+
129
+ Generate an HTML report:
130
+
131
+ ```bash
132
+ codeclone . --html .cache/codeclone/report.html
133
+ ```
134
+
135
+ ---
136
+
137
+ ## Baseline Workflow (Recommended)
138
+
139
+ ### 1. Create a baseline
140
+
141
+ Run once on your current codebase:
142
+
143
+ ```bash
144
+ codeclone . --update-baseline
145
+ ```
146
+
147
+ Commit the generated baseline file to the repository.
148
+
149
+ ### 2. Use in CI
150
+
151
+ ```bash
152
+ codeclone . --fail-on-new
153
+ ```
154
+
155
+ Behavior:
156
+
157
+ - ✅ existing clones are allowed,
158
+ - ❌ build fails if *new* clones appear,
159
+ - ✅ refactoring that removes duplication is always allowed.
160
+
161
+ ---
162
+
163
+ ## Using with pre-commit
164
+
165
+ ```yaml
166
+ repos:
166
+   - repo: local
167
+     hooks:
168
+       - id: codeclone
169
+         name: CodeClone
170
+         entry: codeclone
171
+         language: python
172
+         args: [".", "--fail-on-new"]
173
+         types: [python]
175
+ ```
176
+
177
+ ---
178
+
179
+ ## What CodeClone Is (and Is Not)
180
+
181
+ ### CodeClone **is**
182
+
183
+ - an architectural analysis tool,
184
+ - a duplication radar,
185
+ - a CI guard against copy-paste,
186
+ - a control-flow-aware clone detector.
187
+
188
+ ### CodeClone **is not**
189
+
190
+ - a linter,
191
+ - a formatter,
192
+ - a semantic equivalence prover,
193
+ - a runtime analyzer.
194
+
195
+ ---
196
+
197
+ ## How It Works (High Level)
198
+
199
+ 1. Parse Python source into AST.
200
+ 2. Normalize AST (names, constants, attributes, annotations).
201
+ 3. Build a **Control Flow Graph (CFG)** per function.
202
+ 4. Compute stable CFG fingerprints.
203
+ 5. Detect function-level and block-level clones.
204
+ 6. Apply conservative filters to suppress noise.
205
+
206
+ See the architectural overview:
207
+ - [docs/architecture.md](docs/architecture.md)
208
+
209
+ ---
210
+
211
+ ## Control Flow Graph (CFG)
212
+
213
+ Starting from **version 1.1.0**, CodeClone uses a **Control Flow Graph (CFG)**
214
+ to improve structural clone detection robustness.
215
+
216
+ The CFG is a **structural abstraction**, not a runtime execution model.
217
+
218
+ See full design and semantics:
219
+ - [docs/cfg.md](docs/cfg.md)
220
+
221
+ ---
222
+
223
+ ## License
224
+
225
+ MIT License
@@ -0,0 +1,16 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from importlib.metadata import version, PackageNotFoundError
10
+
11
+ try:
12
+ __version__ = version("codeclone")
13
+ except PackageNotFoundError:
14
+ __version__ = "dev"
15
+
16
+ __all__ = ["__version__"]
@@ -1,23 +1,33 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import json
4
12
  from pathlib import Path
5
- from typing import Set
6
13
 
7
14
 
8
15
  class Baseline:
9
- def __init__(self, path: str):
16
+ def __init__(self, path: str | Path):
10
17
  self.path = Path(path)
11
- self.functions: Set[str] = set()
12
- self.blocks: Set[str] = set()
18
+ self.functions: set[str] = set()
19
+ self.blocks: set[str] = set()
13
20
 
14
21
  def load(self) -> None:
15
22
  if not self.path.exists():
16
23
  return
17
24
 
18
- data = json.loads(self.path.read_text("utf-8"))
19
- self.functions = set(data.get("functions", []))
20
- self.blocks = set(data.get("blocks", []))
25
+ try:
26
+ data = json.loads(self.path.read_text("utf-8"))
27
+ self.functions = set(data.get("functions", []))
28
+ self.blocks = set(data.get("blocks", []))
29
+ except json.JSONDecodeError as e:
30
+ raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
21
31
 
22
32
  def save(self) -> None:
23
33
  self.path.parent.mkdir(parents=True, exist_ok=True)
@@ -34,8 +44,10 @@ class Baseline:
34
44
  )
35
45
 
36
46
  @staticmethod
37
- def from_groups(func_groups: dict, block_groups: dict) -> "Baseline":
38
- bl = Baseline("")
47
+ def from_groups(
48
+ func_groups: dict, block_groups: dict, path: str | Path = ""
49
+ ) -> "Baseline":
50
+ bl = Baseline(path)
39
51
  bl.functions = set(func_groups.keys())
40
52
  bl.blocks = set(block_groups.keys())
41
53
  return bl
@@ -1,3 +1,11 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import ast
@@ -5,8 +13,9 @@ import hashlib
5
13
 
6
14
  from .normalize import NormalizationConfig, AstNormalizer
7
15
 
16
+
8
17
  def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
9
18
  normalizer = AstNormalizer(cfg)
10
19
  stmt = ast.fix_missing_locations(normalizer.visit(stmt))
11
20
  dump = ast.dump(stmt, annotate_fields=True, include_attributes=False)
12
- return hashlib.sha1(dump.encode("utf-8")).hexdigest()
21
+ return hashlib.sha1(dump.encode("utf-8")).hexdigest()
@@ -1,3 +1,11 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import ast
@@ -18,13 +26,13 @@ class BlockUnit:
18
26
 
19
27
 
20
28
  def extract_blocks(
21
- func_node: ast.AST,
22
- *,
23
- filepath: str,
24
- qualname: str,
25
- cfg: NormalizationConfig,
26
- block_size: int,
27
- max_blocks: int,
29
+ func_node: ast.AST,
30
+ *,
31
+ filepath: str,
32
+ qualname: str,
33
+ cfg: NormalizationConfig,
34
+ block_size: int,
35
+ max_blocks: int,
28
36
  ) -> list[BlockUnit]:
29
37
  body = getattr(func_node, "body", None)
30
38
  if not isinstance(body, list) or len(body) < block_size:
@@ -45,16 +53,18 @@ def extract_blocks(
45
53
  if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
46
54
  continue
47
55
 
48
- bh = "|".join(stmt_hashes[i:i + block_size])
56
+ bh = "|".join(stmt_hashes[i : i + block_size])
49
57
 
50
- blocks.append(BlockUnit(
51
- block_hash=bh,
52
- filepath=filepath,
53
- qualname=qualname,
54
- start_line=start,
55
- end_line=end,
56
- size=block_size,
57
- ))
58
+ blocks.append(
59
+ BlockUnit(
60
+ block_hash=bh,
61
+ filepath=filepath,
62
+ qualname=qualname,
63
+ start_line=start,
64
+ end_line=end,
65
+ size=block_size,
66
+ )
67
+ )
58
68
 
59
69
  last_start = start
60
70
  if len(blocks) >= max_blocks:
@@ -1,20 +1,32 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import json
4
12
  import os
5
13
  from dataclasses import asdict
6
14
  from pathlib import Path
7
- from typing import Optional
15
+ from typing import Any, Optional
8
16
 
9
17
 
10
18
  class Cache:
11
- def __init__(self, path: str):
19
+ def __init__(self, path: str | Path):
12
20
  self.path = Path(path)
13
- self.data: dict = {"files": {}}
21
+ self.data: dict[str, Any] = {"files": {}}
14
22
 
15
23
  def load(self) -> None:
16
24
  if self.path.exists():
17
- self.data = json.loads(self.path.read_text("utf-8"))
25
+ try:
26
+ self.data = json.loads(self.path.read_text("utf-8"))
27
+ except json.JSONDecodeError:
28
+ # If cache is corrupted, start fresh
29
+ self.data = {"files": {}}
18
30
 
19
31
  def save(self) -> None:
20
32
  self.path.parent.mkdir(parents=True, exist_ok=True)
@@ -23,10 +35,12 @@ class Cache:
23
35
  "utf-8",
24
36
  )
25
37
 
26
- def get_file_entry(self, filepath: str) -> Optional[dict]:
38
+ def get_file_entry(self, filepath: str) -> Optional[dict[str, Any]]:
27
39
  return self.data.get("files", {}).get(filepath)
28
40
 
29
- def put_file_entry(self, filepath: str, stat_sig: dict, units, blocks) -> None:
41
+ def put_file_entry(
42
+ self, filepath: str, stat_sig: dict[str, Any], units: list, blocks: list
43
+ ) -> None:
30
44
  self.data.setdefault("files", {})[filepath] = {
31
45
  "stat": stat_sig,
32
46
  "units": [asdict(u) for u in units],