codeclone 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeclone-1.2.0 → codeclone-1.2.1}/PKG-INFO +53 -35
- {codeclone-1.2.0 → codeclone-1.2.1}/README.md +49 -32
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/__init__.py +1 -1
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/baseline.py +33 -7
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/blockhash.py +1 -1
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/blocks.py +4 -3
- codeclone-1.2.1/codeclone/cache.py +187 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/cfg.py +53 -128
- codeclone-1.2.1/codeclone/cfg_model.py +47 -0
- codeclone-1.2.1/codeclone/cli.py +603 -0
- codeclone-1.2.1/codeclone/errors.py +27 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/extractor.py +101 -24
- codeclone-1.2.1/codeclone/html_report.py +492 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/normalize.py +21 -14
- codeclone-1.2.1/codeclone/py.typed +0 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/report.py +23 -12
- codeclone-1.2.1/codeclone/scanner.py +111 -0
- codeclone-1.2.1/codeclone/templates.py +1262 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/PKG-INFO +53 -35
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/SOURCES.txt +15 -1
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/requires.txt +2 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/pyproject.toml +36 -4
- {codeclone-1.2.0 → codeclone-1.2.1}/tests/test_baseline.py +29 -10
- codeclone-1.2.1/tests/test_blockhash.py +11 -0
- codeclone-1.2.1/tests/test_blocks.py +107 -0
- codeclone-1.2.1/tests/test_cache.py +198 -0
- codeclone-1.2.1/tests/test_cfg.py +417 -0
- codeclone-1.2.1/tests/test_cfg_model.py +18 -0
- codeclone-1.2.1/tests/test_cli_inprocess.py +812 -0
- codeclone-1.2.1/tests/test_cli_main_guard.py +17 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/tests/test_cli_smoke.py +9 -6
- codeclone-1.2.1/tests/test_cli_unit.py +69 -0
- codeclone-1.2.1/tests/test_extractor.py +212 -0
- codeclone-1.2.1/tests/test_fingerprint.py +15 -0
- codeclone-1.2.1/tests/test_html_report.py +216 -0
- codeclone-1.2.1/tests/test_init.py +26 -0
- codeclone-1.2.1/tests/test_normalize.py +147 -0
- codeclone-1.2.1/tests/test_report.py +67 -0
- codeclone-1.2.1/tests/test_scanner_extra.py +165 -0
- codeclone-1.2.1/tests/test_security.py +44 -0
- codeclone-1.2.0/codeclone/cache.py +0 -56
- codeclone-1.2.0/codeclone/cli.py +0 -409
- codeclone-1.2.0/codeclone/html_report.py +0 -936
- codeclone-1.2.0/codeclone/scanner.py +0 -48
- codeclone-1.2.0/tests/test_blocks.py +0 -32
- codeclone-1.2.0/tests/test_cfg.py +0 -176
- codeclone-1.2.0/tests/test_extractor.py +0 -49
- codeclone-1.2.0/tests/test_html_report.py +0 -44
- codeclone-1.2.0/tests/test_normalize.py +0 -22
- codeclone-1.2.0/tests/test_report.py +0 -24
- {codeclone-1.2.0 → codeclone-1.2.1}/LICENSE +0 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone/fingerprint.py +0 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/dependency_links.txt +0 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/entry_points.txt +0 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/codeclone.egg-info/top_level.txt +0 -0
- {codeclone-1.2.0 → codeclone-1.2.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeclone
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
|
|
5
5
|
Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
6
6
|
Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
@@ -10,11 +10,10 @@ Project-URL: Repository, https://github.com/orenlab/codeclone
|
|
|
10
10
|
Project-URL: Issues, https://github.com/orenlab/codeclone/issues
|
|
11
11
|
Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
|
|
12
12
|
Project-URL: Documentation, https://github.com/orenlab/codeclone/tree/main/docs
|
|
13
|
-
Keywords: python,ast,code-clone,duplication,static-analysis,ci
|
|
13
|
+
Keywords: python,ast,cfg,code-clone,duplication,static-analysis,architecture,control-flow,ci
|
|
14
14
|
Classifier: Development Status :: 5 - Production/Stable
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
16
16
|
Classifier: Topic :: Software Development :: Quality Assurance
|
|
17
|
-
Classifier: Topic :: Software Development :: Code Generators
|
|
18
17
|
Classifier: Topic :: Software Development :: Testing
|
|
19
18
|
Classifier: Typing :: Typed
|
|
20
19
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -32,19 +31,22 @@ Requires-Dist: pygments>=2.19.2
|
|
|
32
31
|
Requires-Dist: rich>=14.3.2
|
|
33
32
|
Provides-Extra: dev
|
|
34
33
|
Requires-Dist: pytest>=9.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov>=6.1.0; extra == "dev"
|
|
35
35
|
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
36
36
|
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
37
37
|
Requires-Dist: mypy>=1.19.1; extra == "dev"
|
|
38
|
+
Requires-Dist: ruff>=0.12.0; extra == "dev"
|
|
38
39
|
Dynamic: license-file
|
|
39
40
|
|
|
40
41
|
# CodeClone
|
|
41
42
|
|
|
42
43
|
[](https://pypi.org/project/codeclone/)
|
|
43
44
|
[](https://pypi.org/project/codeclone/)
|
|
45
|
+
[](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
|
|
44
46
|
[](https://pypi.org/project/codeclone/)
|
|
45
47
|
[](LICENSE)
|
|
46
48
|
|
|
47
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and
|
|
49
|
+
**CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
|
|
48
50
|
It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
|
|
49
51
|
|
|
50
52
|
CodeClone is designed to help teams:
|
|
@@ -53,15 +55,16 @@ CodeClone is designed to help teams:
|
|
|
53
55
|
- identify architectural hotspots,
|
|
54
56
|
- prevent *new* duplication via CI and pre-commit hooks.
|
|
55
57
|
|
|
56
|
-
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
57
|
-
formatting, and minor refactoring.
|
|
58
|
+
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
59
|
+
renaming, formatting, and minor refactoring.
|
|
58
60
|
|
|
59
61
|
---
|
|
60
62
|
|
|
61
63
|
## Why CodeClone?
|
|
62
64
|
|
|
63
65
|
Most existing tools detect *textual* duplication.
|
|
64
|
-
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
66
|
+
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
67
|
+
architectural drift.
|
|
65
68
|
|
|
66
69
|
Typical use cases:
|
|
67
70
|
|
|
@@ -79,11 +82,11 @@ Typical use cases:
|
|
|
79
82
|
- Detects functions and methods with identical **control-flow structure**.
|
|
80
83
|
- Based on **Control Flow Graph (CFG)** fingerprinting.
|
|
81
84
|
- Robust to:
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
85
|
+
- variable renaming,
|
|
86
|
+
- constant changes,
|
|
87
|
+
- attribute renaming,
|
|
88
|
+
- formatting differences,
|
|
89
|
+
- docstrings and type annotations.
|
|
87
90
|
- Ideal for spotting architectural duplication across layers.
|
|
88
91
|
|
|
89
92
|
### Block-level clone detection (Type-3-lite)
|
|
@@ -91,29 +94,29 @@ Typical use cases:
|
|
|
91
94
|
- Detects repeated **statement blocks** inside larger functions.
|
|
92
95
|
- Uses sliding windows over CFG-normalized statement sequences.
|
|
93
96
|
- Targets:
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
+
- validation blocks,
|
|
98
|
+
- guard clauses,
|
|
99
|
+
- repeated orchestration logic.
|
|
97
100
|
- Carefully filtered to reduce noise:
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
101
|
+
- no overlapping windows,
|
|
102
|
+
- no clones inside the same function,
|
|
103
|
+
- no `__init__` noise,
|
|
104
|
+
- size and statement-count thresholds.
|
|
102
105
|
|
|
103
106
|
### Control-Flow Awareness (CFG v1)
|
|
104
107
|
|
|
105
108
|
- Each function is converted into a **Control Flow Graph**.
|
|
106
109
|
- CFG nodes contain normalized AST statements.
|
|
107
110
|
- CFG edges represent structural control flow:
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
111
|
+
- `if` / `else`
|
|
112
|
+
- `for` / `async for` / `while`
|
|
113
|
+
- `try` / `except` / `finally`
|
|
114
|
+
- `with` / `async with`
|
|
115
|
+
- `match` / `case` (Python 3.10+)
|
|
113
116
|
- Current CFG semantics (v1):
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
+
- `break` and `continue` are treated as statements (no jump targets),
|
|
118
|
+
- after-blocks are explicit and always present,
|
|
119
|
+
- focus is on **structural similarity**, not precise runtime semantics.
|
|
117
120
|
|
|
118
121
|
This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
119
122
|
|
|
@@ -122,6 +125,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
|
122
125
|
- AST + CFG normalization instead of token matching.
|
|
123
126
|
- Conservative defaults tuned for real-world Python projects.
|
|
124
127
|
- Explicit thresholds for size and statement count.
|
|
128
|
+
- No probabilistic scoring or heuristic similarity thresholds.
|
|
125
129
|
- Focus on *architectural duplication*, not micro-similarities.
|
|
126
130
|
|
|
127
131
|
### CI-friendly baseline mode
|
|
@@ -188,14 +192,26 @@ Commit the generated baseline file to the repository.
|
|
|
188
192
|
### 2. Use in CI
|
|
189
193
|
|
|
190
194
|
```bash
|
|
191
|
-
codeclone . --fail-on-new
|
|
195
|
+
codeclone . --fail-on-new --no-progress
|
|
192
196
|
```
|
|
193
197
|
|
|
194
198
|
Behavior:
|
|
195
199
|
|
|
196
|
-
-
|
|
197
|
-
-
|
|
198
|
-
-
|
|
200
|
+
- existing clones are allowed,
|
|
201
|
+
- the build fails if *new* clones appear,
|
|
202
|
+
- refactoring that removes duplication is always allowed.
|
|
203
|
+
|
|
204
|
+
`--fail-on-new` exits with a non-zero code when new clones are detected.
|
|
205
|
+
|
|
206
|
+
### Python Version Consistency for Baseline Checks
|
|
207
|
+
|
|
208
|
+
Due to inherent differences in Python’s AST between interpreter versions, baseline
|
|
209
|
+
generation and verification must be performed using the same Python version.
|
|
210
|
+
|
|
211
|
+
This ensures deterministic and reproducible clone detection results.
|
|
212
|
+
|
|
213
|
+
CI checks therefore pin baseline verification to a single Python version, while the
|
|
214
|
+
test matrix continues to validate compatibility across Python 3.10–3.14.
|
|
199
215
|
|
|
200
216
|
---
|
|
201
217
|
|
|
@@ -203,14 +219,14 @@ Behavior:
|
|
|
203
219
|
|
|
204
220
|
```yaml
|
|
205
221
|
repos:
|
|
206
|
-
-
|
|
222
|
+
- repo: local
|
|
207
223
|
hooks:
|
|
208
|
-
|
|
224
|
+
- id: codeclone
|
|
209
225
|
name: CodeClone
|
|
210
226
|
entry: codeclone
|
|
211
227
|
language: python
|
|
212
|
-
args: [".", "--fail-on-new"]
|
|
213
|
-
types: [python]
|
|
228
|
+
args: [ ".", "--fail-on-new" ]
|
|
229
|
+
types: [ python ]
|
|
214
230
|
```
|
|
215
231
|
|
|
216
232
|
---
|
|
@@ -243,6 +259,7 @@ repos:
|
|
|
243
259
|
6. Apply conservative filters to suppress noise.
|
|
244
260
|
|
|
245
261
|
See the architectural overview:
|
|
262
|
+
|
|
246
263
|
- [docs/architecture.md](docs/architecture.md)
|
|
247
264
|
|
|
248
265
|
---
|
|
@@ -255,6 +272,7 @@ to improve structural clone detection robustness.
|
|
|
255
272
|
The CFG is a **structural abstraction**, not a runtime execution model.
|
|
256
273
|
|
|
257
274
|
See full design and semantics:
|
|
275
|
+
|
|
258
276
|
- [docs/cfg.md](docs/cfg.md)
|
|
259
277
|
|
|
260
278
|
---
|
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://pypi.org/project/codeclone/)
|
|
4
4
|
[](https://pypi.org/project/codeclone/)
|
|
5
|
+
[](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
|
|
5
6
|
[](https://pypi.org/project/codeclone/)
|
|
6
7
|
[](LICENSE)
|
|
7
8
|
|
|
8
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and
|
|
9
|
+
**CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
|
|
9
10
|
It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
|
|
10
11
|
|
|
11
12
|
CodeClone is designed to help teams:
|
|
@@ -14,15 +15,16 @@ CodeClone is designed to help teams:
|
|
|
14
15
|
- identify architectural hotspots,
|
|
15
16
|
- prevent *new* duplication via CI and pre-commit hooks.
|
|
16
17
|
|
|
17
|
-
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
18
|
-
formatting, and minor refactoring.
|
|
18
|
+
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
19
|
+
renaming, formatting, and minor refactoring.
|
|
19
20
|
|
|
20
21
|
---
|
|
21
22
|
|
|
22
23
|
## Why CodeClone?
|
|
23
24
|
|
|
24
25
|
Most existing tools detect *textual* duplication.
|
|
25
|
-
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
26
|
+
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
27
|
+
architectural drift.
|
|
26
28
|
|
|
27
29
|
Typical use cases:
|
|
28
30
|
|
|
@@ -40,11 +42,11 @@ Typical use cases:
|
|
|
40
42
|
- Detects functions and methods with identical **control-flow structure**.
|
|
41
43
|
- Based on **Control Flow Graph (CFG)** fingerprinting.
|
|
42
44
|
- Robust to:
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
- variable renaming,
|
|
46
|
+
- constant changes,
|
|
47
|
+
- attribute renaming,
|
|
48
|
+
- formatting differences,
|
|
49
|
+
- docstrings and type annotations.
|
|
48
50
|
- Ideal for spotting architectural duplication across layers.
|
|
49
51
|
|
|
50
52
|
### Block-level clone detection (Type-3-lite)
|
|
@@ -52,29 +54,29 @@ Typical use cases:
|
|
|
52
54
|
- Detects repeated **statement blocks** inside larger functions.
|
|
53
55
|
- Uses sliding windows over CFG-normalized statement sequences.
|
|
54
56
|
- Targets:
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
57
|
+
- validation blocks,
|
|
58
|
+
- guard clauses,
|
|
59
|
+
- repeated orchestration logic.
|
|
58
60
|
- Carefully filtered to reduce noise:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
61
|
+
- no overlapping windows,
|
|
62
|
+
- no clones inside the same function,
|
|
63
|
+
- no `__init__` noise,
|
|
64
|
+
- size and statement-count thresholds.
|
|
63
65
|
|
|
64
66
|
### Control-Flow Awareness (CFG v1)
|
|
65
67
|
|
|
66
68
|
- Each function is converted into a **Control Flow Graph**.
|
|
67
69
|
- CFG nodes contain normalized AST statements.
|
|
68
70
|
- CFG edges represent structural control flow:
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
71
|
+
- `if` / `else`
|
|
72
|
+
- `for` / `async for` / `while`
|
|
73
|
+
- `try` / `except` / `finally`
|
|
74
|
+
- `with` / `async with`
|
|
75
|
+
- `match` / `case` (Python 3.10+)
|
|
74
76
|
- Current CFG semantics (v1):
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
77
|
+
- `break` and `continue` are treated as statements (no jump targets),
|
|
78
|
+
- after-blocks are explicit and always present,
|
|
79
|
+
- focus is on **structural similarity**, not precise runtime semantics.
|
|
78
80
|
|
|
79
81
|
This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
80
82
|
|
|
@@ -83,6 +85,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
|
83
85
|
- AST + CFG normalization instead of token matching.
|
|
84
86
|
- Conservative defaults tuned for real-world Python projects.
|
|
85
87
|
- Explicit thresholds for size and statement count.
|
|
88
|
+
- No probabilistic scoring or heuristic similarity thresholds.
|
|
86
89
|
- Focus on *architectural duplication*, not micro-similarities.
|
|
87
90
|
|
|
88
91
|
### CI-friendly baseline mode
|
|
@@ -149,14 +152,26 @@ Commit the generated baseline file to the repository.
|
|
|
149
152
|
### 2. Use in CI
|
|
150
153
|
|
|
151
154
|
```bash
|
|
152
|
-
codeclone . --fail-on-new
|
|
155
|
+
codeclone . --fail-on-new --no-progress
|
|
153
156
|
```
|
|
154
157
|
|
|
155
158
|
Behavior:
|
|
156
159
|
|
|
157
|
-
-
|
|
158
|
-
-
|
|
159
|
-
-
|
|
160
|
+
- existing clones are allowed,
|
|
161
|
+
- the build fails if *new* clones appear,
|
|
162
|
+
- refactoring that removes duplication is always allowed.
|
|
163
|
+
|
|
164
|
+
`--fail-on-new` exits with a non-zero code when new clones are detected.
|
|
165
|
+
|
|
166
|
+
### Python Version Consistency for Baseline Checks
|
|
167
|
+
|
|
168
|
+
Due to inherent differences in Python’s AST between interpreter versions, baseline
|
|
169
|
+
generation and verification must be performed using the same Python version.
|
|
170
|
+
|
|
171
|
+
This ensures deterministic and reproducible clone detection results.
|
|
172
|
+
|
|
173
|
+
CI checks therefore pin baseline verification to a single Python version, while the
|
|
174
|
+
test matrix continues to validate compatibility across Python 3.10–3.14.
|
|
160
175
|
|
|
161
176
|
---
|
|
162
177
|
|
|
@@ -164,14 +179,14 @@ Behavior:
|
|
|
164
179
|
|
|
165
180
|
```yaml
|
|
166
181
|
repos:
|
|
167
|
-
-
|
|
182
|
+
- repo: local
|
|
168
183
|
hooks:
|
|
169
|
-
|
|
184
|
+
- id: codeclone
|
|
170
185
|
name: CodeClone
|
|
171
186
|
entry: codeclone
|
|
172
187
|
language: python
|
|
173
|
-
args: [".", "--fail-on-new"]
|
|
174
|
-
types: [python]
|
|
188
|
+
args: [ ".", "--fail-on-new" ]
|
|
189
|
+
types: [ python ]
|
|
175
190
|
```
|
|
176
191
|
|
|
177
192
|
---
|
|
@@ -204,6 +219,7 @@ repos:
|
|
|
204
219
|
6. Apply conservative filters to suppress noise.
|
|
205
220
|
|
|
206
221
|
See the architectural overview:
|
|
222
|
+
|
|
207
223
|
- [docs/architecture.md](docs/architecture.md)
|
|
208
224
|
|
|
209
225
|
---
|
|
@@ -216,6 +232,7 @@ to improve structural clone detection robustness.
|
|
|
216
232
|
The CFG is a **structural abstraction**, not a runtime execution model.
|
|
217
233
|
|
|
218
234
|
See full design and semantics:
|
|
235
|
+
|
|
219
236
|
- [docs/cfg.md](docs/cfg.md)
|
|
220
237
|
|
|
221
238
|
---
|
|
@@ -9,14 +9,19 @@ Licensed under the MIT License.
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import json
|
|
12
|
+
from collections.abc import Mapping
|
|
12
13
|
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class Baseline:
|
|
18
|
+
__slots__ = ("blocks", "functions", "path", "python_version")
|
|
19
|
+
|
|
16
20
|
def __init__(self, path: str | Path):
|
|
17
21
|
self.path = Path(path)
|
|
18
22
|
self.functions: set[str] = set()
|
|
19
23
|
self.blocks: set[str] = set()
|
|
24
|
+
self.python_version: str | None = None
|
|
20
25
|
|
|
21
26
|
def load(self) -> None:
|
|
22
27
|
if not self.path.exists():
|
|
@@ -26,6 +31,10 @@ class Baseline:
|
|
|
26
31
|
data = json.loads(self.path.read_text("utf-8"))
|
|
27
32
|
self.functions = set(data.get("functions", []))
|
|
28
33
|
self.blocks = set(data.get("blocks", []))
|
|
34
|
+
python_version = data.get("python_version")
|
|
35
|
+
self.python_version = (
|
|
36
|
+
python_version if isinstance(python_version, str) else None
|
|
37
|
+
)
|
|
29
38
|
except json.JSONDecodeError as e:
|
|
30
39
|
raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
|
|
31
40
|
|
|
@@ -33,10 +42,7 @@ class Baseline:
|
|
|
33
42
|
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
34
43
|
self.path.write_text(
|
|
35
44
|
json.dumps(
|
|
36
|
-
|
|
37
|
-
"functions": sorted(self.functions),
|
|
38
|
-
"blocks": sorted(self.blocks),
|
|
39
|
-
},
|
|
45
|
+
_baseline_payload(self.functions, self.blocks, self.python_version),
|
|
40
46
|
indent=2,
|
|
41
47
|
ensure_ascii=False,
|
|
42
48
|
),
|
|
@@ -45,14 +51,34 @@ class Baseline:
|
|
|
45
51
|
|
|
46
52
|
@staticmethod
|
|
47
53
|
def from_groups(
|
|
48
|
-
func_groups:
|
|
49
|
-
|
|
54
|
+
func_groups: Mapping[str, object],
|
|
55
|
+
block_groups: Mapping[str, object],
|
|
56
|
+
path: str | Path = "",
|
|
57
|
+
python_version: str | None = None,
|
|
58
|
+
) -> Baseline:
|
|
50
59
|
bl = Baseline(path)
|
|
51
60
|
bl.functions = set(func_groups.keys())
|
|
52
61
|
bl.blocks = set(block_groups.keys())
|
|
62
|
+
bl.python_version = python_version
|
|
53
63
|
return bl
|
|
54
64
|
|
|
55
|
-
def diff(
|
|
65
|
+
def diff(
|
|
66
|
+
self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
|
|
67
|
+
) -> tuple[set[str], set[str]]:
|
|
56
68
|
new_funcs = set(func_groups.keys()) - self.functions
|
|
57
69
|
new_blocks = set(block_groups.keys()) - self.blocks
|
|
58
70
|
return new_funcs, new_blocks
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _baseline_payload(
|
|
74
|
+
functions: set[str],
|
|
75
|
+
blocks: set[str],
|
|
76
|
+
python_version: str | None,
|
|
77
|
+
) -> dict[str, Any]:
|
|
78
|
+
payload: dict[str, Any] = {
|
|
79
|
+
"functions": sorted(functions),
|
|
80
|
+
"blocks": sorted(blocks),
|
|
81
|
+
}
|
|
82
|
+
if python_version:
|
|
83
|
+
payload["python_version"] = python_version
|
|
84
|
+
return payload
|
|
@@ -11,7 +11,7 @@ from __future__ import annotations
|
|
|
11
11
|
import ast
|
|
12
12
|
import hashlib
|
|
13
13
|
|
|
14
|
-
from .normalize import
|
|
14
|
+
from .normalize import AstNormalizer, NormalizationConfig
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
|
|
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
|
|
|
15
15
|
from .normalize import NormalizationConfig
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
@dataclass(frozen=True)
|
|
18
|
+
@dataclass(frozen=True, slots=True)
|
|
19
19
|
class BlockUnit:
|
|
20
20
|
block_hash: str
|
|
21
21
|
filepath: str
|
|
@@ -42,7 +42,8 @@ def extract_blocks(
|
|
|
42
42
|
|
|
43
43
|
blocks: list[BlockUnit] = []
|
|
44
44
|
last_start: int | None = None
|
|
45
|
-
|
|
45
|
+
# Allow some overlap (50%), but at least 3 lines apart
|
|
46
|
+
min_line_distance = max(block_size // 2, 3)
|
|
46
47
|
|
|
47
48
|
for i in range(len(stmt_hashes) - block_size + 1):
|
|
48
49
|
start = getattr(body[i], "lineno", None)
|
|
@@ -50,7 +51,7 @@ def extract_blocks(
|
|
|
50
51
|
if not start or not end:
|
|
51
52
|
continue
|
|
52
53
|
|
|
53
|
-
if last_start is not None and start - last_start <
|
|
54
|
+
if last_start is not None and start - last_start < min_line_distance:
|
|
54
55
|
continue
|
|
55
56
|
|
|
56
57
|
bh = "|".join(stmt_hashes[i : i + block_size])
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import hmac
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import secrets
|
|
16
|
+
from collections.abc import Mapping
|
|
17
|
+
from dataclasses import asdict
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import TYPE_CHECKING, Any, TypedDict, cast
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from .blocks import BlockUnit
|
|
23
|
+
from .extractor import Unit
|
|
24
|
+
|
|
25
|
+
from .errors import CacheError
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FileStat(TypedDict):
|
|
29
|
+
mtime_ns: int
|
|
30
|
+
size: int
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class UnitDict(TypedDict):
|
|
34
|
+
qualname: str
|
|
35
|
+
filepath: str
|
|
36
|
+
start_line: int
|
|
37
|
+
end_line: int
|
|
38
|
+
loc: int
|
|
39
|
+
stmt_count: int
|
|
40
|
+
fingerprint: str
|
|
41
|
+
loc_bucket: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class BlockDict(TypedDict):
|
|
45
|
+
block_hash: str
|
|
46
|
+
filepath: str
|
|
47
|
+
qualname: str
|
|
48
|
+
start_line: int
|
|
49
|
+
end_line: int
|
|
50
|
+
size: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class CacheEntry(TypedDict):
|
|
54
|
+
stat: FileStat
|
|
55
|
+
units: list[UnitDict]
|
|
56
|
+
blocks: list[BlockDict]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class CacheData(TypedDict):
|
|
60
|
+
version: str
|
|
61
|
+
files: dict[str, CacheEntry]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Cache:
|
|
65
|
+
__slots__ = ("data", "load_warning", "path", "secret")
|
|
66
|
+
CACHE_VERSION = "1.0"
|
|
67
|
+
|
|
68
|
+
def __init__(self, path: str | Path):
|
|
69
|
+
self.path = Path(path)
|
|
70
|
+
self.data: CacheData = {"version": self.CACHE_VERSION, "files": {}}
|
|
71
|
+
self.secret = self._load_secret()
|
|
72
|
+
self.load_warning: str | None = None
|
|
73
|
+
|
|
74
|
+
def _load_secret(self) -> bytes:
|
|
75
|
+
"""Load or create cache signing secret."""
|
|
76
|
+
# Store secret in the same directory as the cache file, named .cache_secret
|
|
77
|
+
# If cache is at ~/.cache/codeclone/cache.json, secret is
|
|
78
|
+
# ~/.cache/codeclone/.cache_secret
|
|
79
|
+
secret_path = self.path.parent / ".cache_secret"
|
|
80
|
+
if secret_path.exists():
|
|
81
|
+
return secret_path.read_bytes()
|
|
82
|
+
else:
|
|
83
|
+
secret = secrets.token_bytes(32)
|
|
84
|
+
try:
|
|
85
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
86
|
+
secret_path.write_bytes(secret)
|
|
87
|
+
# Set restrictive permissions on secret file (Unix only)
|
|
88
|
+
if os.name == "posix":
|
|
89
|
+
secret_path.chmod(0o600)
|
|
90
|
+
except OSError:
|
|
91
|
+
pass
|
|
92
|
+
return secret
|
|
93
|
+
|
|
94
|
+
def _sign_data(self, data: Mapping[str, Any]) -> str:
|
|
95
|
+
"""Create HMAC signature of cache data."""
|
|
96
|
+
# Sort keys for deterministic JSON serialization
|
|
97
|
+
data_str = json.dumps(data, sort_keys=True)
|
|
98
|
+
return hmac.new(self.secret, data_str.encode(), hashlib.sha256).hexdigest()
|
|
99
|
+
|
|
100
|
+
def load(self) -> None:
|
|
101
|
+
if not self.path.exists():
|
|
102
|
+
return
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
raw = json.loads(self.path.read_text("utf-8"))
|
|
106
|
+
stored_sig = raw.get("_signature")
|
|
107
|
+
|
|
108
|
+
# Extract data without signature for verification
|
|
109
|
+
data = {k: v for k, v in raw.items() if k != "_signature"}
|
|
110
|
+
|
|
111
|
+
# Verify signature
|
|
112
|
+
expected_sig = self._sign_data(data)
|
|
113
|
+
if stored_sig != expected_sig:
|
|
114
|
+
self.load_warning = "Cache signature mismatch; ignoring cache."
|
|
115
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
116
|
+
return
|
|
117
|
+
|
|
118
|
+
if data.get("version") != self.CACHE_VERSION:
|
|
119
|
+
self.load_warning = (
|
|
120
|
+
"Cache version mismatch "
|
|
121
|
+
f"(found {data.get('version')}); ignoring cache."
|
|
122
|
+
)
|
|
123
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
124
|
+
return
|
|
125
|
+
|
|
126
|
+
# Basic structure check
|
|
127
|
+
if not isinstance(data.get("files"), dict):
|
|
128
|
+
self.load_warning = "Cache format invalid; ignoring cache."
|
|
129
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
130
|
+
return
|
|
131
|
+
|
|
132
|
+
self.data = cast(CacheData, data)
|
|
133
|
+
self.load_warning = None
|
|
134
|
+
|
|
135
|
+
except (json.JSONDecodeError, ValueError):
|
|
136
|
+
self.load_warning = "Cache corrupted; ignoring cache."
|
|
137
|
+
self.data = {"version": self.CACHE_VERSION, "files": {}}
|
|
138
|
+
|
|
139
|
+
def save(self) -> None:
|
|
140
|
+
try:
|
|
141
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
142
|
+
|
|
143
|
+
# Add signature
|
|
144
|
+
data_with_sig = {**self.data, "_signature": self._sign_data(self.data)}
|
|
145
|
+
|
|
146
|
+
self.path.write_text(
|
|
147
|
+
json.dumps(data_with_sig, ensure_ascii=False, indent=2),
|
|
148
|
+
"utf-8",
|
|
149
|
+
)
|
|
150
|
+
except OSError as e:
|
|
151
|
+
raise CacheError(f"Failed to save cache: {e}") from e
|
|
152
|
+
|
|
153
|
+
def get_file_entry(self, filepath: str) -> CacheEntry | None:
|
|
154
|
+
entry = self.data["files"].get(filepath)
|
|
155
|
+
|
|
156
|
+
if entry is None:
|
|
157
|
+
return None
|
|
158
|
+
|
|
159
|
+
if not isinstance(entry, dict):
|
|
160
|
+
return None
|
|
161
|
+
|
|
162
|
+
required = {"stat", "units", "blocks"}
|
|
163
|
+
if not required.issubset(entry.keys()):
|
|
164
|
+
return None
|
|
165
|
+
|
|
166
|
+
return entry
|
|
167
|
+
|
|
168
|
+
def put_file_entry(
|
|
169
|
+
self,
|
|
170
|
+
filepath: str,
|
|
171
|
+
stat_sig: FileStat,
|
|
172
|
+
units: list[Unit],
|
|
173
|
+
blocks: list[BlockUnit],
|
|
174
|
+
) -> None:
|
|
175
|
+
self.data["files"][filepath] = {
|
|
176
|
+
"stat": stat_sig,
|
|
177
|
+
"units": cast(list[UnitDict], cast(object, [asdict(u) for u in units])),
|
|
178
|
+
"blocks": cast(list[BlockDict], cast(object, [asdict(b) for b in blocks])),
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def file_stat_signature(path: str) -> FileStat:
|
|
183
|
+
st = os.stat(path)
|
|
184
|
+
return {
|
|
185
|
+
"mtime_ns": st.st_mtime_ns,
|
|
186
|
+
"size": st.st_size,
|
|
187
|
+
}
|