codeclone 1.1.0__tar.gz → 1.2.1__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {codeclone-1.1.0 → codeclone-1.2.1}/PKG-INFO +62 -34
- {codeclone-1.1.0 → codeclone-1.2.1}/README.md +56 -31
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/__init__.py +1 -1
- codeclone-1.2.1/codeclone/baseline.py +84 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/blockhash.py +1 -1
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/blocks.py +4 -3
- codeclone-1.2.1/codeclone/cache.py +187 -0
- codeclone-1.2.1/codeclone/cfg.py +263 -0
- codeclone-1.2.1/codeclone/cfg_model.py +47 -0
- codeclone-1.2.1/codeclone/cli.py +603 -0
- codeclone-1.2.1/codeclone/errors.py +27 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/extractor.py +101 -24
- codeclone-1.2.1/codeclone/html_report.py +492 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/normalize.py +43 -13
- codeclone-1.2.1/codeclone/py.typed +0 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/report.py +23 -12
- codeclone-1.2.1/codeclone/scanner.py +111 -0
- codeclone-1.2.1/codeclone/templates.py +1262 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/PKG-INFO +62 -34
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/SOURCES.txt +16 -1
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/requires.txt +3 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/pyproject.toml +38 -4
- codeclone-1.2.1/tests/test_baseline.py +81 -0
- codeclone-1.2.1/tests/test_blockhash.py +11 -0
- codeclone-1.2.1/tests/test_blocks.py +107 -0
- codeclone-1.2.1/tests/test_cache.py +198 -0
- codeclone-1.2.1/tests/test_cfg.py +417 -0
- codeclone-1.2.1/tests/test_cfg_model.py +18 -0
- codeclone-1.2.1/tests/test_cli_inprocess.py +812 -0
- codeclone-1.2.1/tests/test_cli_main_guard.py +17 -0
- codeclone-1.2.1/tests/test_cli_smoke.py +110 -0
- codeclone-1.2.1/tests/test_cli_unit.py +69 -0
- codeclone-1.2.1/tests/test_extractor.py +212 -0
- codeclone-1.2.1/tests/test_fingerprint.py +15 -0
- codeclone-1.2.1/tests/test_html_report.py +216 -0
- codeclone-1.2.1/tests/test_init.py +26 -0
- codeclone-1.2.1/tests/test_normalize.py +147 -0
- codeclone-1.2.1/tests/test_report.py +67 -0
- codeclone-1.2.1/tests/test_scanner_extra.py +165 -0
- codeclone-1.2.1/tests/test_security.py +44 -0
- codeclone-1.1.0/codeclone/baseline.py +0 -54
- codeclone-1.1.0/codeclone/cache.py +0 -50
- codeclone-1.1.0/codeclone/cfg.py +0 -173
- codeclone-1.1.0/codeclone/cli.py +0 -179
- codeclone-1.1.0/codeclone/html_report.py +0 -953
- codeclone-1.1.0/codeclone/scanner.py +0 -48
- codeclone-1.1.0/tests/test_baseline.py +0 -15
- codeclone-1.1.0/tests/test_blocks.py +0 -32
- codeclone-1.1.0/tests/test_cfg.py +0 -133
- codeclone-1.1.0/tests/test_cli_smoke.py +0 -24
- codeclone-1.1.0/tests/test_extractor.py +0 -49
- codeclone-1.1.0/tests/test_normalize.py +0 -22
- codeclone-1.1.0/tests/test_report.py +0 -24
- {codeclone-1.1.0 → codeclone-1.2.1}/LICENSE +0 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/fingerprint.py +0 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/dependency_links.txt +0 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/entry_points.txt +0 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/top_level.txt +0 -0
- {codeclone-1.1.0 → codeclone-1.2.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeclone
|
|
3
|
-
Version: 1.1
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
|
|
5
5
|
Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
6
6
|
Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
@@ -10,11 +10,10 @@ Project-URL: Repository, https://github.com/orenlab/codeclone
|
|
|
10
10
|
Project-URL: Issues, https://github.com/orenlab/codeclone/issues
|
|
11
11
|
Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
|
|
12
12
|
Project-URL: Documentation, https://github.com/orenlab/codeclone/tree/main/docs
|
|
13
|
-
Keywords: python,ast,code-clone,duplication,static-analysis,ci
|
|
13
|
+
Keywords: python,ast,cfg,code-clone,duplication,static-analysis,architecture,control-flow,ci
|
|
14
14
|
Classifier: Development Status :: 5 - Production/Stable
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
16
16
|
Classifier: Topic :: Software Development :: Quality Assurance
|
|
17
|
-
Classifier: Topic :: Software Development :: Code Generators
|
|
18
17
|
Classifier: Topic :: Software Development :: Testing
|
|
19
18
|
Classifier: Typing :: Typed
|
|
20
19
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -23,26 +22,31 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
23
22
|
Classifier: Programming Language :: Python :: 3.11
|
|
24
23
|
Classifier: Programming Language :: Python :: 3.12
|
|
25
24
|
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
26
26
|
Classifier: Operating System :: OS Independent
|
|
27
27
|
Requires-Python: >=3.10
|
|
28
28
|
Description-Content-Type: text/markdown
|
|
29
29
|
License-File: LICENSE
|
|
30
30
|
Requires-Dist: pygments>=2.19.2
|
|
31
|
+
Requires-Dist: rich>=14.3.2
|
|
31
32
|
Provides-Extra: dev
|
|
32
33
|
Requires-Dist: pytest>=9.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov>=6.1.0; extra == "dev"
|
|
33
35
|
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
34
36
|
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
35
37
|
Requires-Dist: mypy>=1.19.1; extra == "dev"
|
|
38
|
+
Requires-Dist: ruff>=0.12.0; extra == "dev"
|
|
36
39
|
Dynamic: license-file
|
|
37
40
|
|
|
38
41
|
# CodeClone
|
|
39
42
|
|
|
40
43
|
[](https://pypi.org/project/codeclone/)
|
|
41
44
|
[](https://pypi.org/project/codeclone/)
|
|
45
|
+
[](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
|
|
42
46
|
[](https://pypi.org/project/codeclone/)
|
|
43
47
|
[](LICENSE)
|
|
44
48
|
|
|
45
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and
|
|
49
|
+
**CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
|
|
46
50
|
It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
|
|
47
51
|
|
|
48
52
|
CodeClone is designed to help teams:
|
|
@@ -51,15 +55,16 @@ CodeClone is designed to help teams:
|
|
|
51
55
|
- identify architectural hotspots,
|
|
52
56
|
- prevent *new* duplication via CI and pre-commit hooks.
|
|
53
57
|
|
|
54
|
-
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
55
|
-
formatting, and minor refactoring.
|
|
58
|
+
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
59
|
+
renaming, formatting, and minor refactoring.
|
|
56
60
|
|
|
57
61
|
---
|
|
58
62
|
|
|
59
63
|
## Why CodeClone?
|
|
60
64
|
|
|
61
65
|
Most existing tools detect *textual* duplication.
|
|
62
|
-
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
66
|
+
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
67
|
+
architectural drift.
|
|
63
68
|
|
|
64
69
|
Typical use cases:
|
|
65
70
|
|
|
@@ -77,11 +82,11 @@ Typical use cases:
|
|
|
77
82
|
- Detects functions and methods with identical **control-flow structure**.
|
|
78
83
|
- Based on **Control Flow Graph (CFG)** fingerprinting.
|
|
79
84
|
- Robust to:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
+
- variable renaming,
|
|
86
|
+
- constant changes,
|
|
87
|
+
- attribute renaming,
|
|
88
|
+
- formatting differences,
|
|
89
|
+
- docstrings and type annotations.
|
|
85
90
|
- Ideal for spotting architectural duplication across layers.
|
|
86
91
|
|
|
87
92
|
### Block-level clone detection (Type-3-lite)
|
|
@@ -89,24 +94,29 @@ Typical use cases:
|
|
|
89
94
|
- Detects repeated **statement blocks** inside larger functions.
|
|
90
95
|
- Uses sliding windows over CFG-normalized statement sequences.
|
|
91
96
|
- Targets:
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
97
|
+
- validation blocks,
|
|
98
|
+
- guard clauses,
|
|
99
|
+
- repeated orchestration logic.
|
|
95
100
|
- Carefully filtered to reduce noise:
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
101
|
+
- no overlapping windows,
|
|
102
|
+
- no clones inside the same function,
|
|
103
|
+
- no `__init__` noise,
|
|
104
|
+
- size and statement-count thresholds.
|
|
100
105
|
|
|
101
106
|
### Control-Flow Awareness (CFG v1)
|
|
102
107
|
|
|
103
108
|
- Each function is converted into a **Control Flow Graph**.
|
|
104
109
|
- CFG nodes contain normalized AST statements.
|
|
105
|
-
- CFG edges represent structural control flow
|
|
110
|
+
- CFG edges represent structural control flow:
|
|
111
|
+
- `if` / `else`
|
|
112
|
+
- `for` / `async for` / `while`
|
|
113
|
+
- `try` / `except` / `finally`
|
|
114
|
+
- `with` / `async with`
|
|
115
|
+
- `match` / `case` (Python 3.10+)
|
|
106
116
|
- Current CFG semantics (v1):
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
117
|
+
- `break` and `continue` are treated as statements (no jump targets),
|
|
118
|
+
- after-blocks are explicit and always present,
|
|
119
|
+
- focus is on **structural similarity**, not precise runtime semantics.
|
|
110
120
|
|
|
111
121
|
This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
112
122
|
|
|
@@ -115,6 +125,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
|
115
125
|
- AST + CFG normalization instead of token matching.
|
|
116
126
|
- Conservative defaults tuned for real-world Python projects.
|
|
117
127
|
- Explicit thresholds for size and statement count.
|
|
128
|
+
- No probabilistic scoring or heuristic similarity thresholds.
|
|
118
129
|
- Focus on *architectural duplication*, not micro-similarities.
|
|
119
130
|
|
|
120
131
|
### CI-friendly baseline mode
|
|
@@ -154,14 +165,14 @@ Generate reports:
|
|
|
154
165
|
|
|
155
166
|
```bash
|
|
156
167
|
codeclone . \
|
|
157
|
-
--json
|
|
158
|
-
--text
|
|
168
|
+
--json .cache/codeclone/report.json \
|
|
169
|
+
--text .cache/codeclone/report.txt
|
|
159
170
|
```
|
|
160
171
|
|
|
161
172
|
Generate an HTML report:
|
|
162
173
|
|
|
163
174
|
```bash
|
|
164
|
-
codeclone . --html
|
|
175
|
+
codeclone . --html .cache/codeclone/report.html
|
|
165
176
|
```
|
|
166
177
|
|
|
167
178
|
---
|
|
@@ -181,14 +192,26 @@ Commit the generated baseline file to the repository.
|
|
|
181
192
|
### 2. Use in CI
|
|
182
193
|
|
|
183
194
|
```bash
|
|
184
|
-
codeclone . --fail-on-new
|
|
195
|
+
codeclone . --fail-on-new --no-progress
|
|
185
196
|
```
|
|
186
197
|
|
|
187
198
|
Behavior:
|
|
188
199
|
|
|
189
|
-
-
|
|
190
|
-
-
|
|
191
|
-
-
|
|
200
|
+
- existing clones are allowed,
|
|
201
|
+
- the build fails if *new* clones appear,
|
|
202
|
+
- refactoring that removes duplication is always allowed.
|
|
203
|
+
|
|
204
|
+
`--fail-on-new` exits with a non-zero code when new clones are detected.
|
|
205
|
+
|
|
206
|
+
### Python Version Consistency for Baseline Checks
|
|
207
|
+
|
|
208
|
+
Due to inherent differences in Python’s AST between interpreter versions, baseline
|
|
209
|
+
generation and verification must be performed using the same Python version.
|
|
210
|
+
|
|
211
|
+
This ensures deterministic and reproducible clone detection results.
|
|
212
|
+
|
|
213
|
+
CI checks therefore pin baseline verification to a single Python version, while the
|
|
214
|
+
test matrix continues to validate compatibility across Python 3.10–3.14.
|
|
192
215
|
|
|
193
216
|
---
|
|
194
217
|
|
|
@@ -196,14 +219,14 @@ Behavior:
|
|
|
196
219
|
|
|
197
220
|
```yaml
|
|
198
221
|
repos:
|
|
199
|
-
-
|
|
222
|
+
- repo: local
|
|
200
223
|
hooks:
|
|
201
|
-
|
|
224
|
+
- id: codeclone
|
|
202
225
|
name: CodeClone
|
|
203
226
|
entry: codeclone
|
|
204
227
|
language: python
|
|
205
|
-
args: [".", "--fail-on-new"]
|
|
206
|
-
types: [python]
|
|
228
|
+
args: [ ".", "--fail-on-new" ]
|
|
229
|
+
types: [ python ]
|
|
207
230
|
```
|
|
208
231
|
|
|
209
232
|
---
|
|
@@ -235,6 +258,10 @@ repos:
|
|
|
235
258
|
5. Detect function-level and block-level clones.
|
|
236
259
|
6. Apply conservative filters to suppress noise.
|
|
237
260
|
|
|
261
|
+
See the architectural overview:
|
|
262
|
+
|
|
263
|
+
- [docs/architecture.md](docs/architecture.md)
|
|
264
|
+
|
|
238
265
|
---
|
|
239
266
|
|
|
240
267
|
## Control Flow Graph (CFG)
|
|
@@ -245,6 +272,7 @@ to improve structural clone detection robustness.
|
|
|
245
272
|
The CFG is a **structural abstraction**, not a runtime execution model.
|
|
246
273
|
|
|
247
274
|
See full design and semantics:
|
|
275
|
+
|
|
248
276
|
- [docs/cfg.md](docs/cfg.md)
|
|
249
277
|
|
|
250
278
|
---
|
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://pypi.org/project/codeclone/)
|
|
4
4
|
[](https://pypi.org/project/codeclone/)
|
|
5
|
+
[](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
|
|
5
6
|
[](https://pypi.org/project/codeclone/)
|
|
6
7
|
[](LICENSE)
|
|
7
8
|
|
|
8
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and
|
|
9
|
+
**CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
|
|
9
10
|
It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
|
|
10
11
|
|
|
11
12
|
CodeClone is designed to help teams:
|
|
@@ -14,15 +15,16 @@ CodeClone is designed to help teams:
|
|
|
14
15
|
- identify architectural hotspots,
|
|
15
16
|
- prevent *new* duplication via CI and pre-commit hooks.
|
|
16
17
|
|
|
17
|
-
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
18
|
-
formatting, and minor refactoring.
|
|
18
|
+
Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
|
|
19
|
+
renaming, formatting, and minor refactoring.
|
|
19
20
|
|
|
20
21
|
---
|
|
21
22
|
|
|
22
23
|
## Why CodeClone?
|
|
23
24
|
|
|
24
25
|
Most existing tools detect *textual* duplication.
|
|
25
|
-
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
26
|
+
CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
|
|
27
|
+
architectural drift.
|
|
26
28
|
|
|
27
29
|
Typical use cases:
|
|
28
30
|
|
|
@@ -40,11 +42,11 @@ Typical use cases:
|
|
|
40
42
|
- Detects functions and methods with identical **control-flow structure**.
|
|
41
43
|
- Based on **Control Flow Graph (CFG)** fingerprinting.
|
|
42
44
|
- Robust to:
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
- variable renaming,
|
|
46
|
+
- constant changes,
|
|
47
|
+
- attribute renaming,
|
|
48
|
+
- formatting differences,
|
|
49
|
+
- docstrings and type annotations.
|
|
48
50
|
- Ideal for spotting architectural duplication across layers.
|
|
49
51
|
|
|
50
52
|
### Block-level clone detection (Type-3-lite)
|
|
@@ -52,24 +54,29 @@ Typical use cases:
|
|
|
52
54
|
- Detects repeated **statement blocks** inside larger functions.
|
|
53
55
|
- Uses sliding windows over CFG-normalized statement sequences.
|
|
54
56
|
- Targets:
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
57
|
+
- validation blocks,
|
|
58
|
+
- guard clauses,
|
|
59
|
+
- repeated orchestration logic.
|
|
58
60
|
- Carefully filtered to reduce noise:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
61
|
+
- no overlapping windows,
|
|
62
|
+
- no clones inside the same function,
|
|
63
|
+
- no `__init__` noise,
|
|
64
|
+
- size and statement-count thresholds.
|
|
63
65
|
|
|
64
66
|
### Control-Flow Awareness (CFG v1)
|
|
65
67
|
|
|
66
68
|
- Each function is converted into a **Control Flow Graph**.
|
|
67
69
|
- CFG nodes contain normalized AST statements.
|
|
68
|
-
- CFG edges represent structural control flow
|
|
70
|
+
- CFG edges represent structural control flow:
|
|
71
|
+
- `if` / `else`
|
|
72
|
+
- `for` / `async for` / `while`
|
|
73
|
+
- `try` / `except` / `finally`
|
|
74
|
+
- `with` / `async with`
|
|
75
|
+
- `match` / `case` (Python 3.10+)
|
|
69
76
|
- Current CFG semantics (v1):
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
77
|
+
- `break` and `continue` are treated as statements (no jump targets),
|
|
78
|
+
- after-blocks are explicit and always present,
|
|
79
|
+
- focus is on **structural similarity**, not precise runtime semantics.
|
|
73
80
|
|
|
74
81
|
This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
75
82
|
|
|
@@ -78,6 +85,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
|
|
|
78
85
|
- AST + CFG normalization instead of token matching.
|
|
79
86
|
- Conservative defaults tuned for real-world Python projects.
|
|
80
87
|
- Explicit thresholds for size and statement count.
|
|
88
|
+
- No probabilistic scoring or heuristic similarity thresholds.
|
|
81
89
|
- Focus on *architectural duplication*, not micro-similarities.
|
|
82
90
|
|
|
83
91
|
### CI-friendly baseline mode
|
|
@@ -117,14 +125,14 @@ Generate reports:
|
|
|
117
125
|
|
|
118
126
|
```bash
|
|
119
127
|
codeclone . \
|
|
120
|
-
--json
|
|
121
|
-
--text
|
|
128
|
+
--json .cache/codeclone/report.json \
|
|
129
|
+
--text .cache/codeclone/report.txt
|
|
122
130
|
```
|
|
123
131
|
|
|
124
132
|
Generate an HTML report:
|
|
125
133
|
|
|
126
134
|
```bash
|
|
127
|
-
codeclone . --html
|
|
135
|
+
codeclone . --html .cache/codeclone/report.html
|
|
128
136
|
```
|
|
129
137
|
|
|
130
138
|
---
|
|
@@ -144,14 +152,26 @@ Commit the generated baseline file to the repository.
|
|
|
144
152
|
### 2. Use in CI
|
|
145
153
|
|
|
146
154
|
```bash
|
|
147
|
-
codeclone . --fail-on-new
|
|
155
|
+
codeclone . --fail-on-new --no-progress
|
|
148
156
|
```
|
|
149
157
|
|
|
150
158
|
Behavior:
|
|
151
159
|
|
|
152
|
-
-
|
|
153
|
-
-
|
|
154
|
-
-
|
|
160
|
+
- existing clones are allowed,
|
|
161
|
+
- the build fails if *new* clones appear,
|
|
162
|
+
- refactoring that removes duplication is always allowed.
|
|
163
|
+
|
|
164
|
+
`--fail-on-new` exits with a non-zero code when new clones are detected.
|
|
165
|
+
|
|
166
|
+
### Python Version Consistency for Baseline Checks
|
|
167
|
+
|
|
168
|
+
Due to inherent differences in Python’s AST between interpreter versions, baseline
|
|
169
|
+
generation and verification must be performed using the same Python version.
|
|
170
|
+
|
|
171
|
+
This ensures deterministic and reproducible clone detection results.
|
|
172
|
+
|
|
173
|
+
CI checks therefore pin baseline verification to a single Python version, while the
|
|
174
|
+
test matrix continues to validate compatibility across Python 3.10–3.14.
|
|
155
175
|
|
|
156
176
|
---
|
|
157
177
|
|
|
@@ -159,14 +179,14 @@ Behavior:
|
|
|
159
179
|
|
|
160
180
|
```yaml
|
|
161
181
|
repos:
|
|
162
|
-
-
|
|
182
|
+
- repo: local
|
|
163
183
|
hooks:
|
|
164
|
-
|
|
184
|
+
- id: codeclone
|
|
165
185
|
name: CodeClone
|
|
166
186
|
entry: codeclone
|
|
167
187
|
language: python
|
|
168
|
-
args: [".", "--fail-on-new"]
|
|
169
|
-
types: [python]
|
|
188
|
+
args: [ ".", "--fail-on-new" ]
|
|
189
|
+
types: [ python ]
|
|
170
190
|
```
|
|
171
191
|
|
|
172
192
|
---
|
|
@@ -198,6 +218,10 @@ repos:
|
|
|
198
218
|
5. Detect function-level and block-level clones.
|
|
199
219
|
6. Apply conservative filters to suppress noise.
|
|
200
220
|
|
|
221
|
+
See the architectural overview:
|
|
222
|
+
|
|
223
|
+
- [docs/architecture.md](docs/architecture.md)
|
|
224
|
+
|
|
201
225
|
---
|
|
202
226
|
|
|
203
227
|
## Control Flow Graph (CFG)
|
|
@@ -208,6 +232,7 @@ to improve structural clone detection robustness.
|
|
|
208
232
|
The CFG is a **structural abstraction**, not a runtime execution model.
|
|
209
233
|
|
|
210
234
|
See full design and semantics:
|
|
235
|
+
|
|
211
236
|
- [docs/cfg.md](docs/cfg.md)
|
|
212
237
|
|
|
213
238
|
---
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeClone — AST and CFG-based code clone detector for Python
|
|
3
|
+
focused on architectural duplication.
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Den Rozhnovskiy
|
|
6
|
+
Licensed under the MIT License.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from collections.abc import Mapping
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Baseline:
|
|
18
|
+
__slots__ = ("blocks", "functions", "path", "python_version")
|
|
19
|
+
|
|
20
|
+
def __init__(self, path: str | Path):
|
|
21
|
+
self.path = Path(path)
|
|
22
|
+
self.functions: set[str] = set()
|
|
23
|
+
self.blocks: set[str] = set()
|
|
24
|
+
self.python_version: str | None = None
|
|
25
|
+
|
|
26
|
+
def load(self) -> None:
|
|
27
|
+
if not self.path.exists():
|
|
28
|
+
return
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
data = json.loads(self.path.read_text("utf-8"))
|
|
32
|
+
self.functions = set(data.get("functions", []))
|
|
33
|
+
self.blocks = set(data.get("blocks", []))
|
|
34
|
+
python_version = data.get("python_version")
|
|
35
|
+
self.python_version = (
|
|
36
|
+
python_version if isinstance(python_version, str) else None
|
|
37
|
+
)
|
|
38
|
+
except json.JSONDecodeError as e:
|
|
39
|
+
raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
|
|
40
|
+
|
|
41
|
+
def save(self) -> None:
|
|
42
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
self.path.write_text(
|
|
44
|
+
json.dumps(
|
|
45
|
+
_baseline_payload(self.functions, self.blocks, self.python_version),
|
|
46
|
+
indent=2,
|
|
47
|
+
ensure_ascii=False,
|
|
48
|
+
),
|
|
49
|
+
"utf-8",
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
@staticmethod
|
|
53
|
+
def from_groups(
|
|
54
|
+
func_groups: Mapping[str, object],
|
|
55
|
+
block_groups: Mapping[str, object],
|
|
56
|
+
path: str | Path = "",
|
|
57
|
+
python_version: str | None = None,
|
|
58
|
+
) -> Baseline:
|
|
59
|
+
bl = Baseline(path)
|
|
60
|
+
bl.functions = set(func_groups.keys())
|
|
61
|
+
bl.blocks = set(block_groups.keys())
|
|
62
|
+
bl.python_version = python_version
|
|
63
|
+
return bl
|
|
64
|
+
|
|
65
|
+
def diff(
|
|
66
|
+
self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
|
|
67
|
+
) -> tuple[set[str], set[str]]:
|
|
68
|
+
new_funcs = set(func_groups.keys()) - self.functions
|
|
69
|
+
new_blocks = set(block_groups.keys()) - self.blocks
|
|
70
|
+
return new_funcs, new_blocks
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _baseline_payload(
|
|
74
|
+
functions: set[str],
|
|
75
|
+
blocks: set[str],
|
|
76
|
+
python_version: str | None,
|
|
77
|
+
) -> dict[str, Any]:
|
|
78
|
+
payload: dict[str, Any] = {
|
|
79
|
+
"functions": sorted(functions),
|
|
80
|
+
"blocks": sorted(blocks),
|
|
81
|
+
}
|
|
82
|
+
if python_version:
|
|
83
|
+
payload["python_version"] = python_version
|
|
84
|
+
return payload
|
|
@@ -11,7 +11,7 @@ from __future__ import annotations
|
|
|
11
11
|
import ast
|
|
12
12
|
import hashlib
|
|
13
13
|
|
|
14
|
-
from .normalize import
|
|
14
|
+
from .normalize import AstNormalizer, NormalizationConfig
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
|
|
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
|
|
|
15
15
|
from .normalize import NormalizationConfig
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
@dataclass(frozen=True)
|
|
18
|
+
@dataclass(frozen=True, slots=True)
|
|
19
19
|
class BlockUnit:
|
|
20
20
|
block_hash: str
|
|
21
21
|
filepath: str
|
|
@@ -42,7 +42,8 @@ def extract_blocks(
|
|
|
42
42
|
|
|
43
43
|
blocks: list[BlockUnit] = []
|
|
44
44
|
last_start: int | None = None
|
|
45
|
-
|
|
45
|
+
# Allow some overlap (50%), but at least 3 lines apart
|
|
46
|
+
min_line_distance = max(block_size // 2, 3)
|
|
46
47
|
|
|
47
48
|
for i in range(len(stmt_hashes) - block_size + 1):
|
|
48
49
|
start = getattr(body[i], "lineno", None)
|
|
@@ -50,7 +51,7 @@ def extract_blocks(
|
|
|
50
51
|
if not start or not end:
|
|
51
52
|
continue
|
|
52
53
|
|
|
53
|
-
if last_start is not None and start - last_start <
|
|
54
|
+
if last_start is not None and start - last_start < min_line_distance:
|
|
54
55
|
continue
|
|
55
56
|
|
|
56
57
|
bh = "|".join(stmt_hashes[i : i + block_size])
|