codeclone 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Denis Rozhnovskiy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,211 @@
1
+ Metadata-Version: 2.4
2
+ Name: codeclone
3
+ Version: 1.0.0
4
+ Summary: AST-based code clone detector for Python focused on architectural duplication
5
+ Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
+ Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/orenlab/codeclone
9
+ Project-URL: Repository, https://github.com/orenlab/codeclone
10
+ Project-URL: Issues, https://github.com/orenlab/codeclone/issues
11
+ Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
12
+ Keywords: python,ast,code-clone,duplication,static-analysis,ci,architecture
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Software Development :: Quality Assurance
16
+ Classifier: Topic :: Software Development :: Code Generators
17
+ Classifier: Topic :: Software Development :: Testing
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Operating System :: OS Independent
25
+ Requires-Python: >=3.10
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=9.0.0; extra == "dev"
30
+ Requires-Dist: build>=1.2.0; extra == "dev"
31
+ Requires-Dist: twine>=5.0.0; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # CodeClone
35
+
36
+ **CodeClone** is an AST-based code clone detector for Python, focused on **architectural duplication**, not simple
37
+ copy-paste.
38
+
39
+ It is designed to help teams:
40
+
41
+ - discover structural and logical code duplication,
42
+ - understand architectural hotspots,
43
+ - and prevent *new* duplication from entering the codebase via CI.
44
+
45
+ Unlike token- or text-based tools, CodeClone works on **normalized Python AST**, which makes it robust against renaming,
46
+ formatting, and minor refactoring.
47
+
48
+ ---
49
+
50
+ ## Why CodeClone?
51
+
52
+ Most existing tools detect *textual* duplication.
53
+ CodeClone detects **structural and block-level duplication** that usually indicates missing abstractions or
54
+ architectural drift.
55
+
56
+ Typical use cases:
57
+
58
+ - duplicated service logic across layers (API ↔ application),
59
+ - repeated validation or guard blocks,
60
+ - copy-pasted request/handler flows,
61
+ - duplicated orchestration logic in routers, handlers, or services.
62
+
63
+ ---
64
+
65
+ ## Features
66
+
67
+ ### Function-level clone detection (Type-2)
68
+
69
+ - Detects functions and methods with identical structure.
70
+ - Robust to:
71
+ - variable renaming,
72
+ - constant changes,
73
+ - formatting differences.
74
+ - Ideal for spotting architectural duplication between layers.
75
+
76
+ ### Block-level clone detection (Type-3-lite)
77
+
78
+ - Detects repeated **statement blocks** inside larger functions.
79
+ - Targets:
80
+ - validation blocks,
81
+ - guard clauses,
82
+ - repeated orchestration logic.
83
+ - Carefully filtered to avoid noise:
84
+ - no overlapping windows,
85
+ - no clones inside the same function,
86
+ - no `__init__` noise.
87
+
88
+ ### Low-noise by design
89
+
90
+ - AST normalization instead of token matching.
91
+ - Size and statement-count thresholds.
92
+ - Conservative defaults tuned for real-world Python projects.
93
+
94
+ ### CI-friendly baseline mode
95
+
96
+ - Establish a baseline of existing clones.
97
+ - Fail CI **only when new clones are introduced**.
98
+ - Safe for legacy codebases.
99
+
100
+ ---
101
+
102
+ ## Installation
103
+
104
+ ```bash
105
+ pip install codeclone
106
+ ```
107
+
108
+ Python 3.10+ is required.
109
+
110
+
111
+
112
+ ## Quick Start
113
+
114
+ Run on a project:
115
+
116
+ ```bash
117
+ codeclone .
118
+ ```
119
+
120
+ This will:
121
+
122
+ * scan Python files,
123
+ * detect function-level and block-level clones,
124
+ * print a summary to stdout.
125
+
126
+ Generate reports:
127
+
128
+ ```bash
129
+ codeclone . \
130
+ --json-out .cache/codeclone/report.json \
131
+ --text-out .cache/codeclone/report.txt
132
+ ```
133
+
134
+
135
+
136
+ ## Baseline Workflow (Recommended)
137
+
138
+ ### 1. Create a baseline
139
+
140
+ Run once on your current codebase:
141
+
142
+ ```bash
143
+ codeclone . --update-baseline
144
+ ```
145
+
146
+ This creates a file:
147
+
148
+ ```bash
149
+ .codeclone-baseline.json
150
+ ```
151
+
152
+ Commit this file to the repository.
153
+
154
+
155
+
156
+ ### 2. Use in CI
157
+
158
+ In CI, run:
159
+
160
+ ```bash
161
+ codeclone . --fail-on-new
162
+ ```
163
+
164
+ Behavior:
165
+
166
+ * ✅ existing clones are allowed,
167
+ * ❌ build fails if new function or block clones appear,
168
+ * ✅ refactoring that removes duplication is always allowed.
169
+
170
+ This enables gradual improvement without breaking existing development flow.
171
+
172
+
173
+
174
+ ## What CodeClone Is (and Is Not)
175
+
176
+ CodeClone is
177
+
178
+ * an architectural analysis tool,
179
+ * a duplication radar,
180
+ * a CI guard against copy-paste.
181
+
182
+ CodeClone is not
183
+
184
+ * a linter,
185
+ * a formatter,
186
+ * a replacement for SonarQube or static analyzers,
187
+ * a semantic equivalence prover.
188
+
189
+ It intentionally focuses on high-signal duplication.
190
+
191
+
192
+
193
+ ## How It Works (High Level)
194
+
195
+ * Parses Python source into AST.
196
+ * Normalizes:
197
+ - variable names,
198
+ - constants,
199
+ - attributes,
200
+ - docstrings and annotations.
201
+ * Computes stable structural fingerprints.
202
+ * Detects:
203
+ - identical function structures,
204
+ - repeated statement blocks across functions.
205
+ * Applies filters to suppress noise.
206
+
207
+
208
+
209
+ ## License
210
+
211
+ MIT License
@@ -0,0 +1,178 @@
1
+ # CodeClone
2
+
3
+ **CodeClone** is an AST-based code clone detector for Python, focused on **architectural duplication**, not simple
4
+ copy-paste.
5
+
6
+ It is designed to help teams:
7
+
8
+ - discover structural and logical code duplication,
9
+ - understand architectural hotspots,
10
+ - and prevent *new* duplication from entering the codebase via CI.
11
+
12
+ Unlike token- or text-based tools, CodeClone works on **normalized Python AST**, which makes it robust against renaming,
13
+ formatting, and minor refactoring.
14
+
15
+ ---
16
+
17
+ ## Why CodeClone?
18
+
19
+ Most existing tools detect *textual* duplication.
20
+ CodeClone detects **structural and block-level duplication** that usually indicates missing abstractions or
21
+ architectural drift.
22
+
23
+ Typical use cases:
24
+
25
+ - duplicated service logic across layers (API ↔ application),
26
+ - repeated validation or guard blocks,
27
+ - copy-pasted request/handler flows,
28
+ - duplicated orchestration logic in routers, handlers, or services.
29
+
30
+ ---
31
+
32
+ ## Features
33
+
34
+ ### Function-level clone detection (Type-2)
35
+
36
+ - Detects functions and methods with identical structure.
37
+ - Robust to:
38
+ - variable renaming,
39
+ - constant changes,
40
+ - formatting differences.
41
+ - Ideal for spotting architectural duplication between layers.
42
+
43
+ ### Block-level clone detection (Type-3-lite)
44
+
45
+ - Detects repeated **statement blocks** inside larger functions.
46
+ - Targets:
47
+ - validation blocks,
48
+ - guard clauses,
49
+ - repeated orchestration logic.
50
+ - Carefully filtered to avoid noise:
51
+ - no overlapping windows,
52
+ - no clones inside the same function,
53
+ - no `__init__` noise.
54
+
55
+ ### Low-noise by design
56
+
57
+ - AST normalization instead of token matching.
58
+ - Size and statement-count thresholds.
59
+ - Conservative defaults tuned for real-world Python projects.
60
+
61
+ ### CI-friendly baseline mode
62
+
63
+ - Establish a baseline of existing clones.
64
+ - Fail CI **only when new clones are introduced**.
65
+ - Safe for legacy codebases.
66
+
67
+ ---
68
+
69
+ ## Installation
70
+
71
+ ```bash
72
+ pip install codeclone
73
+ ```
74
+
75
+ Python 3.10+ is required.
76
+
77
+
78
+
79
+ ## Quick Start
80
+
81
+ Run on a project:
82
+
83
+ ```bash
84
+ codeclone .
85
+ ```
86
+
87
+ This will:
88
+
89
+ * scan Python files,
90
+ * detect function-level and block-level clones,
91
+ * print a summary to stdout.
92
+
93
+ Generate reports:
94
+
95
+ ```bash
96
+ codeclone . \
97
+ --json-out .cache/codeclone/report.json \
98
+ --text-out .cache/codeclone/report.txt
99
+ ```
100
+
101
+
102
+
103
+ ## Baseline Workflow (Recommended)
104
+
105
+ ### 1. Create a baseline
106
+
107
+ Run once on your current codebase:
108
+
109
+ ```bash
110
+ codeclone . --update-baseline
111
+ ```
112
+
113
+ This creates a file:
114
+
115
+ ```bash
116
+ .codeclone-baseline.json
117
+ ```
118
+
119
+ Commit this file to the repository.
120
+
121
+
122
+
123
+ ### 2. Use in CI
124
+
125
+ In CI, run:
126
+
127
+ ```bash
128
+ codeclone . --fail-on-new
129
+ ```
130
+
131
+ Behavior:
132
+
133
+ * ✅ existing clones are allowed,
134
+ * ❌ build fails if new function or block clones appear,
135
+ * ✅ refactoring that removes duplication is always allowed.
136
+
137
+ This enables gradual improvement without breaking existing development flow.
138
+
139
+
140
+
141
+ ## What CodeClone Is (and Is Not)
142
+
143
+ CodeClone is
144
+
145
+ * an architectural analysis tool,
146
+ * a duplication radar,
147
+ * a CI guard against copy-paste.
148
+
149
+ CodeClone is not
150
+
151
+ * a linter,
152
+ * a formatter,
153
+ * a replacement for SonarQube or static analyzers,
154
+ * a semantic equivalence prover.
155
+
156
+ It intentionally focuses on high-signal duplication.
157
+
158
+
159
+
160
+ ## How It Works (High Level)
161
+
162
+ * Parses Python source into AST.
163
+ * Normalizes:
164
+ - variable names,
165
+ - constants,
166
+ - attributes,
167
+ - docstrings and annotations.
168
+ * Computes stable structural fingerprints.
169
+ * Detects:
170
+ - identical function structures,
171
+ - repeated statement blocks across functions.
172
+ * Applies filters to suppress noise.
173
+
174
+
175
+
176
+ ## License
177
+
178
+ MIT License
File without changes
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Set
6
+
7
+
8
class Baseline:
    """Set of known clone-group keys, persisted as JSON.

    The baseline stores two collections of string keys: one for function-level
    clone groups and one for block-level clone groups. ``diff`` reports the
    keys of freshly computed groups that the baseline does not contain.
    """

    def __init__(self, path: str):
        """Create an empty baseline bound to the file at *path*."""
        self.path = Path(path)
        self.functions: Set[str] = set()
        self.blocks: Set[str] = set()

    def load(self) -> None:
        """Replace the in-memory keys with those stored in ``self.path``.

        A missing file is not an error: the baseline is left untouched.

        Raises:
            ValueError: if the file is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``) or its top-level value is not an object.
        """
        if not self.path.exists():
            return

        data = json.loads(self.path.read_text("utf-8"))
        if not isinstance(data, dict):
            # Fail with a clear message instead of an AttributeError on .get().
            raise ValueError(
                f"Baseline file {self.path} must contain a JSON object, "
                f"got {type(data).__name__}"
            )
        self.functions = set(data.get("functions", []))
        self.blocks = set(data.get("blocks", []))

    def save(self) -> None:
        """Write the keys to ``self.path`` as sorted, indented UTF-8 JSON.

        Missing parent directories are created first. Keys are sorted so the
        written file is deterministic for a given set of keys.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "functions": sorted(self.functions),
            "blocks": sorted(self.blocks),
        }
        self.path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            "utf-8",
        )

    @classmethod
    def from_groups(
        cls,
        func_groups: dict,
        block_groups: dict,
        path: str = "",
    ) -> "Baseline":
        """Build a baseline whose keys are the keys of the given group mappings.

        Args:
            func_groups: mapping keyed by function-level clone-group key.
            block_groups: mapping keyed by block-level clone-group key.
            path: file the new baseline is bound to. Defaults to ``""`` for
                backward compatibility; pass a real path if the result is
                going to be saved.
        """
        bl = cls(path)
        bl.functions = set(func_groups)
        bl.blocks = set(block_groups)
        return bl

    def diff(self, func_groups: dict, block_groups: dict) -> tuple[set, set]:
        """Return ``(new_function_keys, new_block_keys)`` absent from the baseline."""
        new_funcs = set(func_groups) - self.functions
        new_blocks = set(block_groups) - self.blocks
        return new_funcs, new_blocks
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import hashlib
5
+
6
+ from .normalize import NormalizationConfig, AstNormalizer
7
+
8
def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
    """Return a hex fingerprint of *stmt* after AST normalization.

    The statement is run through an ``AstNormalizer`` built from *cfg*, dumped
    with field names but without position attributes, and hashed with SHA-1.

    NOTE(review): the normalizer visits *stmt* itself, not a copy; if the
    normalizer rewrites nodes in place, the caller's tree is modified too.
    Confirm against ``AstNormalizer``.
    """
    normalized = ast.fix_missing_locations(AstNormalizer(cfg).visit(stmt))
    dump = ast.dump(normalized, annotate_fields=True, include_attributes=False)
    # SHA-1 serves only as a structural fingerprint here; declaring that keeps
    # the call working on builds that restrict SHA-1 for security purposes.
    digest = hashlib.sha1(dump.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ from dataclasses import dataclass
5
+
6
+ from .blockhash import stmt_hash
7
+ from .normalize import NormalizationConfig
8
+
9
+
10
+ @dataclass(frozen=True)
11
+ class BlockUnit:
12
+ block_hash: str
13
+ filepath: str
14
+ qualname: str
15
+ start_line: int
16
+ end_line: int
17
+ size: int
18
+
19
+
20
def extract_blocks(
    func_node: ast.AST,
    *,
    filepath: str,
    qualname: str,
    cfg: NormalizationConfig,
    block_size: int,
    max_blocks: int,
    min_line_distance: int = 5,
) -> list[BlockUnit]:
    """Slide a window of *block_size* statements over a function body.

    Only the direct ``body`` list of *func_node* is scanned; each statement is
    fingerprinted with ``stmt_hash`` and a window's hash is the "|"-joined
    hashes of its statements.

    Args:
        func_node: node whose ``body`` attribute holds the statements.
        filepath: stored verbatim on every emitted ``BlockUnit``.
        qualname: stored verbatim on every emitted ``BlockUnit``.
        cfg: normalization settings forwarded to ``stmt_hash``.
        block_size: number of consecutive statements per window.
        max_blocks: upper bound on the number of windows returned.
        min_line_distance: a window is skipped when it starts fewer than this
            many source lines after the previously emitted window.

    Returns:
        The emitted windows in source order. Empty when *block_size* or
        *max_blocks* is not positive, when *func_node* has no list ``body``,
        or when the body is shorter than *block_size*.
    """
    # A non-positive window size would index outside the body, and a
    # non-positive cap means the caller wants nothing back.
    if block_size < 1 or max_blocks < 1:
        return []

    body = getattr(func_node, "body", None)
    if not isinstance(body, list) or len(body) < block_size:
        return []

    stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]

    blocks: list[BlockUnit] = []
    last_start: int | None = None

    for i in range(len(stmt_hashes) - block_size + 1):
        start = getattr(body[i], "lineno", None)
        end = getattr(body[i + block_size - 1], "end_lineno", None)
        if not start or not end:
            # Nodes without position info cannot be reported; skip the window.
            continue

        # Suppress windows that start too close to the last emitted one.
        if last_start is not None and start - last_start < min_line_distance:
            continue

        blocks.append(
            BlockUnit(
                block_hash="|".join(stmt_hashes[i:i + block_size]),
                filepath=filepath,
                qualname=qualname,
                start_line=start,
                end_line=end,
                size=block_size,
            )
        )

        last_start = start
        if len(blocks) >= max_blocks:
            break

    return blocks
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from dataclasses import asdict
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+
10
class Cache:
    """JSON-backed store of per-file analysis results.

    ``self.data`` is a dict with a ``"files"`` mapping from file path to an
    entry holding the file's stat signature and its serialized units/blocks.
    """

    def __init__(self, path: str):
        """Create an empty cache bound to the file at *path*."""
        self.path = Path(path)
        self.data: dict = {"files": {}}

    def load(self) -> None:
        """Load the cache from disk, falling back to an empty cache.

        A missing file leaves the cache untouched. A file that is not valid
        UTF-8 JSON, or whose top-level value is not an object, resets the
        cache to empty instead of raising: cached results can always be
        recomputed, so a damaged cache file should not abort a run.
        """
        if not self.path.exists():
            return

        try:
            loaded = json.loads(self.path.read_text("utf-8"))
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError.
            loaded = None

        if not isinstance(loaded, dict):
            self.data = {"files": {}}
            return

        if not isinstance(loaded.get("files"), dict):
            # Keep any other keys but guarantee a usable "files" mapping.
            loaded["files"] = {}
        self.data = loaded

    def save(self) -> None:
        """Write the cache to ``self.path`` as indented UTF-8 JSON.

        Missing parent directories are created first.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.path.write_text(
            json.dumps(self.data, ensure_ascii=False, indent=2),
            "utf-8",
        )

    def get_file_entry(self, filepath: str) -> Optional[dict]:
        """Return the cached entry for *filepath*, or ``None`` if absent."""
        return self.data.get("files", {}).get(filepath)

    def put_file_entry(self, filepath: str, stat_sig: dict, units, blocks) -> None:
        """Store or overwrite the entry for *filepath*.

        Args:
            filepath: key of the entry.
            stat_sig: stat signature stored under ``"stat"``.
            units: iterable of dataclass instances, serialized via ``asdict``.
            blocks: iterable of dataclass instances, serialized via ``asdict``.
        """
        self.data.setdefault("files", {})[filepath] = {
            "stat": stat_sig,
            "units": [asdict(u) for u in units],
            "blocks": [asdict(b) for b in blocks],
        }
35
+
36
+
37
def file_stat_signature(path: str) -> dict:
    """Return the ``os.stat`` modification time (ns) and size of *path*.

    The result has exactly two keys, ``"mtime_ns"`` and ``"size"``.
    """
    info = os.stat(path)
    return dict(mtime_ns=info.st_mtime_ns, size=info.st_size)