codeclone-1.1.0.tar.gz → codeclone-1.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {codeclone-1.1.0 → codeclone-1.2.1}/PKG-INFO +62 -34
  2. {codeclone-1.1.0 → codeclone-1.2.1}/README.md +56 -31
  3. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/__init__.py +1 -1
  4. codeclone-1.2.1/codeclone/baseline.py +84 -0
  5. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/blockhash.py +1 -1
  6. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/blocks.py +4 -3
  7. codeclone-1.2.1/codeclone/cache.py +187 -0
  8. codeclone-1.2.1/codeclone/cfg.py +263 -0
  9. codeclone-1.2.1/codeclone/cfg_model.py +47 -0
  10. codeclone-1.2.1/codeclone/cli.py +603 -0
  11. codeclone-1.2.1/codeclone/errors.py +27 -0
  12. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/extractor.py +101 -24
  13. codeclone-1.2.1/codeclone/html_report.py +492 -0
  14. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/normalize.py +43 -13
  15. codeclone-1.2.1/codeclone/py.typed +0 -0
  16. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/report.py +23 -12
  17. codeclone-1.2.1/codeclone/scanner.py +111 -0
  18. codeclone-1.2.1/codeclone/templates.py +1262 -0
  19. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/PKG-INFO +62 -34
  20. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/SOURCES.txt +16 -1
  21. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/requires.txt +3 -0
  22. {codeclone-1.1.0 → codeclone-1.2.1}/pyproject.toml +38 -4
  23. codeclone-1.2.1/tests/test_baseline.py +81 -0
  24. codeclone-1.2.1/tests/test_blockhash.py +11 -0
  25. codeclone-1.2.1/tests/test_blocks.py +107 -0
  26. codeclone-1.2.1/tests/test_cache.py +198 -0
  27. codeclone-1.2.1/tests/test_cfg.py +417 -0
  28. codeclone-1.2.1/tests/test_cfg_model.py +18 -0
  29. codeclone-1.2.1/tests/test_cli_inprocess.py +812 -0
  30. codeclone-1.2.1/tests/test_cli_main_guard.py +17 -0
  31. codeclone-1.2.1/tests/test_cli_smoke.py +110 -0
  32. codeclone-1.2.1/tests/test_cli_unit.py +69 -0
  33. codeclone-1.2.1/tests/test_extractor.py +212 -0
  34. codeclone-1.2.1/tests/test_fingerprint.py +15 -0
  35. codeclone-1.2.1/tests/test_html_report.py +216 -0
  36. codeclone-1.2.1/tests/test_init.py +26 -0
  37. codeclone-1.2.1/tests/test_normalize.py +147 -0
  38. codeclone-1.2.1/tests/test_report.py +67 -0
  39. codeclone-1.2.1/tests/test_scanner_extra.py +165 -0
  40. codeclone-1.2.1/tests/test_security.py +44 -0
  41. codeclone-1.1.0/codeclone/baseline.py +0 -54
  42. codeclone-1.1.0/codeclone/cache.py +0 -50
  43. codeclone-1.1.0/codeclone/cfg.py +0 -173
  44. codeclone-1.1.0/codeclone/cli.py +0 -179
  45. codeclone-1.1.0/codeclone/html_report.py +0 -953
  46. codeclone-1.1.0/codeclone/scanner.py +0 -48
  47. codeclone-1.1.0/tests/test_baseline.py +0 -15
  48. codeclone-1.1.0/tests/test_blocks.py +0 -32
  49. codeclone-1.1.0/tests/test_cfg.py +0 -133
  50. codeclone-1.1.0/tests/test_cli_smoke.py +0 -24
  51. codeclone-1.1.0/tests/test_extractor.py +0 -49
  52. codeclone-1.1.0/tests/test_normalize.py +0 -22
  53. codeclone-1.1.0/tests/test_report.py +0 -24
  54. {codeclone-1.1.0 → codeclone-1.2.1}/LICENSE +0 -0
  55. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone/fingerprint.py +0 -0
  56. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/dependency_links.txt +0 -0
  57. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/entry_points.txt +0 -0
  58. {codeclone-1.1.0 → codeclone-1.2.1}/codeclone.egg-info/top_level.txt +0 -0
  59. {codeclone-1.1.0 → codeclone-1.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.1.0
3
+ Version: 1.2.1
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -10,11 +10,10 @@ Project-URL: Repository, https://github.com/orenlab/codeclone
10
10
  Project-URL: Issues, https://github.com/orenlab/codeclone/issues
11
11
  Project-URL: Changelog, https://github.com/orenlab/codeclone/releases
12
12
  Project-URL: Documentation, https://github.com/orenlab/codeclone/tree/main/docs
13
- Keywords: python,ast,code-clone,duplication,static-analysis,ci,architecture
13
+ Keywords: python,ast,cfg,code-clone,duplication,static-analysis,architecture,control-flow,ci
14
14
  Classifier: Development Status :: 5 - Production/Stable
15
15
  Classifier: Intended Audience :: Developers
16
16
  Classifier: Topic :: Software Development :: Quality Assurance
17
- Classifier: Topic :: Software Development :: Code Generators
18
17
  Classifier: Topic :: Software Development :: Testing
19
18
  Classifier: Typing :: Typed
20
19
  Classifier: License :: OSI Approved :: MIT License
@@ -23,26 +22,31 @@ Classifier: Programming Language :: Python :: 3.10
23
22
  Classifier: Programming Language :: Python :: 3.11
24
23
  Classifier: Programming Language :: Python :: 3.12
25
24
  Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Programming Language :: Python :: 3.14
26
26
  Classifier: Operating System :: OS Independent
27
27
  Requires-Python: >=3.10
28
28
  Description-Content-Type: text/markdown
29
29
  License-File: LICENSE
30
30
  Requires-Dist: pygments>=2.19.2
31
+ Requires-Dist: rich>=14.3.2
31
32
  Provides-Extra: dev
32
33
  Requires-Dist: pytest>=9.0.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=6.1.0; extra == "dev"
33
35
  Requires-Dist: build>=1.2.0; extra == "dev"
34
36
  Requires-Dist: twine>=5.0.0; extra == "dev"
35
37
  Requires-Dist: mypy>=1.19.1; extra == "dev"
38
+ Requires-Dist: ruff>=0.12.0; extra == "dev"
36
39
  Dynamic: license-file
37
40
 
38
41
  # CodeClone
39
42
 
40
43
  [![PyPI](https://img.shields.io/pypi/v/codeclone.svg)](https://pypi.org/project/codeclone/)
41
44
  [![Downloads](https://img.shields.io/pypi/dm/codeclone.svg)](https://pypi.org/project/codeclone/)
45
+ [![tests](https://github.com/orenlab/codeclone/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
42
46
  [![Python](https://img.shields.io/pypi/pyversions/codeclone.svg)](https://pypi.org/project/codeclone/)
43
47
  [![License](https://img.shields.io/pypi/l/codeclone.svg)](LICENSE)
44
48
 
45
- **CodeClone** is a Python code clone detector based on **normalized AST and control-flow graphs (CFG)**.
49
+ **CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
46
50
  It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
47
51
 
48
52
  CodeClone is designed to help teams:
@@ -51,15 +55,16 @@ CodeClone is designed to help teams:
51
55
  - identify architectural hotspots,
52
56
  - prevent *new* duplication via CI and pre-commit hooks.
53
57
 
54
- Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against renaming,
55
- formatting, and minor refactoring.
58
+ Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
59
+ renaming, formatting, and minor refactoring.
56
60
 
57
61
  ---
58
62
 
59
63
  ## Why CodeClone?
60
64
 
61
65
  Most existing tools detect *textual* duplication.
62
- CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or architectural drift.
66
+ CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
67
+ architectural drift.
63
68
 
64
69
  Typical use cases:
65
70
 
@@ -77,11 +82,11 @@ Typical use cases:
77
82
  - Detects functions and methods with identical **control-flow structure**.
78
83
  - Based on **Control Flow Graph (CFG)** fingerprinting.
79
84
  - Robust to:
80
- - variable renaming,
81
- - constant changes,
82
- - attribute renaming,
83
- - formatting differences,
84
- - docstrings and type annotations.
85
+ - variable renaming,
86
+ - constant changes,
87
+ - attribute renaming,
88
+ - formatting differences,
89
+ - docstrings and type annotations.
85
90
  - Ideal for spotting architectural duplication across layers.
86
91
 
87
92
  ### Block-level clone detection (Type-3-lite)
@@ -89,24 +94,29 @@ Typical use cases:
89
94
  - Detects repeated **statement blocks** inside larger functions.
90
95
  - Uses sliding windows over CFG-normalized statement sequences.
91
96
  - Targets:
92
- - validation blocks,
93
- - guard clauses,
94
- - repeated orchestration logic.
97
+ - validation blocks,
98
+ - guard clauses,
99
+ - repeated orchestration logic.
95
100
  - Carefully filtered to reduce noise:
96
- - no overlapping windows,
97
- - no clones inside the same function,
98
- - no `__init__` noise,
99
- - size and statement-count thresholds.
101
+ - no overlapping windows,
102
+ - no clones inside the same function,
103
+ - no `__init__` noise,
104
+ - size and statement-count thresholds.
100
105
 
101
106
  ### Control-Flow Awareness (CFG v1)
102
107
 
103
108
  - Each function is converted into a **Control Flow Graph**.
104
109
  - CFG nodes contain normalized AST statements.
105
- - CFG edges represent structural control flow (`if`, `for`, `while`).
110
+ - CFG edges represent structural control flow:
111
+ - `if` / `else`
112
+ - `for` / `async for` / `while`
113
+ - `try` / `except` / `finally`
114
+ - `with` / `async with`
115
+ - `match` / `case` (Python 3.10+)
106
116
  - Current CFG semantics (v1):
107
- - `break` and `continue` are treated as statements (no jump targets),
108
- - after-blocks are explicit and always present,
109
- - focus is on **structural similarity**, not precise runtime semantics.
117
+ - `break` and `continue` are treated as statements (no jump targets),
118
+ - after-blocks are explicit and always present,
119
+ - focus is on **structural similarity**, not precise runtime semantics.
110
120
 
111
121
  This design keeps clone detection **stable, deterministic, and low-noise**.
112
122
 
@@ -115,6 +125,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
115
125
  - AST + CFG normalization instead of token matching.
116
126
  - Conservative defaults tuned for real-world Python projects.
117
127
  - Explicit thresholds for size and statement count.
128
+ - No probabilistic scoring or heuristic similarity thresholds.
118
129
  - Focus on *architectural duplication*, not micro-similarities.
119
130
 
120
131
  ### CI-friendly baseline mode
@@ -154,14 +165,14 @@ Generate reports:
154
165
 
155
166
  ```bash
156
167
  codeclone . \
157
- --json-out .cache/codeclone/report.json \
158
- --text-out .cache/codeclone/report.txt
168
+ --json .cache/codeclone/report.json \
169
+ --text .cache/codeclone/report.txt
159
170
  ```
160
171
 
161
172
  Generate an HTML report:
162
173
 
163
174
  ```bash
164
- codeclone . --html-out .cache/codeclone/report.html
175
+ codeclone . --html .cache/codeclone/report.html
165
176
  ```
166
177
 
167
178
  ---
@@ -181,14 +192,26 @@ Commit the generated baseline file to the repository.
181
192
  ### 2. Use in CI
182
193
 
183
194
  ```bash
184
- codeclone . --fail-on-new
195
+ codeclone . --fail-on-new --no-progress
185
196
  ```
186
197
 
187
198
  Behavior:
188
199
 
189
- - existing clones are allowed,
190
- - build fails if *new* clones appear,
191
- - refactoring that removes duplication is always allowed.
200
+ - existing clones are allowed,
201
+ - the build fails if *new* clones appear,
202
+ - refactoring that removes duplication is always allowed.
203
+
204
+ `--fail-on-new` exits with a non-zero code when new clones are detected.
205
+
206
+ ### Python Version Consistency for Baseline Checks
207
+
208
+ Due to inherent differences in Python’s AST between interpreter versions, baseline
209
+ generation and verification must be performed using the same Python version.
210
+
211
+ This ensures deterministic and reproducible clone detection results.
212
+
213
+ CI checks therefore pin baseline verification to a single Python version, while the
214
+ test matrix continues to validate compatibility across Python 3.10–3.14.
192
215
 
193
216
  ---
194
217
 
@@ -196,14 +219,14 @@ Behavior:
196
219
 
197
220
  ```yaml
198
221
  repos:
199
- - repo: local
222
+ - repo: local
200
223
  hooks:
201
- - id: codeclone
224
+ - id: codeclone
202
225
  name: CodeClone
203
226
  entry: codeclone
204
227
  language: python
205
- args: [".", "--fail-on-new"]
206
- types: [python]
228
+ args: [ ".", "--fail-on-new" ]
229
+ types: [ python ]
207
230
  ```
208
231
 
209
232
  ---
@@ -235,6 +258,10 @@ repos:
235
258
  5. Detect function-level and block-level clones.
236
259
  6. Apply conservative filters to suppress noise.
237
260
 
261
+ See the architectural overview:
262
+
263
+ - [docs/architecture.md](docs/architecture.md)
264
+
238
265
  ---
239
266
 
240
267
  ## Control Flow Graph (CFG)
@@ -245,6 +272,7 @@ to improve structural clone detection robustness.
245
272
  The CFG is a **structural abstraction**, not a runtime execution model.
246
273
 
247
274
  See full design and semantics:
275
+
248
276
  - [docs/cfg.md](docs/cfg.md)
249
277
 
250
278
  ---
@@ -2,10 +2,11 @@
2
2
 
3
3
  [![PyPI](https://img.shields.io/pypi/v/codeclone.svg)](https://pypi.org/project/codeclone/)
4
4
  [![Downloads](https://img.shields.io/pypi/dm/codeclone.svg)](https://pypi.org/project/codeclone/)
5
+ [![tests](https://github.com/orenlab/codeclone/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/orenlab/codeclone/actions/workflows/tests.yml)
5
6
  [![Python](https://img.shields.io/pypi/pyversions/codeclone.svg)](https://pypi.org/project/codeclone/)
6
7
  [![License](https://img.shields.io/pypi/l/codeclone.svg)](LICENSE)
7
8
 
8
- **CodeClone** is a Python code clone detector based on **normalized AST and control-flow graphs (CFG)**.
9
+ **CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**.
9
10
  It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI.
10
11
 
11
12
  CodeClone is designed to help teams:
@@ -14,15 +15,16 @@ CodeClone is designed to help teams:
14
15
  - identify architectural hotspots,
15
16
  - prevent *new* duplication via CI and pre-commit hooks.
16
17
 
17
- Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against renaming,
18
- formatting, and minor refactoring.
18
+ Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against
19
+ renaming, formatting, and minor refactoring.
19
20
 
20
21
  ---
21
22
 
22
23
  ## Why CodeClone?
23
24
 
24
25
  Most existing tools detect *textual* duplication.
25
- CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or architectural drift.
26
+ CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or
27
+ architectural drift.
26
28
 
27
29
  Typical use cases:
28
30
 
@@ -40,11 +42,11 @@ Typical use cases:
40
42
  - Detects functions and methods with identical **control-flow structure**.
41
43
  - Based on **Control Flow Graph (CFG)** fingerprinting.
42
44
  - Robust to:
43
- - variable renaming,
44
- - constant changes,
45
- - attribute renaming,
46
- - formatting differences,
47
- - docstrings and type annotations.
45
+ - variable renaming,
46
+ - constant changes,
47
+ - attribute renaming,
48
+ - formatting differences,
49
+ - docstrings and type annotations.
48
50
  - Ideal for spotting architectural duplication across layers.
49
51
 
50
52
  ### Block-level clone detection (Type-3-lite)
@@ -52,24 +54,29 @@ Typical use cases:
52
54
  - Detects repeated **statement blocks** inside larger functions.
53
55
  - Uses sliding windows over CFG-normalized statement sequences.
54
56
  - Targets:
55
- - validation blocks,
56
- - guard clauses,
57
- - repeated orchestration logic.
57
+ - validation blocks,
58
+ - guard clauses,
59
+ - repeated orchestration logic.
58
60
  - Carefully filtered to reduce noise:
59
- - no overlapping windows,
60
- - no clones inside the same function,
61
- - no `__init__` noise,
62
- - size and statement-count thresholds.
61
+ - no overlapping windows,
62
+ - no clones inside the same function,
63
+ - no `__init__` noise,
64
+ - size and statement-count thresholds.
63
65
 
64
66
  ### Control-Flow Awareness (CFG v1)
65
67
 
66
68
  - Each function is converted into a **Control Flow Graph**.
67
69
  - CFG nodes contain normalized AST statements.
68
- - CFG edges represent structural control flow (`if`, `for`, `while`).
70
+ - CFG edges represent structural control flow:
71
+ - `if` / `else`
72
+ - `for` / `async for` / `while`
73
+ - `try` / `except` / `finally`
74
+ - `with` / `async with`
75
+ - `match` / `case` (Python 3.10+)
69
76
  - Current CFG semantics (v1):
70
- - `break` and `continue` are treated as statements (no jump targets),
71
- - after-blocks are explicit and always present,
72
- - focus is on **structural similarity**, not precise runtime semantics.
77
+ - `break` and `continue` are treated as statements (no jump targets),
78
+ - after-blocks are explicit and always present,
79
+ - focus is on **structural similarity**, not precise runtime semantics.
73
80
 
74
81
  This design keeps clone detection **stable, deterministic, and low-noise**.
75
82
 
@@ -78,6 +85,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**.
78
85
  - AST + CFG normalization instead of token matching.
79
86
  - Conservative defaults tuned for real-world Python projects.
80
87
  - Explicit thresholds for size and statement count.
88
+ - No probabilistic scoring or heuristic similarity thresholds.
81
89
  - Focus on *architectural duplication*, not micro-similarities.
82
90
 
83
91
  ### CI-friendly baseline mode
@@ -117,14 +125,14 @@ Generate reports:
117
125
 
118
126
  ```bash
119
127
  codeclone . \
120
- --json-out .cache/codeclone/report.json \
121
- --text-out .cache/codeclone/report.txt
128
+ --json .cache/codeclone/report.json \
129
+ --text .cache/codeclone/report.txt
122
130
  ```
123
131
 
124
132
  Generate an HTML report:
125
133
 
126
134
  ```bash
127
- codeclone . --html-out .cache/codeclone/report.html
135
+ codeclone . --html .cache/codeclone/report.html
128
136
  ```
129
137
 
130
138
  ---
@@ -144,14 +152,26 @@ Commit the generated baseline file to the repository.
144
152
  ### 2. Use in CI
145
153
 
146
154
  ```bash
147
- codeclone . --fail-on-new
155
+ codeclone . --fail-on-new --no-progress
148
156
  ```
149
157
 
150
158
  Behavior:
151
159
 
152
- - existing clones are allowed,
153
- - build fails if *new* clones appear,
154
- - refactoring that removes duplication is always allowed.
160
+ - existing clones are allowed,
161
+ - the build fails if *new* clones appear,
162
+ - refactoring that removes duplication is always allowed.
163
+
164
+ `--fail-on-new` exits with a non-zero code when new clones are detected.
165
+
166
+ ### Python Version Consistency for Baseline Checks
167
+
168
+ Due to inherent differences in Python’s AST between interpreter versions, baseline
169
+ generation and verification must be performed using the same Python version.
170
+
171
+ This ensures deterministic and reproducible clone detection results.
172
+
173
+ CI checks therefore pin baseline verification to a single Python version, while the
174
+ test matrix continues to validate compatibility across Python 3.10–3.14.
155
175
 
156
176
  ---
157
177
 
@@ -159,14 +179,14 @@ Behavior:
159
179
 
160
180
  ```yaml
161
181
  repos:
162
- - repo: local
182
+ - repo: local
163
183
  hooks:
164
- - id: codeclone
184
+ - id: codeclone
165
185
  name: CodeClone
166
186
  entry: codeclone
167
187
  language: python
168
- args: [".", "--fail-on-new"]
169
- types: [python]
188
+ args: [ ".", "--fail-on-new" ]
189
+ types: [ python ]
170
190
  ```
171
191
 
172
192
  ---
@@ -198,6 +218,10 @@ repos:
198
218
  5. Detect function-level and block-level clones.
199
219
  6. Apply conservative filters to suppress noise.
200
220
 
221
+ See the architectural overview:
222
+
223
+ - [docs/architecture.md](docs/architecture.md)
224
+
201
225
  ---
202
226
 
203
227
  ## Control Flow Graph (CFG)
@@ -208,6 +232,7 @@ to improve structural clone detection robustness.
208
232
  The CFG is a **structural abstraction**, not a runtime execution model.
209
233
 
210
234
  See full design and semantics:
235
+
211
236
  - [docs/cfg.md](docs/cfg.md)
212
237
 
213
238
  ---
@@ -6,7 +6,7 @@ Copyright (c) 2026 Den Rozhnovskiy
6
6
  Licensed under the MIT License.
7
7
  """
8
8
 
9
- from importlib.metadata import version, PackageNotFoundError
9
+ from importlib.metadata import PackageNotFoundError, version
10
10
 
11
11
  try:
12
12
  __version__ = version("codeclone")
@@ -0,0 +1,84 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from collections.abc import Mapping
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+
17
+ class Baseline:
18
+ __slots__ = ("blocks", "functions", "path", "python_version")
19
+
20
+ def __init__(self, path: str | Path):
21
+ self.path = Path(path)
22
+ self.functions: set[str] = set()
23
+ self.blocks: set[str] = set()
24
+ self.python_version: str | None = None
25
+
26
+ def load(self) -> None:
27
+ if not self.path.exists():
28
+ return
29
+
30
+ try:
31
+ data = json.loads(self.path.read_text("utf-8"))
32
+ self.functions = set(data.get("functions", []))
33
+ self.blocks = set(data.get("blocks", []))
34
+ python_version = data.get("python_version")
35
+ self.python_version = (
36
+ python_version if isinstance(python_version, str) else None
37
+ )
38
+ except json.JSONDecodeError as e:
39
+ raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e
40
+
41
+ def save(self) -> None:
42
+ self.path.parent.mkdir(parents=True, exist_ok=True)
43
+ self.path.write_text(
44
+ json.dumps(
45
+ _baseline_payload(self.functions, self.blocks, self.python_version),
46
+ indent=2,
47
+ ensure_ascii=False,
48
+ ),
49
+ "utf-8",
50
+ )
51
+
52
+ @staticmethod
53
+ def from_groups(
54
+ func_groups: Mapping[str, object],
55
+ block_groups: Mapping[str, object],
56
+ path: str | Path = "",
57
+ python_version: str | None = None,
58
+ ) -> Baseline:
59
+ bl = Baseline(path)
60
+ bl.functions = set(func_groups.keys())
61
+ bl.blocks = set(block_groups.keys())
62
+ bl.python_version = python_version
63
+ return bl
64
+
65
+ def diff(
66
+ self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
67
+ ) -> tuple[set[str], set[str]]:
68
+ new_funcs = set(func_groups.keys()) - self.functions
69
+ new_blocks = set(block_groups.keys()) - self.blocks
70
+ return new_funcs, new_blocks
71
+
72
+
73
+ def _baseline_payload(
74
+ functions: set[str],
75
+ blocks: set[str],
76
+ python_version: str | None,
77
+ ) -> dict[str, Any]:
78
+ payload: dict[str, Any] = {
79
+ "functions": sorted(functions),
80
+ "blocks": sorted(blocks),
81
+ }
82
+ if python_version:
83
+ payload["python_version"] = python_version
84
+ return payload
@@ -11,7 +11,7 @@ from __future__ import annotations
11
11
  import ast
12
12
  import hashlib
13
13
 
14
- from .normalize import NormalizationConfig, AstNormalizer
14
+ from .normalize import AstNormalizer, NormalizationConfig
15
15
 
16
16
 
17
17
  def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
@@ -15,7 +15,7 @@ from .blockhash import stmt_hash
15
15
  from .normalize import NormalizationConfig
16
16
 
17
17
 
18
- @dataclass(frozen=True)
18
+ @dataclass(frozen=True, slots=True)
19
19
  class BlockUnit:
20
20
  block_hash: str
21
21
  filepath: str
@@ -42,7 +42,8 @@ def extract_blocks(
42
42
 
43
43
  blocks: list[BlockUnit] = []
44
44
  last_start: int | None = None
45
- MIN_LINE_DISTANCE = 5 # suppress overlapping windows
45
+ # Allow some overlap (50%), but at least 3 lines apart
46
+ min_line_distance = max(block_size // 2, 3)
46
47
 
47
48
  for i in range(len(stmt_hashes) - block_size + 1):
48
49
  start = getattr(body[i], "lineno", None)
@@ -50,7 +51,7 @@ def extract_blocks(
50
51
  if not start or not end:
51
52
  continue
52
53
 
53
- if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
54
+ if last_start is not None and start - last_start < min_line_distance:
54
55
  continue
55
56
 
56
57
  bh = "|".join(stmt_hashes[i : i + block_size])