codeclone 1.4.2__tar.gz → 1.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeclone-1.4.2 → codeclone-1.4.4}/LICENSE +1 -1
- {codeclone-1.4.2 → codeclone-1.4.4}/PKG-INFO +13 -12
- {codeclone-1.4.2 → codeclone-1.4.4}/README.md +12 -11
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_html_snippets.py +50 -24
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_explain.py +93 -23
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cache.py +59 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cli.py +2 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/contracts.py +1 -1
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/PKG-INFO +13 -12
- {codeclone-1.4.2 → codeclone-1.4.4}/pyproject.toml +1 -1
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cache.py +93 -26
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_inprocess.py +117 -1
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_html_report.py +2 -2
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_report.py +3 -3
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/__init__.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_args.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_meta.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_paths.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_summary.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_html_escape.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_blocks.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_explain_contract.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_grouping.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_segments.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_serialize.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_types.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/baseline.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/blockhash.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/blocks.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cfg.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cfg_model.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/errors.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/extractor.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/fingerprint.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/html_report.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/meta_markers.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/normalize.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/py.typed +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/report.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/scanner.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/templates.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/ui_messages.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/SOURCES.txt +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/dependency_links.txt +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/entry_points.txt +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/requires.txt +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/top_level.txt +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/setup.cfg +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_baseline.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_blockhash.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_blocks.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cfg.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cfg_model.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_main_guard.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_main_guard_runpy.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_smoke.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_unit.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_detector_golden.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_extractor.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_fingerprint.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_init.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_normalize.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_report_explain.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_scanner_extra.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_security.py +0 -0
- {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_segments.py +0 -0
|
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeclone
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.4.4
|
|
4
4
|
Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
|
|
5
5
|
Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
6
6
|
Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
@@ -49,7 +49,7 @@ Dynamic: license-file
|
|
|
49
49
|

|
|
50
50
|
[](LICENSE)
|
|
51
51
|
|
|
52
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
|
|
52
|
+
**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
|
|
53
53
|
It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI.
|
|
54
54
|
|
|
55
55
|
---
|
|
@@ -75,13 +75,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak
|
|
|
75
75
|
|
|
76
76
|
**Three Detection Levels:**
|
|
77
77
|
|
|
78
|
-
1. **Function clones (CFG fingerprint)**
|
|
78
|
+
1. **Function clones (CFG fingerprint)**
|
|
79
79
|
Strong structural signal for cross-layer duplication
|
|
80
80
|
|
|
81
|
-
2. **Block clones (statement windows)**
|
|
81
|
+
2. **Block clones (statement windows)**
|
|
82
82
|
Detects repeated local logic patterns
|
|
83
83
|
|
|
84
|
-
3. **Segment clones (report-only)**
|
|
84
|
+
3. **Segment clones (report-only)**
|
|
85
85
|
Internal function repetition for explainability; not used for baseline gating
|
|
86
86
|
|
|
87
87
|
**CI-Ready Features:**
|
|
@@ -158,12 +158,12 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
|
|
|
158
158
|
|
|
159
159
|
CodeClone uses a deterministic exit code contract:
|
|
160
160
|
|
|
161
|
-
| Code | Meaning
|
|
162
|
-
|
|
163
|
-
| `0` | Success — run completed without gating failures
|
|
161
|
+
| Code | Meaning |
|
|
162
|
+
|------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|
163
|
+
| `0` | Success — run completed without gating failures |
|
|
164
164
|
| `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating |
|
|
165
|
-
| `3` | Gating failure — new clones detected or threshold exceeded
|
|
166
|
-
| `5` | Internal error — unexpected exception
|
|
165
|
+
| `3` | Gating failure — new clones detected or threshold exceeded |
|
|
166
|
+
| `5` | Internal error — unexpected exception |
|
|
167
167
|
|
|
168
168
|
**Priority:** Contract errors (`2`) override gating failures (`3`) when both occur.
|
|
169
169
|
|
|
@@ -223,7 +223,7 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
|
|
|
223
223
|
"cache_path": "/path/to/.cache/codeclone/cache.json",
|
|
224
224
|
"cache_used": true,
|
|
225
225
|
"cache_status": "ok",
|
|
226
|
-
"cache_schema_version": "1.
|
|
226
|
+
"cache_schema_version": "1.3",
|
|
227
227
|
"files_skipped_source_io": 0,
|
|
228
228
|
"groups_counts": {
|
|
229
229
|
"functions": {
|
|
@@ -304,7 +304,8 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
|
|
|
304
304
|
Cache is an optimization layer only and is never a source of truth.
|
|
305
305
|
|
|
306
306
|
- Default path: `<root>/.cache/codeclone/cache.json`
|
|
307
|
-
- Schema version: **v1.
|
|
307
|
+
- Schema version: **v1.3**
|
|
308
|
+
- Compatibility includes analysis profile (`min_loc`, `min_stmt`)
|
|
308
309
|
- Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
|
|
309
310
|
|
|
310
311
|
Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|

|
|
9
9
|
[](LICENSE)
|
|
10
10
|
|
|
11
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
|
|
11
|
+
**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
|
|
12
12
|
It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI.
|
|
13
13
|
|
|
14
14
|
---
|
|
@@ -34,13 +34,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak
|
|
|
34
34
|
|
|
35
35
|
**Three Detection Levels:**
|
|
36
36
|
|
|
37
|
-
1. **Function clones (CFG fingerprint)**
|
|
37
|
+
1. **Function clones (CFG fingerprint)**
|
|
38
38
|
Strong structural signal for cross-layer duplication
|
|
39
39
|
|
|
40
|
-
2. **Block clones (statement windows)**
|
|
40
|
+
2. **Block clones (statement windows)**
|
|
41
41
|
Detects repeated local logic patterns
|
|
42
42
|
|
|
43
|
-
3. **Segment clones (report-only)**
|
|
43
|
+
3. **Segment clones (report-only)**
|
|
44
44
|
Internal function repetition for explainability; not used for baseline gating
|
|
45
45
|
|
|
46
46
|
**CI-Ready Features:**
|
|
@@ -117,12 +117,12 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
|
|
|
117
117
|
|
|
118
118
|
CodeClone uses a deterministic exit code contract:
|
|
119
119
|
|
|
120
|
-
| Code | Meaning
|
|
121
|
-
|
|
122
|
-
| `0` | Success — run completed without gating failures
|
|
120
|
+
| Code | Meaning |
|
|
121
|
+
|------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|
122
|
+
| `0` | Success — run completed without gating failures |
|
|
123
123
|
| `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating |
|
|
124
|
-
| `3` | Gating failure — new clones detected or threshold exceeded
|
|
125
|
-
| `5` | Internal error — unexpected exception
|
|
124
|
+
| `3` | Gating failure — new clones detected or threshold exceeded |
|
|
125
|
+
| `5` | Internal error — unexpected exception |
|
|
126
126
|
|
|
127
127
|
**Priority:** Contract errors (`2`) override gating failures (`3`) when both occur.
|
|
128
128
|
|
|
@@ -182,7 +182,7 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
|
|
|
182
182
|
"cache_path": "/path/to/.cache/codeclone/cache.json",
|
|
183
183
|
"cache_used": true,
|
|
184
184
|
"cache_status": "ok",
|
|
185
|
-
"cache_schema_version": "1.
|
|
185
|
+
"cache_schema_version": "1.3",
|
|
186
186
|
"files_skipped_source_io": 0,
|
|
187
187
|
"groups_counts": {
|
|
188
188
|
"functions": {
|
|
@@ -263,7 +263,8 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
|
|
|
263
263
|
Cache is an optimization layer only and is never a source of truth.
|
|
264
264
|
|
|
265
265
|
- Default path: `<root>/.cache/codeclone/cache.json`
|
|
266
|
-
- Schema version: **v1.
|
|
266
|
+
- Schema version: **v1.3**
|
|
267
|
+
- Compatibility includes analysis profile (`min_loc`, `min_stmt`)
|
|
267
268
|
- Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
|
|
268
269
|
|
|
269
270
|
Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
|
|
@@ -14,6 +14,7 @@ import itertools
|
|
|
14
14
|
from collections.abc import Iterable
|
|
15
15
|
from dataclasses import dataclass
|
|
16
16
|
from functools import lru_cache
|
|
17
|
+
from types import ModuleType
|
|
17
18
|
from typing import NamedTuple, cast
|
|
18
19
|
|
|
19
20
|
from .errors import FileProcessingError
|
|
@@ -34,33 +35,19 @@ class _Snippet:
|
|
|
34
35
|
|
|
35
36
|
|
|
36
37
|
class _FileCache:
|
|
37
|
-
__slots__ = ("
|
|
38
|
+
__slots__ = ("_get_file_lines_impl", "maxsize")
|
|
38
39
|
|
|
39
40
|
def __init__(self, maxsize: int = 128) -> None:
|
|
40
41
|
self.maxsize = maxsize
|
|
41
|
-
self.
|
|
42
|
+
self._get_file_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_lines)
|
|
42
43
|
|
|
43
44
|
@staticmethod
|
|
44
|
-
def
|
|
45
|
-
filepath: str, start_line: int, end_line: int
|
|
46
|
-
) -> tuple[str, ...]:
|
|
47
|
-
if start_line < 1:
|
|
48
|
-
start_line = 1
|
|
49
|
-
if end_line < start_line:
|
|
50
|
-
return ()
|
|
51
|
-
|
|
45
|
+
def _read_file_lines(filepath: str) -> tuple[str, ...]:
|
|
52
46
|
try:
|
|
53
47
|
|
|
54
48
|
def _read_with_errors(errors: str) -> tuple[str, ...]:
|
|
55
|
-
lines: list[str] = []
|
|
56
49
|
with open(filepath, encoding="utf-8", errors=errors) as f:
|
|
57
|
-
for
|
|
58
|
-
if lineno < start_line:
|
|
59
|
-
continue
|
|
60
|
-
if lineno > end_line:
|
|
61
|
-
break
|
|
62
|
-
lines.append(line.rstrip("\n"))
|
|
63
|
-
return tuple(lines)
|
|
50
|
+
return tuple(line.rstrip("\n") for line in f)
|
|
64
51
|
|
|
65
52
|
try:
|
|
66
53
|
return _read_with_errors("strict")
|
|
@@ -72,7 +59,16 @@ class _FileCache:
|
|
|
72
59
|
def get_lines_range(
|
|
73
60
|
self, filepath: str, start_line: int, end_line: int
|
|
74
61
|
) -> tuple[str, ...]:
|
|
75
|
-
|
|
62
|
+
if start_line < 1:
|
|
63
|
+
start_line = 1
|
|
64
|
+
if end_line < start_line:
|
|
65
|
+
return ()
|
|
66
|
+
lines = self._get_file_lines_impl(filepath)
|
|
67
|
+
start_index = start_line - 1
|
|
68
|
+
if start_index >= len(lines):
|
|
69
|
+
return ()
|
|
70
|
+
end_index = min(len(lines), end_line)
|
|
71
|
+
return lines[start_index:end_index]
|
|
76
72
|
|
|
77
73
|
class _CacheInfo(NamedTuple):
|
|
78
74
|
hits: int
|
|
@@ -81,10 +77,30 @@ class _FileCache:
|
|
|
81
77
|
currsize: int
|
|
82
78
|
|
|
83
79
|
def cache_info(self) -> _CacheInfo:
|
|
84
|
-
return cast(_FileCache._CacheInfo, self.
|
|
80
|
+
return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info())
|
|
85
81
|
|
|
86
82
|
|
|
87
|
-
|
|
83
|
+
_PYGMENTS_IMPORTER_ID: int | None = None
|
|
84
|
+
_PYGMENTS_API: tuple[ModuleType, ModuleType, ModuleType] | None = None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _load_pygments_api() -> tuple[ModuleType, ModuleType, ModuleType] | None:
|
|
88
|
+
"""
|
|
89
|
+
Load pygments modules once per import-function identity.
|
|
90
|
+
|
|
91
|
+
Tests monkeypatch `importlib.import_module`; tracking importer identity keeps
|
|
92
|
+
behavior deterministic and allows import-error branches to stay testable.
|
|
93
|
+
"""
|
|
94
|
+
global _PYGMENTS_IMPORTER_ID
|
|
95
|
+
global _PYGMENTS_API
|
|
96
|
+
|
|
97
|
+
importer_id = id(importlib.import_module)
|
|
98
|
+
if importer_id != _PYGMENTS_IMPORTER_ID:
|
|
99
|
+
_PYGMENTS_IMPORTER_ID = importer_id
|
|
100
|
+
_PYGMENTS_API = None
|
|
101
|
+
if _PYGMENTS_API is not None:
|
|
102
|
+
return _PYGMENTS_API
|
|
103
|
+
|
|
88
104
|
try:
|
|
89
105
|
pygments = importlib.import_module("pygments")
|
|
90
106
|
formatters = importlib.import_module("pygments.formatters")
|
|
@@ -92,6 +108,16 @@ def _try_pygments(code: str) -> str | None:
|
|
|
92
108
|
except ImportError:
|
|
93
109
|
return None
|
|
94
110
|
|
|
111
|
+
_PYGMENTS_API = (pygments, formatters, lexers)
|
|
112
|
+
return _PYGMENTS_API
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _try_pygments(code: str) -> str | None:
|
|
116
|
+
pygments_api = _load_pygments_api()
|
|
117
|
+
if pygments_api is None:
|
|
118
|
+
return None
|
|
119
|
+
pygments, formatters, lexers = pygments_api
|
|
120
|
+
|
|
95
121
|
highlight = pygments.highlight
|
|
96
122
|
formatter_cls = formatters.HtmlFormatter
|
|
97
123
|
lexer_cls = lexers.PythonLexer
|
|
@@ -104,10 +130,10 @@ def _pygments_css(style_name: str) -> str:
|
|
|
104
130
|
Returns CSS for pygments tokens. Scoped to `.codebox` to avoid leaking styles.
|
|
105
131
|
If Pygments is not available or style missing, returns "".
|
|
106
132
|
"""
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
except ImportError:
|
|
133
|
+
pygments_api = _load_pygments_api()
|
|
134
|
+
if pygments_api is None:
|
|
110
135
|
return ""
|
|
136
|
+
_, formatters, _ = pygments_api
|
|
111
137
|
|
|
112
138
|
try:
|
|
113
139
|
formatter_cls = formatters.HtmlFormatter
|
|
@@ -9,6 +9,8 @@ Licensed under the MIT License.
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import ast
|
|
12
|
+
from bisect import bisect_left, bisect_right
|
|
13
|
+
from dataclasses import dataclass
|
|
12
14
|
from pathlib import Path
|
|
13
15
|
|
|
14
16
|
from ._report_explain_contract import (
|
|
@@ -23,6 +25,19 @@ from ._report_explain_contract import (
|
|
|
23
25
|
from ._report_types import GroupItem, GroupMap
|
|
24
26
|
|
|
25
27
|
|
|
28
|
+
@dataclass(frozen=True, slots=True)
|
|
29
|
+
class _StatementRecord:
|
|
30
|
+
node: ast.stmt
|
|
31
|
+
start_line: int
|
|
32
|
+
end_line: int
|
|
33
|
+
start_col: int
|
|
34
|
+
end_col: int
|
|
35
|
+
type_name: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
_StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]]
|
|
39
|
+
|
|
40
|
+
|
|
26
41
|
def _signature_parts(group_key: str) -> list[str]:
|
|
27
42
|
return [part for part in group_key.split("|") if part]
|
|
28
43
|
|
|
@@ -42,6 +57,53 @@ def _parsed_file_tree(
|
|
|
42
57
|
return tree
|
|
43
58
|
|
|
44
59
|
|
|
60
|
+
def _build_statement_index(tree: ast.AST) -> _StatementIndex:
|
|
61
|
+
records = tuple(
|
|
62
|
+
sorted(
|
|
63
|
+
(
|
|
64
|
+
_StatementRecord(
|
|
65
|
+
node=node,
|
|
66
|
+
start_line=int(getattr(node, "lineno", 0)),
|
|
67
|
+
end_line=int(getattr(node, "end_lineno", 0)),
|
|
68
|
+
start_col=int(getattr(node, "col_offset", 0)),
|
|
69
|
+
end_col=int(getattr(node, "end_col_offset", 0)),
|
|
70
|
+
type_name=type(node).__name__,
|
|
71
|
+
)
|
|
72
|
+
for node in ast.walk(tree)
|
|
73
|
+
if isinstance(node, ast.stmt)
|
|
74
|
+
),
|
|
75
|
+
key=lambda record: (
|
|
76
|
+
record.start_line,
|
|
77
|
+
record.end_line,
|
|
78
|
+
record.start_col,
|
|
79
|
+
record.end_col,
|
|
80
|
+
record.type_name,
|
|
81
|
+
),
|
|
82
|
+
)
|
|
83
|
+
)
|
|
84
|
+
start_lines = tuple(record.start_line for record in records)
|
|
85
|
+
return records, start_lines
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _parsed_statement_index(
|
|
89
|
+
filepath: str,
|
|
90
|
+
*,
|
|
91
|
+
ast_cache: dict[str, ast.AST | None],
|
|
92
|
+
stmt_index_cache: dict[str, _StatementIndex | None],
|
|
93
|
+
) -> _StatementIndex | None:
|
|
94
|
+
if filepath in stmt_index_cache:
|
|
95
|
+
return stmt_index_cache[filepath]
|
|
96
|
+
|
|
97
|
+
tree = _parsed_file_tree(filepath, ast_cache=ast_cache)
|
|
98
|
+
if tree is None:
|
|
99
|
+
stmt_index_cache[filepath] = None
|
|
100
|
+
return None
|
|
101
|
+
|
|
102
|
+
index = _build_statement_index(tree)
|
|
103
|
+
stmt_index_cache[filepath] = index
|
|
104
|
+
return index
|
|
105
|
+
|
|
106
|
+
|
|
45
107
|
def _is_assert_like_stmt(stmt: ast.stmt) -> bool:
|
|
46
108
|
if isinstance(stmt, ast.Assert):
|
|
47
109
|
return True
|
|
@@ -64,45 +126,42 @@ def _assert_range_stats(
|
|
|
64
126
|
start_line: int,
|
|
65
127
|
end_line: int,
|
|
66
128
|
ast_cache: dict[str, ast.AST | None],
|
|
129
|
+
stmt_index_cache: dict[str, _StatementIndex | None],
|
|
67
130
|
range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
|
|
68
131
|
) -> tuple[int, int, int]:
|
|
69
132
|
cache_key = (filepath, start_line, end_line)
|
|
70
133
|
if cache_key in range_cache:
|
|
71
134
|
return range_cache[cache_key]
|
|
72
135
|
|
|
73
|
-
|
|
74
|
-
|
|
136
|
+
statement_index = _parsed_statement_index(
|
|
137
|
+
filepath,
|
|
138
|
+
ast_cache=ast_cache,
|
|
139
|
+
stmt_index_cache=stmt_index_cache,
|
|
140
|
+
)
|
|
141
|
+
if statement_index is None:
|
|
75
142
|
range_cache[cache_key] = (0, 0, 0)
|
|
76
143
|
return 0, 0, 0
|
|
77
144
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
for node in ast.walk(tree)
|
|
81
|
-
if isinstance(node, ast.stmt)
|
|
82
|
-
and int(getattr(node, "lineno", 0)) >= start_line
|
|
83
|
-
and int(getattr(node, "end_lineno", 0)) <= end_line
|
|
84
|
-
]
|
|
85
|
-
if not stmts:
|
|
145
|
+
records, start_lines = statement_index
|
|
146
|
+
if not records:
|
|
86
147
|
range_cache[cache_key] = (0, 0, 0)
|
|
87
148
|
return 0, 0, 0
|
|
88
149
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
int(getattr(stmt, "col_offset", 0)),
|
|
95
|
-
int(getattr(stmt, "end_col_offset", 0)),
|
|
96
|
-
type(stmt).__name__,
|
|
97
|
-
),
|
|
98
|
-
)
|
|
150
|
+
left = bisect_left(start_lines, start_line)
|
|
151
|
+
right = bisect_right(start_lines, end_line)
|
|
152
|
+
if left >= right:
|
|
153
|
+
range_cache[cache_key] = (0, 0, 0)
|
|
154
|
+
return 0, 0, 0
|
|
99
155
|
|
|
100
|
-
total =
|
|
156
|
+
total = 0
|
|
101
157
|
assert_like = 0
|
|
102
158
|
max_consecutive = 0
|
|
103
159
|
current_consecutive = 0
|
|
104
|
-
for
|
|
105
|
-
if
|
|
160
|
+
for record in records[left:right]:
|
|
161
|
+
if record.end_line > end_line:
|
|
162
|
+
continue
|
|
163
|
+
total += 1
|
|
164
|
+
if _is_assert_like_stmt(record.node):
|
|
106
165
|
assert_like += 1
|
|
107
166
|
current_consecutive += 1
|
|
108
167
|
if current_consecutive > max_consecutive:
|
|
@@ -110,6 +169,10 @@ def _assert_range_stats(
|
|
|
110
169
|
else:
|
|
111
170
|
current_consecutive = 0
|
|
112
171
|
|
|
172
|
+
if total == 0:
|
|
173
|
+
range_cache[cache_key] = (0, 0, 0)
|
|
174
|
+
return 0, 0, 0
|
|
175
|
+
|
|
113
176
|
stats = (total, assert_like, max_consecutive)
|
|
114
177
|
range_cache[cache_key] = stats
|
|
115
178
|
return stats
|
|
@@ -121,6 +184,7 @@ def _is_assert_only_range(
|
|
|
121
184
|
start_line: int,
|
|
122
185
|
end_line: int,
|
|
123
186
|
ast_cache: dict[str, ast.AST | None],
|
|
187
|
+
stmt_index_cache: dict[str, _StatementIndex | None],
|
|
124
188
|
range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
|
|
125
189
|
) -> bool:
|
|
126
190
|
total, assert_like, _ = _assert_range_stats(
|
|
@@ -128,6 +192,7 @@ def _is_assert_only_range(
|
|
|
128
192
|
start_line=start_line,
|
|
129
193
|
end_line=end_line,
|
|
130
194
|
ast_cache=ast_cache,
|
|
195
|
+
stmt_index_cache=stmt_index_cache,
|
|
131
196
|
range_cache=range_cache,
|
|
132
197
|
)
|
|
133
198
|
return total > 0 and total == assert_like
|
|
@@ -157,6 +222,7 @@ def _enrich_with_assert_facts(
|
|
|
157
222
|
facts: dict[str, str],
|
|
158
223
|
items: list[GroupItem],
|
|
159
224
|
ast_cache: dict[str, ast.AST | None],
|
|
225
|
+
stmt_index_cache: dict[str, _StatementIndex | None],
|
|
160
226
|
range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
|
|
161
227
|
) -> None:
|
|
162
228
|
assert_only = True
|
|
@@ -181,6 +247,7 @@ def _enrich_with_assert_facts(
|
|
|
181
247
|
start_line=start_line,
|
|
182
248
|
end_line=end_line,
|
|
183
249
|
ast_cache=ast_cache,
|
|
250
|
+
stmt_index_cache=stmt_index_cache,
|
|
184
251
|
range_cache=range_cache,
|
|
185
252
|
)
|
|
186
253
|
total_statements += range_total
|
|
@@ -198,6 +265,7 @@ def _enrich_with_assert_facts(
|
|
|
198
265
|
start_line=start_line,
|
|
199
266
|
end_line=end_line,
|
|
200
267
|
ast_cache=ast_cache,
|
|
268
|
+
stmt_index_cache=stmt_index_cache,
|
|
201
269
|
range_cache=range_cache,
|
|
202
270
|
)
|
|
203
271
|
):
|
|
@@ -223,6 +291,7 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]]
|
|
|
223
291
|
Renderers (HTML/TXT/JSON) should only display these facts.
|
|
224
292
|
"""
|
|
225
293
|
ast_cache: dict[str, ast.AST | None] = {}
|
|
294
|
+
stmt_index_cache: dict[str, _StatementIndex | None] = {}
|
|
226
295
|
range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {}
|
|
227
296
|
facts_by_group: dict[str, dict[str, str]] = {}
|
|
228
297
|
|
|
@@ -232,6 +301,7 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]]
|
|
|
232
301
|
facts=facts,
|
|
233
302
|
items=items,
|
|
234
303
|
ast_cache=ast_cache,
|
|
304
|
+
stmt_index_cache=stmt_index_cache,
|
|
235
305
|
range_cache=range_cache,
|
|
236
306
|
)
|
|
237
307
|
group_arity = len(items)
|
|
@@ -39,6 +39,7 @@ class CacheStatus(str, Enum):
|
|
|
39
39
|
VERSION_MISMATCH = "version_mismatch"
|
|
40
40
|
PYTHON_TAG_MISMATCH = "python_tag_mismatch"
|
|
41
41
|
FINGERPRINT_MISMATCH = "mismatch_fingerprint_version"
|
|
42
|
+
ANALYSIS_PROFILE_MISMATCH = "analysis_profile_mismatch"
|
|
42
43
|
INTEGRITY_FAILED = "integrity_failed"
|
|
43
44
|
|
|
44
45
|
|
|
@@ -84,15 +85,22 @@ class CacheEntry(TypedDict):
|
|
|
84
85
|
segments: list[SegmentDict]
|
|
85
86
|
|
|
86
87
|
|
|
88
|
+
class AnalysisProfile(TypedDict):
|
|
89
|
+
min_loc: int
|
|
90
|
+
min_stmt: int
|
|
91
|
+
|
|
92
|
+
|
|
87
93
|
class CacheData(TypedDict):
|
|
88
94
|
version: str
|
|
89
95
|
python_tag: str
|
|
90
96
|
fingerprint_version: str
|
|
97
|
+
analysis_profile: AnalysisProfile
|
|
91
98
|
files: dict[str, CacheEntry]
|
|
92
99
|
|
|
93
100
|
|
|
94
101
|
class Cache:
|
|
95
102
|
__slots__ = (
|
|
103
|
+
"analysis_profile",
|
|
96
104
|
"cache_schema_version",
|
|
97
105
|
"data",
|
|
98
106
|
"fingerprint_version",
|
|
@@ -112,14 +120,21 @@ class Cache:
|
|
|
112
120
|
*,
|
|
113
121
|
root: str | Path | None = None,
|
|
114
122
|
max_size_bytes: int | None = None,
|
|
123
|
+
min_loc: int = 15,
|
|
124
|
+
min_stmt: int = 6,
|
|
115
125
|
):
|
|
116
126
|
self.path = Path(path)
|
|
117
127
|
self.root = _resolve_root(root)
|
|
118
128
|
self.fingerprint_version = BASELINE_FINGERPRINT_VERSION
|
|
129
|
+
self.analysis_profile: AnalysisProfile = {
|
|
130
|
+
"min_loc": min_loc,
|
|
131
|
+
"min_stmt": min_stmt,
|
|
132
|
+
}
|
|
119
133
|
self.data: CacheData = _empty_cache_data(
|
|
120
134
|
version=self._CACHE_VERSION,
|
|
121
135
|
python_tag=current_python_tag(),
|
|
122
136
|
fingerprint_version=self.fingerprint_version,
|
|
137
|
+
analysis_profile=self.analysis_profile,
|
|
123
138
|
)
|
|
124
139
|
self.legacy_secret_warning = self._detect_legacy_secret_warning()
|
|
125
140
|
self.cache_schema_version: str | None = None
|
|
@@ -164,6 +179,7 @@ class Cache:
|
|
|
164
179
|
version=self._CACHE_VERSION,
|
|
165
180
|
python_tag=current_python_tag(),
|
|
166
181
|
fingerprint_version=self.fingerprint_version,
|
|
182
|
+
analysis_profile=self.analysis_profile,
|
|
167
183
|
)
|
|
168
184
|
|
|
169
185
|
def _sign_data(self, data: Mapping[str, object]) -> str:
|
|
@@ -309,6 +325,28 @@ class Cache:
|
|
|
309
325
|
)
|
|
310
326
|
return None
|
|
311
327
|
|
|
328
|
+
analysis_profile = _as_analysis_profile(payload.get("ap"))
|
|
329
|
+
if analysis_profile is None:
|
|
330
|
+
self._ignore_cache(
|
|
331
|
+
"Cache format invalid; ignoring cache.",
|
|
332
|
+
status=CacheStatus.INVALID_TYPE,
|
|
333
|
+
schema_version=version,
|
|
334
|
+
)
|
|
335
|
+
return None
|
|
336
|
+
|
|
337
|
+
if analysis_profile != self.analysis_profile:
|
|
338
|
+
self._ignore_cache(
|
|
339
|
+
"Cache analysis profile mismatch "
|
|
340
|
+
f"(found min_loc={analysis_profile['min_loc']}, "
|
|
341
|
+
f"min_stmt={analysis_profile['min_stmt']}; "
|
|
342
|
+
f"expected min_loc={self.analysis_profile['min_loc']}, "
|
|
343
|
+
f"min_stmt={self.analysis_profile['min_stmt']}); "
|
|
344
|
+
"ignoring cache.",
|
|
345
|
+
status=CacheStatus.ANALYSIS_PROFILE_MISMATCH,
|
|
346
|
+
schema_version=version,
|
|
347
|
+
)
|
|
348
|
+
return None
|
|
349
|
+
|
|
312
350
|
files_obj = payload.get("files")
|
|
313
351
|
files_dict = _as_str_dict(files_obj)
|
|
314
352
|
if files_dict is None:
|
|
@@ -337,6 +375,7 @@ class Cache:
|
|
|
337
375
|
"version": self._CACHE_VERSION,
|
|
338
376
|
"python_tag": runtime_tag,
|
|
339
377
|
"fingerprint_version": self.fingerprint_version,
|
|
378
|
+
"analysis_profile": self.analysis_profile,
|
|
340
379
|
"files": parsed_files,
|
|
341
380
|
}
|
|
342
381
|
|
|
@@ -356,6 +395,7 @@ class Cache:
|
|
|
356
395
|
payload: dict[str, object] = {
|
|
357
396
|
"py": current_python_tag(),
|
|
358
397
|
"fp": self.fingerprint_version,
|
|
398
|
+
"ap": self.analysis_profile,
|
|
359
399
|
"files": wire_files,
|
|
360
400
|
}
|
|
361
401
|
signed_doc = {
|
|
@@ -371,6 +411,7 @@ class Cache:
|
|
|
371
411
|
self.data["version"] = self._CACHE_VERSION
|
|
372
412
|
self.data["python_tag"] = current_python_tag()
|
|
373
413
|
self.data["fingerprint_version"] = self.fingerprint_version
|
|
414
|
+
self.data["analysis_profile"] = self.analysis_profile
|
|
374
415
|
|
|
375
416
|
except OSError as e:
|
|
376
417
|
raise CacheError(f"Failed to save cache: {e}") from e
|
|
@@ -508,11 +549,13 @@ def _empty_cache_data(
|
|
|
508
549
|
version: str,
|
|
509
550
|
python_tag: str,
|
|
510
551
|
fingerprint_version: str,
|
|
552
|
+
analysis_profile: AnalysisProfile,
|
|
511
553
|
) -> CacheData:
|
|
512
554
|
return {
|
|
513
555
|
"version": version,
|
|
514
556
|
"python_tag": python_tag,
|
|
515
557
|
"fingerprint_version": fingerprint_version,
|
|
558
|
+
"analysis_profile": analysis_profile,
|
|
516
559
|
"files": {},
|
|
517
560
|
}
|
|
518
561
|
|
|
@@ -542,6 +585,22 @@ def _as_str_dict(value: object) -> dict[str, object] | None:
|
|
|
542
585
|
return value
|
|
543
586
|
|
|
544
587
|
|
|
588
|
+
def _as_analysis_profile(value: object) -> AnalysisProfile | None:
|
|
589
|
+
obj = _as_str_dict(value)
|
|
590
|
+
if obj is None:
|
|
591
|
+
return None
|
|
592
|
+
|
|
593
|
+
if set(obj.keys()) != {"min_loc", "min_stmt"}:
|
|
594
|
+
return None
|
|
595
|
+
|
|
596
|
+
min_loc = _as_int(obj.get("min_loc"))
|
|
597
|
+
min_stmt = _as_int(obj.get("min_stmt"))
|
|
598
|
+
if min_loc is None or min_stmt is None:
|
|
599
|
+
return None
|
|
600
|
+
|
|
601
|
+
return {"min_loc": min_loc, "min_stmt": min_stmt}
|
|
602
|
+
|
|
603
|
+
|
|
545
604
|
def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None:
|
|
546
605
|
obj = _as_str_dict(value)
|
|
547
606
|
if obj is None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeclone
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.4.4
|
|
4
4
|
Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
|
|
5
5
|
Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
6
6
|
Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
|
|
@@ -49,7 +49,7 @@ Dynamic: license-file
|
|
|
49
49
|

|
|
50
50
|
[](LICENSE)
|
|
51
51
|
|
|
52
|
-
**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
|
|
52
|
+
**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
|
|
53
53
|
It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI.
|
|
54
54
|
|
|
55
55
|
---
|
|
@@ -75,13 +75,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak
|
|
|
75
75
|
|
|
76
76
|
**Three Detection Levels:**
|
|
77
77
|
|
|
78
|
-
1. **Function clones (CFG fingerprint)**
|
|
78
|
+
1. **Function clones (CFG fingerprint)**
|
|
79
79
|
Strong structural signal for cross-layer duplication
|
|
80
80
|
|
|
81
|
-
2. **Block clones (statement windows)**
|
|
81
|
+
2. **Block clones (statement windows)**
|
|
82
82
|
Detects repeated local logic patterns
|
|
83
83
|
|
|
84
|
-
3. **Segment clones (report-only)**
|
|
84
|
+
3. **Segment clones (report-only)**
|
|
85
85
|
Internal function repetition for explainability; not used for baseline gating
|
|
86
86
|
|
|
87
87
|
**CI-Ready Features:**
|
|
@@ -158,12 +158,12 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
|
|
|
158
158
|
|
|
159
159
|
CodeClone uses a deterministic exit code contract:
|
|
160
160
|
|
|
161
|
-
| Code | Meaning
|
|
162
|
-
|
|
163
|
-
| `0` | Success — run completed without gating failures
|
|
161
|
+
| Code | Meaning |
|
|
162
|
+
|------|-------------------------------------------------------------------------------------------------------------------------------------|
|
|
163
|
+
| `0` | Success — run completed without gating failures |
|
|
164
164
|
| `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating |
|
|
165
|
-
| `3` | Gating failure — new clones detected or threshold exceeded
|
|
166
|
-
| `5` | Internal error — unexpected exception
|
|
165
|
+
| `3` | Gating failure — new clones detected or threshold exceeded |
|
|
166
|
+
| `5` | Internal error — unexpected exception |
|
|
167
167
|
|
|
168
168
|
**Priority:** Contract errors (`2`) override gating failures (`3`) when both occur.
|
|
169
169
|
|
|
@@ -223,7 +223,7 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
|
|
|
223
223
|
"cache_path": "/path/to/.cache/codeclone/cache.json",
|
|
224
224
|
"cache_used": true,
|
|
225
225
|
"cache_status": "ok",
|
|
226
|
-
"cache_schema_version": "1.2",
|
|
226
|
+
"cache_schema_version": "1.3",
|
|
227
227
|
"files_skipped_source_io": 0,
|
|
228
228
|
"groups_counts": {
|
|
229
229
|
"functions": {
|
|
@@ -304,7 +304,8 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
|
|
|
304
304
|
Cache is an optimization layer only and is never a source of truth.
|
|
305
305
|
|
|
306
306
|
- Default path: `<root>/.cache/codeclone/cache.json`
|
|
307
|
-
- Schema version: **v1.2**
|
|
307
|
+
- Schema version: **v1.3**
|
|
308
|
+
- Compatibility includes analysis profile (`min_loc`, `min_stmt`)
|
|
308
309
|
- Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
|
|
309
310
|
|
|
310
311
|
Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "codeclone"
|
|
7
|
-
version = "1.4.2"
|
|
7
|
+
version = "1.4.4"
|
|
8
8
|
description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
|
|
9
9
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -50,6 +50,15 @@ def _make_segment(filepath: str) -> SegmentUnit:
|
|
|
50
50
|
)
|
|
51
51
|
|
|
52
52
|
|
|
53
|
+
def _analysis_payload(cache: Cache, *, files: object) -> dict[str, object]:
|
|
54
|
+
return {
|
|
55
|
+
"py": cache.data["python_tag"],
|
|
56
|
+
"fp": cache.data["fingerprint_version"],
|
|
57
|
+
"ap": cache.data["analysis_profile"],
|
|
58
|
+
"files": files,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
53
62
|
def test_cache_roundtrip(tmp_path: Path) -> None:
|
|
54
63
|
cache_path = tmp_path / "cache.json"
|
|
55
64
|
cache = Cache(cache_path)
|
|
@@ -97,7 +106,7 @@ def test_get_file_entry_missing_after_fallback_returns_none(tmp_path: Path) -> N
|
|
|
97
106
|
assert cache.get_file_entry(str(root / "pkg" / "missing.py")) is None
|
|
98
107
|
|
|
99
108
|
|
|
100
|
-
def test_cache_v12_uses_relpaths_when_root_set(tmp_path: Path) -> None:
|
|
109
|
+
def test_cache_v13_uses_relpaths_when_root_set(tmp_path: Path) -> None:
|
|
101
110
|
project_root = tmp_path / "project"
|
|
102
111
|
target = project_root / "pkg" / "module.py"
|
|
103
112
|
target.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -121,14 +130,10 @@ def test_cache_v12_uses_relpaths_when_root_set(tmp_path: Path) -> None:
|
|
|
121
130
|
assert str(target) not in files
|
|
122
131
|
|
|
123
132
|
|
|
124
|
-
def
|
|
133
|
+
def test_cache_v13_missing_optional_sections_default_empty(tmp_path: Path) -> None:
|
|
125
134
|
cache_path = tmp_path / "cache.json"
|
|
126
135
|
cache = Cache(cache_path)
|
|
127
|
-
payload = {
|
|
128
|
-
"py": cache.data["python_tag"],
|
|
129
|
-
"fp": cache.data["fingerprint_version"],
|
|
130
|
-
"files": {"x.py": {"st": [1, 2]}},
|
|
131
|
-
}
|
|
136
|
+
payload = _analysis_payload(cache, files={"x.py": {"st": [1, 2]}})
|
|
132
137
|
signature = cache._sign_data(payload)
|
|
133
138
|
cache_path.write_text(
|
|
134
139
|
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}),
|
|
@@ -201,11 +206,7 @@ def test_cache_version_mismatch_warns(tmp_path: Path) -> None:
|
|
|
201
206
|
def test_cache_v_field_version_mismatch_warns(tmp_path: Path) -> None:
|
|
202
207
|
cache_path = tmp_path / "cache.json"
|
|
203
208
|
cache = Cache(cache_path)
|
|
204
|
-
payload = {
|
|
205
|
-
"py": cache.data["python_tag"],
|
|
206
|
-
"fp": cache.data["fingerprint_version"],
|
|
207
|
-
"files": {},
|
|
208
|
-
}
|
|
209
|
+
payload = _analysis_payload(cache, files={})
|
|
209
210
|
signature = cache._sign_data(payload)
|
|
210
211
|
cache_path.write_text(
|
|
211
212
|
json.dumps({"v": "0.0", "payload": payload, "sig": signature}), "utf-8"
|
|
@@ -527,11 +528,7 @@ def test_cache_load_unreadable_read_graceful_ignore(
|
|
|
527
528
|
def test_cache_load_invalid_files_type(tmp_path: Path) -> None:
|
|
528
529
|
cache_path = tmp_path / "cache.json"
|
|
529
530
|
cache = Cache(cache_path)
|
|
530
|
-
payload = {
|
|
531
|
-
"py": cache.data["python_tag"],
|
|
532
|
-
"fp": cache.data["fingerprint_version"],
|
|
533
|
-
"files": [],
|
|
534
|
-
}
|
|
531
|
+
payload = _analysis_payload(cache, files=[])
|
|
535
532
|
signature = cache._sign_data(payload)
|
|
536
533
|
cache_path.write_text(
|
|
537
534
|
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}),
|
|
@@ -644,11 +641,7 @@ def test_cache_load_invalid_top_level_type(tmp_path: Path) -> None:
|
|
|
644
641
|
def test_cache_load_missing_v_field(tmp_path: Path) -> None:
|
|
645
642
|
cache_path = tmp_path / "cache.json"
|
|
646
643
|
cache = Cache(cache_path)
|
|
647
|
-
payload = {
|
|
648
|
-
"py": cache.data["python_tag"],
|
|
649
|
-
"fp": cache.data["fingerprint_version"],
|
|
650
|
-
"files": {},
|
|
651
|
-
}
|
|
644
|
+
payload = _analysis_payload(cache, files={})
|
|
652
645
|
sig = cache._sign_data(payload)
|
|
653
646
|
cache_path.write_text(json.dumps({"payload": payload, "sig": sig}), "utf-8")
|
|
654
647
|
cache.load()
|
|
@@ -683,7 +676,12 @@ def test_cache_load_missing_python_tag_in_payload(tmp_path: Path) -> None:
|
|
|
683
676
|
def test_cache_load_python_tag_mismatch(tmp_path: Path) -> None:
|
|
684
677
|
cache_path = tmp_path / "cache.json"
|
|
685
678
|
cache = Cache(cache_path)
|
|
686
|
-
payload = {
|
|
679
|
+
payload = {
|
|
680
|
+
"py": "cp999",
|
|
681
|
+
"fp": cache.data["fingerprint_version"],
|
|
682
|
+
"ap": cache.data["analysis_profile"],
|
|
683
|
+
"files": {},
|
|
684
|
+
}
|
|
687
685
|
sig = cache._sign_data(payload)
|
|
688
686
|
cache_path.write_text(
|
|
689
687
|
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
|
|
@@ -709,7 +707,12 @@ def test_cache_load_missing_fingerprint_version(tmp_path: Path) -> None:
|
|
|
709
707
|
def test_cache_load_fingerprint_version_mismatch(tmp_path: Path) -> None:
|
|
710
708
|
cache_path = tmp_path / "cache.json"
|
|
711
709
|
cache = Cache(cache_path)
|
|
712
|
-
payload = {
|
|
710
|
+
payload = {
|
|
711
|
+
"py": cache.data["python_tag"],
|
|
712
|
+
"fp": "old",
|
|
713
|
+
"ap": cache.data["analysis_profile"],
|
|
714
|
+
"files": {},
|
|
715
|
+
}
|
|
713
716
|
sig = cache._sign_data(payload)
|
|
714
717
|
cache_path.write_text(
|
|
715
718
|
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
|
|
@@ -719,18 +722,82 @@ def test_cache_load_fingerprint_version_mismatch(tmp_path: Path) -> None:
|
|
|
719
722
|
assert "fingerprint version mismatch" in cache.load_warning
|
|
720
723
|
|
|
721
724
|
|
|
722
|
-
def
|
|
725
|
+
def test_cache_load_analysis_profile_mismatch(tmp_path: Path) -> None:
|
|
726
|
+
cache_path = tmp_path / "cache.json"
|
|
727
|
+
cache = Cache(cache_path, min_loc=1, min_stmt=1)
|
|
728
|
+
cache.put_file_entry("x.py", {"mtime_ns": 1, "size": 10}, [], [], [])
|
|
729
|
+
cache.save()
|
|
730
|
+
|
|
731
|
+
loaded = Cache(cache_path, min_loc=15, min_stmt=6)
|
|
732
|
+
loaded.load()
|
|
733
|
+
|
|
734
|
+
assert loaded.load_warning is not None
|
|
735
|
+
assert "analysis profile mismatch" in loaded.load_warning
|
|
736
|
+
assert loaded.data["files"] == {}
|
|
737
|
+
assert loaded.load_status == CacheStatus.ANALYSIS_PROFILE_MISMATCH
|
|
738
|
+
assert loaded.cache_schema_version == Cache._CACHE_VERSION
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def test_cache_load_missing_analysis_profile_in_payload(tmp_path: Path) -> None:
|
|
742
|
+
cache_path = tmp_path / "cache.json"
|
|
743
|
+
cache = Cache(cache_path)
|
|
744
|
+
payload = {
|
|
745
|
+
"py": cache.data["python_tag"],
|
|
746
|
+
"fp": cache.data["fingerprint_version"],
|
|
747
|
+
"files": {},
|
|
748
|
+
}
|
|
749
|
+
sig = cache._sign_data(payload)
|
|
750
|
+
cache_path.write_text(
|
|
751
|
+
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
|
|
752
|
+
)
|
|
753
|
+
|
|
754
|
+
cache.load()
|
|
755
|
+
assert cache.load_warning is not None
|
|
756
|
+
assert "format invalid" in cache.load_warning
|
|
757
|
+
assert cache.load_status == CacheStatus.INVALID_TYPE
|
|
758
|
+
assert cache.cache_schema_version == Cache._CACHE_VERSION
|
|
759
|
+
assert cache.data["files"] == {}
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
@pytest.mark.parametrize(
|
|
763
|
+
"bad_analysis_profile",
|
|
764
|
+
[
|
|
765
|
+
{"min_loc": 15},
|
|
766
|
+
{"min_loc": "15", "min_stmt": 6},
|
|
767
|
+
],
|
|
768
|
+
)
|
|
769
|
+
def test_cache_load_invalid_analysis_profile_payload(
|
|
770
|
+
tmp_path: Path, bad_analysis_profile: object
|
|
771
|
+
) -> None:
|
|
723
772
|
cache_path = tmp_path / "cache.json"
|
|
724
773
|
cache = Cache(cache_path)
|
|
725
774
|
payload = {
|
|
726
775
|
"py": cache.data["python_tag"],
|
|
727
776
|
"fp": cache.data["fingerprint_version"],
|
|
728
|
-
"
|
|
777
|
+
"ap": bad_analysis_profile,
|
|
778
|
+
"files": {},
|
|
729
779
|
}
|
|
730
780
|
sig = cache._sign_data(payload)
|
|
731
781
|
cache_path.write_text(
|
|
732
782
|
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
|
|
733
783
|
)
|
|
784
|
+
|
|
785
|
+
cache.load()
|
|
786
|
+
assert cache.load_warning is not None
|
|
787
|
+
assert "format invalid" in cache.load_warning
|
|
788
|
+
assert cache.load_status == CacheStatus.INVALID_TYPE
|
|
789
|
+
assert cache.cache_schema_version == Cache._CACHE_VERSION
|
|
790
|
+
assert cache.data["files"] == {}
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
def test_cache_load_invalid_wire_file_entry(tmp_path: Path) -> None:
|
|
794
|
+
cache_path = tmp_path / "cache.json"
|
|
795
|
+
cache = Cache(cache_path)
|
|
796
|
+
payload = _analysis_payload(cache, files={"x.py": {"st": "bad"}})
|
|
797
|
+
sig = cache._sign_data(payload)
|
|
798
|
+
cache_path.write_text(
|
|
799
|
+
json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
|
|
800
|
+
)
|
|
734
801
|
cache.load()
|
|
735
802
|
assert cache.load_warning is not None
|
|
736
803
|
assert "format invalid" in cache.load_warning
|
|
@@ -708,7 +708,7 @@ def test_cli_cache_status_string_fallback(
|
|
|
708
708
|
def __init__(self, _path: Path, **_kwargs: object) -> None:
|
|
709
709
|
self.load_warning = load_warning
|
|
710
710
|
self.load_status = "not-a-cache-status"
|
|
711
|
-
self.cache_schema_version =
|
|
711
|
+
self.cache_schema_version = CACHE_VERSION
|
|
712
712
|
|
|
713
713
|
def load(self) -> None:
|
|
714
714
|
return None
|
|
@@ -1716,6 +1716,122 @@ def test_cli_reports_cache_meta_when_cache_missing(
|
|
|
1716
1716
|
assert meta["cache_schema_version"] is None
|
|
1717
1717
|
|
|
1718
1718
|
|
|
1719
|
+
@pytest.mark.parametrize(
|
|
1720
|
+
(
|
|
1721
|
+
"first_min_loc",
|
|
1722
|
+
"first_min_stmt",
|
|
1723
|
+
"second_min_loc",
|
|
1724
|
+
"second_min_stmt",
|
|
1725
|
+
"expected_cache_used",
|
|
1726
|
+
"expected_cache_status",
|
|
1727
|
+
"expected_functions_total",
|
|
1728
|
+
"expected_warning",
|
|
1729
|
+
),
|
|
1730
|
+
[
|
|
1731
|
+
(
|
|
1732
|
+
1,
|
|
1733
|
+
1,
|
|
1734
|
+
15,
|
|
1735
|
+
6,
|
|
1736
|
+
False,
|
|
1737
|
+
"analysis_profile_mismatch",
|
|
1738
|
+
0,
|
|
1739
|
+
"analysis profile mismatch",
|
|
1740
|
+
),
|
|
1741
|
+
(
|
|
1742
|
+
15,
|
|
1743
|
+
6,
|
|
1744
|
+
1,
|
|
1745
|
+
1,
|
|
1746
|
+
False,
|
|
1747
|
+
"analysis_profile_mismatch",
|
|
1748
|
+
1,
|
|
1749
|
+
"analysis profile mismatch",
|
|
1750
|
+
),
|
|
1751
|
+
(1, 1, 1, 1, True, "ok", 1, None),
|
|
1752
|
+
],
|
|
1753
|
+
)
|
|
1754
|
+
def test_cli_cache_analysis_profile_compatibility(
|
|
1755
|
+
tmp_path: Path,
|
|
1756
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
1757
|
+
capsys: pytest.CaptureFixture[str],
|
|
1758
|
+
first_min_loc: int,
|
|
1759
|
+
first_min_stmt: int,
|
|
1760
|
+
second_min_loc: int,
|
|
1761
|
+
second_min_stmt: int,
|
|
1762
|
+
expected_cache_used: bool,
|
|
1763
|
+
expected_cache_status: str,
|
|
1764
|
+
expected_functions_total: int,
|
|
1765
|
+
expected_warning: str | None,
|
|
1766
|
+
) -> None:
|
|
1767
|
+
src = tmp_path / "a.py"
|
|
1768
|
+
src.write_text(
|
|
1769
|
+
"""
|
|
1770
|
+
def f1():
|
|
1771
|
+
x = 1
|
|
1772
|
+
return x
|
|
1773
|
+
|
|
1774
|
+
def f2():
|
|
1775
|
+
y = 1
|
|
1776
|
+
return y
|
|
1777
|
+
""",
|
|
1778
|
+
"utf-8",
|
|
1779
|
+
)
|
|
1780
|
+
baseline_path = _write_baseline(
|
|
1781
|
+
tmp_path / "baseline.json",
|
|
1782
|
+
python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
|
|
1783
|
+
)
|
|
1784
|
+
cache_path = tmp_path / "cache.json"
|
|
1785
|
+
json_first = tmp_path / "report-first.json"
|
|
1786
|
+
json_second = tmp_path / "report-second.json"
|
|
1787
|
+
_patch_parallel(monkeypatch)
|
|
1788
|
+
|
|
1789
|
+
_run_main(
|
|
1790
|
+
monkeypatch,
|
|
1791
|
+
[
|
|
1792
|
+
str(tmp_path),
|
|
1793
|
+
"--baseline",
|
|
1794
|
+
str(baseline_path),
|
|
1795
|
+
"--cache-path",
|
|
1796
|
+
str(cache_path),
|
|
1797
|
+
"--json",
|
|
1798
|
+
str(json_first),
|
|
1799
|
+
"--min-loc",
|
|
1800
|
+
str(first_min_loc),
|
|
1801
|
+
"--min-stmt",
|
|
1802
|
+
str(first_min_stmt),
|
|
1803
|
+
"--no-progress",
|
|
1804
|
+
],
|
|
1805
|
+
)
|
|
1806
|
+
capsys.readouterr()
|
|
1807
|
+
|
|
1808
|
+
_run_main(
|
|
1809
|
+
monkeypatch,
|
|
1810
|
+
[
|
|
1811
|
+
str(tmp_path),
|
|
1812
|
+
"--baseline",
|
|
1813
|
+
str(baseline_path),
|
|
1814
|
+
"--cache-path",
|
|
1815
|
+
str(cache_path),
|
|
1816
|
+
"--json",
|
|
1817
|
+
str(json_second),
|
|
1818
|
+
"--min-loc",
|
|
1819
|
+
str(second_min_loc),
|
|
1820
|
+
"--min-stmt",
|
|
1821
|
+
str(second_min_stmt),
|
|
1822
|
+
"--no-progress",
|
|
1823
|
+
],
|
|
1824
|
+
)
|
|
1825
|
+
out = capsys.readouterr().out
|
|
1826
|
+
payload = json.loads(json_second.read_text("utf-8"))
|
|
1827
|
+
meta = payload["meta"]
|
|
1828
|
+
if expected_warning is not None:
|
|
1829
|
+
assert expected_warning in out
|
|
1830
|
+
assert meta["cache_used"] is expected_cache_used
|
|
1831
|
+
assert meta["cache_status"] == expected_cache_status
|
|
1832
|
+
assert meta["groups_counts"]["functions"]["total"] == expected_functions_total
|
|
1833
|
+
|
|
1834
|
+
|
|
1719
1835
|
@pytest.mark.parametrize(
|
|
1720
1836
|
("flag", "bad_name", "label", "expected"),
|
|
1721
1837
|
[
|
|
@@ -6,7 +6,7 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
import pytest
|
|
8
8
|
|
|
9
|
-
from codeclone.contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL
|
|
9
|
+
from codeclone.contracts import CACHE_VERSION, DOCS_URL, ISSUES_URL, REPOSITORY_URL
|
|
10
10
|
from codeclone.errors import FileProcessingError
|
|
11
11
|
from codeclone.html_report import (
|
|
12
12
|
_FileCache,
|
|
@@ -507,7 +507,7 @@ def test_html_report_includes_provenance_metadata(
|
|
|
507
507
|
'data-cache-used="true"',
|
|
508
508
|
"Cache schema",
|
|
509
509
|
"Cache status",
|
|
510
|
-
'data-cache-schema-version="
|
|
510
|
+
f'data-cache-schema-version="{CACHE_VERSION}"',
|
|
511
511
|
'data-cache-status="ok"',
|
|
512
512
|
'data-files-skipped-source-io="0"',
|
|
513
513
|
"Source IO skipped",
|
|
@@ -7,7 +7,7 @@ from typing import cast
|
|
|
7
7
|
import pytest
|
|
8
8
|
|
|
9
9
|
import codeclone.report as report_mod
|
|
10
|
-
from codeclone.contracts import REPORT_SCHEMA_VERSION
|
|
10
|
+
from codeclone.contracts import CACHE_VERSION, REPORT_SCHEMA_VERSION
|
|
11
11
|
from codeclone.report import (
|
|
12
12
|
GroupMap,
|
|
13
13
|
build_block_group_facts,
|
|
@@ -276,7 +276,7 @@ def test_report_output_formats(
|
|
|
276
276
|
'"baseline_schema_version": 1',
|
|
277
277
|
f'"baseline_payload_sha256": "{"a" * 64}"',
|
|
278
278
|
'"baseline_payload_sha256_verified": true',
|
|
279
|
-
'"cache_schema_version": "
|
|
279
|
+
f'"cache_schema_version": "{CACHE_VERSION}"',
|
|
280
280
|
'"cache_status": "ok"',
|
|
281
281
|
'"files_skipped_source_io": 0',
|
|
282
282
|
]
|
|
@@ -288,7 +288,7 @@ def test_report_output_formats(
|
|
|
288
288
|
"Baseline generator name: codeclone",
|
|
289
289
|
f"Baseline payload sha256: {'a' * 64}",
|
|
290
290
|
"Baseline payload verified: true",
|
|
291
|
-
"Cache schema version:
|
|
291
|
+
f"Cache schema version: {CACHE_VERSION}",
|
|
292
292
|
"Cache status: ok",
|
|
293
293
|
"Source IO skipped: 0",
|
|
294
294
|
"FUNCTION CLONES (NEW) (groups=2)",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|