codeclone 1.4.2__tar.gz → 1.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {codeclone-1.4.2 → codeclone-1.4.4}/LICENSE +1 -1
  2. {codeclone-1.4.2 → codeclone-1.4.4}/PKG-INFO +13 -12
  3. {codeclone-1.4.2 → codeclone-1.4.4}/README.md +12 -11
  4. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_html_snippets.py +50 -24
  5. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_explain.py +93 -23
  6. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cache.py +59 -0
  7. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cli.py +2 -0
  8. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/contracts.py +1 -1
  9. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/PKG-INFO +13 -12
  10. {codeclone-1.4.2 → codeclone-1.4.4}/pyproject.toml +1 -1
  11. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cache.py +93 -26
  12. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_inprocess.py +117 -1
  13. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_html_report.py +2 -2
  14. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_report.py +3 -3
  15. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/__init__.py +0 -0
  16. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_args.py +0 -0
  17. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_meta.py +0 -0
  18. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_paths.py +0 -0
  19. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_cli_summary.py +0 -0
  20. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_html_escape.py +0 -0
  21. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_blocks.py +0 -0
  22. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_explain_contract.py +0 -0
  23. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_grouping.py +0 -0
  24. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_segments.py +0 -0
  25. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_serialize.py +0 -0
  26. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/_report_types.py +0 -0
  27. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/baseline.py +0 -0
  28. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/blockhash.py +0 -0
  29. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/blocks.py +0 -0
  30. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cfg.py +0 -0
  31. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/cfg_model.py +0 -0
  32. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/errors.py +0 -0
  33. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/extractor.py +0 -0
  34. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/fingerprint.py +0 -0
  35. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/html_report.py +0 -0
  36. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/meta_markers.py +0 -0
  37. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/normalize.py +0 -0
  38. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/py.typed +0 -0
  39. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/report.py +0 -0
  40. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/scanner.py +0 -0
  41. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/templates.py +0 -0
  42. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone/ui_messages.py +0 -0
  43. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/SOURCES.txt +0 -0
  44. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/dependency_links.txt +0 -0
  45. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/entry_points.txt +0 -0
  46. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/requires.txt +0 -0
  47. {codeclone-1.4.2 → codeclone-1.4.4}/codeclone.egg-info/top_level.txt +0 -0
  48. {codeclone-1.4.2 → codeclone-1.4.4}/setup.cfg +0 -0
  49. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_baseline.py +0 -0
  50. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_blockhash.py +0 -0
  51. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_blocks.py +0 -0
  52. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cfg.py +0 -0
  53. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cfg_model.py +0 -0
  54. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_main_guard.py +0 -0
  55. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_main_guard_runpy.py +0 -0
  56. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_smoke.py +0 -0
  57. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_cli_unit.py +0 -0
  58. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_detector_golden.py +0 -0
  59. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_extractor.py +0 -0
  60. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_fingerprint.py +0 -0
  61. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_init.py +0 -0
  62. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_normalize.py +0 -0
  63. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_report_explain.py +0 -0
  64. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_scanner_extra.py +0 -0
  65. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_security.py +0 -0
  66. {codeclone-1.4.2 → codeclone-1.4.4}/tests/test_segments.py +0 -0
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
18
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
19
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
20
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.4.2
3
+ Version: 1.4.4
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -49,7 +49,7 @@ Dynamic: license-file
49
49
  ![Baseline](https://img.shields.io/badge/baseline-versioned-green?style=flat-square)
50
50
  [![License](https://img.shields.io/pypi/l/codeclone.svg?style=flat-square)](LICENSE)
51
51
 
52
- **CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
52
+ **CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
53
53
  It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI.
54
54
 
55
55
  ---
@@ -75,13 +75,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak
75
75
 
76
76
  **Three Detection Levels:**
77
77
 
78
- 1. **Function clones (CFG fingerprint)**
78
+ 1. **Function clones (CFG fingerprint)**
79
79
  Strong structural signal for cross-layer duplication
80
80
 
81
- 2. **Block clones (statement windows)**
81
+ 2. **Block clones (statement windows)**
82
82
  Detects repeated local logic patterns
83
83
 
84
- 3. **Segment clones (report-only)**
84
+ 3. **Segment clones (report-only)**
85
85
  Internal function repetition for explainability; not used for baseline gating
86
86
 
87
87
  **CI-Ready Features:**
@@ -158,12 +158,12 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
158
158
 
159
159
  CodeClone uses a deterministic exit code contract:
160
160
 
161
- | Code | Meaning |
162
- |------|-----------------------------------------------------------------------------|
163
- | `0` | Success — run completed without gating failures |
161
+ | Code | Meaning |
162
+ |------|-------------------------------------------------------------------------------------------------------------------------------------|
163
+ | `0` | Success — run completed without gating failures |
164
164
  | `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating |
165
- | `3` | Gating failure — new clones detected or threshold exceeded |
166
- | `5` | Internal error — unexpected exception |
165
+ | `3` | Gating failure — new clones detected or threshold exceeded |
166
+ | `5` | Internal error — unexpected exception |
167
167
 
168
168
  **Priority:** Contract errors (`2`) override gating failures (`3`) when both occur.
169
169
 
@@ -223,7 +223,7 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
223
223
  "cache_path": "/path/to/.cache/codeclone/cache.json",
224
224
  "cache_used": true,
225
225
  "cache_status": "ok",
226
- "cache_schema_version": "1.2",
226
+ "cache_schema_version": "1.3",
227
227
  "files_skipped_source_io": 0,
228
228
  "groups_counts": {
229
229
  "functions": {
@@ -304,7 +304,8 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
304
304
  Cache is an optimization layer only and is never a source of truth.
305
305
 
306
306
  - Default path: `<root>/.cache/codeclone/cache.json`
307
- - Schema version: **v1.2**
307
+ - Schema version: **v1.3**
308
+ - Compatibility includes analysis profile (`min_loc`, `min_stmt`)
308
309
  - Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
309
310
 
310
311
  Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
@@ -8,7 +8,7 @@
8
8
  ![Baseline](https://img.shields.io/badge/baseline-versioned-green?style=flat-square)
9
9
  [![License](https://img.shields.io/pypi/l/codeclone.svg?style=flat-square)](LICENSE)
10
10
 
11
- **CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
11
+ **CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
12
12
  It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI.
13
13
 
14
14
  ---
@@ -34,13 +34,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak
34
34
 
35
35
  **Three Detection Levels:**
36
36
 
37
- 1. **Function clones (CFG fingerprint)**
37
+ 1. **Function clones (CFG fingerprint)**
38
38
  Strong structural signal for cross-layer duplication
39
39
 
40
- 2. **Block clones (statement windows)**
40
+ 2. **Block clones (statement windows)**
41
41
  Detects repeated local logic patterns
42
42
 
43
- 3. **Segment clones (report-only)**
43
+ 3. **Segment clones (report-only)**
44
44
  Internal function repetition for explainability; not used for baseline gating
45
45
 
46
46
  **CI-Ready Features:**
@@ -117,12 +117,12 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
117
117
 
118
118
  CodeClone uses a deterministic exit code contract:
119
119
 
120
- | Code | Meaning |
121
- |------|-----------------------------------------------------------------------------|
122
- | `0` | Success — run completed without gating failures |
120
+ | Code | Meaning |
121
+ |------|-------------------------------------------------------------------------------------------------------------------------------------|
122
+ | `0` | Success — run completed without gating failures |
123
123
  | `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating |
124
- | `3` | Gating failure — new clones detected or threshold exceeded |
125
- | `5` | Internal error — unexpected exception |
124
+ | `3` | Gating failure — new clones detected or threshold exceeded |
125
+ | `5` | Internal error — unexpected exception |
126
126
 
127
127
  **Priority:** Contract errors (`2`) override gating failures (`3`) when both occur.
128
128
 
@@ -182,7 +182,7 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
182
182
  "cache_path": "/path/to/.cache/codeclone/cache.json",
183
183
  "cache_used": true,
184
184
  "cache_status": "ok",
185
- "cache_schema_version": "1.2",
185
+ "cache_schema_version": "1.3",
186
186
  "files_skipped_source_io": 0,
187
187
  "groups_counts": {
188
188
  "functions": {
@@ -263,7 +263,8 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
263
263
  Cache is an optimization layer only and is never a source of truth.
264
264
 
265
265
  - Default path: `<root>/.cache/codeclone/cache.json`
266
- - Schema version: **v1.2**
266
+ - Schema version: **v1.3**
267
+ - Compatibility includes analysis profile (`min_loc`, `min_stmt`)
267
268
  - Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
268
269
 
269
270
  Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
@@ -14,6 +14,7 @@ import itertools
14
14
  from collections.abc import Iterable
15
15
  from dataclasses import dataclass
16
16
  from functools import lru_cache
17
+ from types import ModuleType
17
18
  from typing import NamedTuple, cast
18
19
 
19
20
  from .errors import FileProcessingError
@@ -34,33 +35,19 @@ class _Snippet:
34
35
 
35
36
 
36
37
  class _FileCache:
37
- __slots__ = ("_get_lines_impl", "maxsize")
38
+ __slots__ = ("_get_file_lines_impl", "maxsize")
38
39
 
39
40
  def __init__(self, maxsize: int = 128) -> None:
40
41
  self.maxsize = maxsize
41
- self._get_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_range)
42
+ self._get_file_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_lines)
42
43
 
43
44
  @staticmethod
44
- def _read_file_range(
45
- filepath: str, start_line: int, end_line: int
46
- ) -> tuple[str, ...]:
47
- if start_line < 1:
48
- start_line = 1
49
- if end_line < start_line:
50
- return ()
51
-
45
+ def _read_file_lines(filepath: str) -> tuple[str, ...]:
52
46
  try:
53
47
 
54
48
  def _read_with_errors(errors: str) -> tuple[str, ...]:
55
- lines: list[str] = []
56
49
  with open(filepath, encoding="utf-8", errors=errors) as f:
57
- for lineno, line in enumerate(f, start=1):
58
- if lineno < start_line:
59
- continue
60
- if lineno > end_line:
61
- break
62
- lines.append(line.rstrip("\n"))
63
- return tuple(lines)
50
+ return tuple(line.rstrip("\n") for line in f)
64
51
 
65
52
  try:
66
53
  return _read_with_errors("strict")
@@ -72,7 +59,16 @@ class _FileCache:
72
59
  def get_lines_range(
73
60
  self, filepath: str, start_line: int, end_line: int
74
61
  ) -> tuple[str, ...]:
75
- return self._get_lines_impl(filepath, start_line, end_line)
62
+ if start_line < 1:
63
+ start_line = 1
64
+ if end_line < start_line:
65
+ return ()
66
+ lines = self._get_file_lines_impl(filepath)
67
+ start_index = start_line - 1
68
+ if start_index >= len(lines):
69
+ return ()
70
+ end_index = min(len(lines), end_line)
71
+ return lines[start_index:end_index]
76
72
 
77
73
  class _CacheInfo(NamedTuple):
78
74
  hits: int
@@ -81,10 +77,30 @@ class _FileCache:
81
77
  currsize: int
82
78
 
83
79
  def cache_info(self) -> _CacheInfo:
84
- return cast(_FileCache._CacheInfo, self._get_lines_impl.cache_info())
80
+ return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info())
85
81
 
86
82
 
87
- def _try_pygments(code: str) -> str | None:
83
+ _PYGMENTS_IMPORTER_ID: int | None = None
84
+ _PYGMENTS_API: tuple[ModuleType, ModuleType, ModuleType] | None = None
85
+
86
+
87
+ def _load_pygments_api() -> tuple[ModuleType, ModuleType, ModuleType] | None:
88
+ """
89
+ Load pygments modules once per import-function identity.
90
+
91
+ Tests monkeypatch `importlib.import_module`; tracking importer identity keeps
92
+ behavior deterministic and allows import-error branches to stay testable.
93
+ """
94
+ global _PYGMENTS_IMPORTER_ID
95
+ global _PYGMENTS_API
96
+
97
+ importer_id = id(importlib.import_module)
98
+ if importer_id != _PYGMENTS_IMPORTER_ID:
99
+ _PYGMENTS_IMPORTER_ID = importer_id
100
+ _PYGMENTS_API = None
101
+ if _PYGMENTS_API is not None:
102
+ return _PYGMENTS_API
103
+
88
104
  try:
89
105
  pygments = importlib.import_module("pygments")
90
106
  formatters = importlib.import_module("pygments.formatters")
@@ -92,6 +108,16 @@ def _try_pygments(code: str) -> str | None:
92
108
  except ImportError:
93
109
  return None
94
110
 
111
+ _PYGMENTS_API = (pygments, formatters, lexers)
112
+ return _PYGMENTS_API
113
+
114
+
115
+ def _try_pygments(code: str) -> str | None:
116
+ pygments_api = _load_pygments_api()
117
+ if pygments_api is None:
118
+ return None
119
+ pygments, formatters, lexers = pygments_api
120
+
95
121
  highlight = pygments.highlight
96
122
  formatter_cls = formatters.HtmlFormatter
97
123
  lexer_cls = lexers.PythonLexer
@@ -104,10 +130,10 @@ def _pygments_css(style_name: str) -> str:
104
130
  Returns CSS for pygments tokens. Scoped to `.codebox` to avoid leaking styles.
105
131
  If Pygments is not available or style missing, returns "".
106
132
  """
107
- try:
108
- formatters = importlib.import_module("pygments.formatters")
109
- except ImportError:
133
+ pygments_api = _load_pygments_api()
134
+ if pygments_api is None:
110
135
  return ""
136
+ _, formatters, _ = pygments_api
111
137
 
112
138
  try:
113
139
  formatter_cls = formatters.HtmlFormatter
@@ -9,6 +9,8 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
+ from bisect import bisect_left, bisect_right
13
+ from dataclasses import dataclass
12
14
  from pathlib import Path
13
15
 
14
16
  from ._report_explain_contract import (
@@ -23,6 +25,19 @@ from ._report_explain_contract import (
23
25
  from ._report_types import GroupItem, GroupMap
24
26
 
25
27
 
28
+ @dataclass(frozen=True, slots=True)
29
+ class _StatementRecord:
30
+ node: ast.stmt
31
+ start_line: int
32
+ end_line: int
33
+ start_col: int
34
+ end_col: int
35
+ type_name: str
36
+
37
+
38
+ _StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]]
39
+
40
+
26
41
  def _signature_parts(group_key: str) -> list[str]:
27
42
  return [part for part in group_key.split("|") if part]
28
43
 
@@ -42,6 +57,53 @@ def _parsed_file_tree(
42
57
  return tree
43
58
 
44
59
 
60
+ def _build_statement_index(tree: ast.AST) -> _StatementIndex:
61
+ records = tuple(
62
+ sorted(
63
+ (
64
+ _StatementRecord(
65
+ node=node,
66
+ start_line=int(getattr(node, "lineno", 0)),
67
+ end_line=int(getattr(node, "end_lineno", 0)),
68
+ start_col=int(getattr(node, "col_offset", 0)),
69
+ end_col=int(getattr(node, "end_col_offset", 0)),
70
+ type_name=type(node).__name__,
71
+ )
72
+ for node in ast.walk(tree)
73
+ if isinstance(node, ast.stmt)
74
+ ),
75
+ key=lambda record: (
76
+ record.start_line,
77
+ record.end_line,
78
+ record.start_col,
79
+ record.end_col,
80
+ record.type_name,
81
+ ),
82
+ )
83
+ )
84
+ start_lines = tuple(record.start_line for record in records)
85
+ return records, start_lines
86
+
87
+
88
+ def _parsed_statement_index(
89
+ filepath: str,
90
+ *,
91
+ ast_cache: dict[str, ast.AST | None],
92
+ stmt_index_cache: dict[str, _StatementIndex | None],
93
+ ) -> _StatementIndex | None:
94
+ if filepath in stmt_index_cache:
95
+ return stmt_index_cache[filepath]
96
+
97
+ tree = _parsed_file_tree(filepath, ast_cache=ast_cache)
98
+ if tree is None:
99
+ stmt_index_cache[filepath] = None
100
+ return None
101
+
102
+ index = _build_statement_index(tree)
103
+ stmt_index_cache[filepath] = index
104
+ return index
105
+
106
+
45
107
  def _is_assert_like_stmt(stmt: ast.stmt) -> bool:
46
108
  if isinstance(stmt, ast.Assert):
47
109
  return True
@@ -64,45 +126,42 @@ def _assert_range_stats(
64
126
  start_line: int,
65
127
  end_line: int,
66
128
  ast_cache: dict[str, ast.AST | None],
129
+ stmt_index_cache: dict[str, _StatementIndex | None],
67
130
  range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
68
131
  ) -> tuple[int, int, int]:
69
132
  cache_key = (filepath, start_line, end_line)
70
133
  if cache_key in range_cache:
71
134
  return range_cache[cache_key]
72
135
 
73
- tree = _parsed_file_tree(filepath, ast_cache=ast_cache)
74
- if tree is None:
136
+ statement_index = _parsed_statement_index(
137
+ filepath,
138
+ ast_cache=ast_cache,
139
+ stmt_index_cache=stmt_index_cache,
140
+ )
141
+ if statement_index is None:
75
142
  range_cache[cache_key] = (0, 0, 0)
76
143
  return 0, 0, 0
77
144
 
78
- stmts = [
79
- node
80
- for node in ast.walk(tree)
81
- if isinstance(node, ast.stmt)
82
- and int(getattr(node, "lineno", 0)) >= start_line
83
- and int(getattr(node, "end_lineno", 0)) <= end_line
84
- ]
85
- if not stmts:
145
+ records, start_lines = statement_index
146
+ if not records:
86
147
  range_cache[cache_key] = (0, 0, 0)
87
148
  return 0, 0, 0
88
149
 
89
- ordered_stmts = sorted(
90
- stmts,
91
- key=lambda stmt: (
92
- int(getattr(stmt, "lineno", 0)),
93
- int(getattr(stmt, "end_lineno", 0)),
94
- int(getattr(stmt, "col_offset", 0)),
95
- int(getattr(stmt, "end_col_offset", 0)),
96
- type(stmt).__name__,
97
- ),
98
- )
150
+ left = bisect_left(start_lines, start_line)
151
+ right = bisect_right(start_lines, end_line)
152
+ if left >= right:
153
+ range_cache[cache_key] = (0, 0, 0)
154
+ return 0, 0, 0
99
155
 
100
- total = len(ordered_stmts)
156
+ total = 0
101
157
  assert_like = 0
102
158
  max_consecutive = 0
103
159
  current_consecutive = 0
104
- for stmt in ordered_stmts:
105
- if _is_assert_like_stmt(stmt):
160
+ for record in records[left:right]:
161
+ if record.end_line > end_line:
162
+ continue
163
+ total += 1
164
+ if _is_assert_like_stmt(record.node):
106
165
  assert_like += 1
107
166
  current_consecutive += 1
108
167
  if current_consecutive > max_consecutive:
@@ -110,6 +169,10 @@ def _assert_range_stats(
110
169
  else:
111
170
  current_consecutive = 0
112
171
 
172
+ if total == 0:
173
+ range_cache[cache_key] = (0, 0, 0)
174
+ return 0, 0, 0
175
+
113
176
  stats = (total, assert_like, max_consecutive)
114
177
  range_cache[cache_key] = stats
115
178
  return stats
@@ -121,6 +184,7 @@ def _is_assert_only_range(
121
184
  start_line: int,
122
185
  end_line: int,
123
186
  ast_cache: dict[str, ast.AST | None],
187
+ stmt_index_cache: dict[str, _StatementIndex | None],
124
188
  range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
125
189
  ) -> bool:
126
190
  total, assert_like, _ = _assert_range_stats(
@@ -128,6 +192,7 @@ def _is_assert_only_range(
128
192
  start_line=start_line,
129
193
  end_line=end_line,
130
194
  ast_cache=ast_cache,
195
+ stmt_index_cache=stmt_index_cache,
131
196
  range_cache=range_cache,
132
197
  )
133
198
  return total > 0 and total == assert_like
@@ -157,6 +222,7 @@ def _enrich_with_assert_facts(
157
222
  facts: dict[str, str],
158
223
  items: list[GroupItem],
159
224
  ast_cache: dict[str, ast.AST | None],
225
+ stmt_index_cache: dict[str, _StatementIndex | None],
160
226
  range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
161
227
  ) -> None:
162
228
  assert_only = True
@@ -181,6 +247,7 @@ def _enrich_with_assert_facts(
181
247
  start_line=start_line,
182
248
  end_line=end_line,
183
249
  ast_cache=ast_cache,
250
+ stmt_index_cache=stmt_index_cache,
184
251
  range_cache=range_cache,
185
252
  )
186
253
  total_statements += range_total
@@ -198,6 +265,7 @@ def _enrich_with_assert_facts(
198
265
  start_line=start_line,
199
266
  end_line=end_line,
200
267
  ast_cache=ast_cache,
268
+ stmt_index_cache=stmt_index_cache,
201
269
  range_cache=range_cache,
202
270
  )
203
271
  ):
@@ -223,6 +291,7 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]]
223
291
  Renderers (HTML/TXT/JSON) should only display these facts.
224
292
  """
225
293
  ast_cache: dict[str, ast.AST | None] = {}
294
+ stmt_index_cache: dict[str, _StatementIndex | None] = {}
226
295
  range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {}
227
296
  facts_by_group: dict[str, dict[str, str]] = {}
228
297
 
@@ -232,6 +301,7 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]]
232
301
  facts=facts,
233
302
  items=items,
234
303
  ast_cache=ast_cache,
304
+ stmt_index_cache=stmt_index_cache,
235
305
  range_cache=range_cache,
236
306
  )
237
307
  group_arity = len(items)
@@ -39,6 +39,7 @@ class CacheStatus(str, Enum):
39
39
  VERSION_MISMATCH = "version_mismatch"
40
40
  PYTHON_TAG_MISMATCH = "python_tag_mismatch"
41
41
  FINGERPRINT_MISMATCH = "mismatch_fingerprint_version"
42
+ ANALYSIS_PROFILE_MISMATCH = "analysis_profile_mismatch"
42
43
  INTEGRITY_FAILED = "integrity_failed"
43
44
 
44
45
 
@@ -84,15 +85,22 @@ class CacheEntry(TypedDict):
84
85
  segments: list[SegmentDict]
85
86
 
86
87
 
88
+ class AnalysisProfile(TypedDict):
89
+ min_loc: int
90
+ min_stmt: int
91
+
92
+
87
93
  class CacheData(TypedDict):
88
94
  version: str
89
95
  python_tag: str
90
96
  fingerprint_version: str
97
+ analysis_profile: AnalysisProfile
91
98
  files: dict[str, CacheEntry]
92
99
 
93
100
 
94
101
  class Cache:
95
102
  __slots__ = (
103
+ "analysis_profile",
96
104
  "cache_schema_version",
97
105
  "data",
98
106
  "fingerprint_version",
@@ -112,14 +120,21 @@ class Cache:
112
120
  *,
113
121
  root: str | Path | None = None,
114
122
  max_size_bytes: int | None = None,
123
+ min_loc: int = 15,
124
+ min_stmt: int = 6,
115
125
  ):
116
126
  self.path = Path(path)
117
127
  self.root = _resolve_root(root)
118
128
  self.fingerprint_version = BASELINE_FINGERPRINT_VERSION
129
+ self.analysis_profile: AnalysisProfile = {
130
+ "min_loc": min_loc,
131
+ "min_stmt": min_stmt,
132
+ }
119
133
  self.data: CacheData = _empty_cache_data(
120
134
  version=self._CACHE_VERSION,
121
135
  python_tag=current_python_tag(),
122
136
  fingerprint_version=self.fingerprint_version,
137
+ analysis_profile=self.analysis_profile,
123
138
  )
124
139
  self.legacy_secret_warning = self._detect_legacy_secret_warning()
125
140
  self.cache_schema_version: str | None = None
@@ -164,6 +179,7 @@ class Cache:
164
179
  version=self._CACHE_VERSION,
165
180
  python_tag=current_python_tag(),
166
181
  fingerprint_version=self.fingerprint_version,
182
+ analysis_profile=self.analysis_profile,
167
183
  )
168
184
 
169
185
  def _sign_data(self, data: Mapping[str, object]) -> str:
@@ -309,6 +325,28 @@ class Cache:
309
325
  )
310
326
  return None
311
327
 
328
+ analysis_profile = _as_analysis_profile(payload.get("ap"))
329
+ if analysis_profile is None:
330
+ self._ignore_cache(
331
+ "Cache format invalid; ignoring cache.",
332
+ status=CacheStatus.INVALID_TYPE,
333
+ schema_version=version,
334
+ )
335
+ return None
336
+
337
+ if analysis_profile != self.analysis_profile:
338
+ self._ignore_cache(
339
+ "Cache analysis profile mismatch "
340
+ f"(found min_loc={analysis_profile['min_loc']}, "
341
+ f"min_stmt={analysis_profile['min_stmt']}; "
342
+ f"expected min_loc={self.analysis_profile['min_loc']}, "
343
+ f"min_stmt={self.analysis_profile['min_stmt']}); "
344
+ "ignoring cache.",
345
+ status=CacheStatus.ANALYSIS_PROFILE_MISMATCH,
346
+ schema_version=version,
347
+ )
348
+ return None
349
+
312
350
  files_obj = payload.get("files")
313
351
  files_dict = _as_str_dict(files_obj)
314
352
  if files_dict is None:
@@ -337,6 +375,7 @@ class Cache:
337
375
  "version": self._CACHE_VERSION,
338
376
  "python_tag": runtime_tag,
339
377
  "fingerprint_version": self.fingerprint_version,
378
+ "analysis_profile": self.analysis_profile,
340
379
  "files": parsed_files,
341
380
  }
342
381
 
@@ -356,6 +395,7 @@ class Cache:
356
395
  payload: dict[str, object] = {
357
396
  "py": current_python_tag(),
358
397
  "fp": self.fingerprint_version,
398
+ "ap": self.analysis_profile,
359
399
  "files": wire_files,
360
400
  }
361
401
  signed_doc = {
@@ -371,6 +411,7 @@ class Cache:
371
411
  self.data["version"] = self._CACHE_VERSION
372
412
  self.data["python_tag"] = current_python_tag()
373
413
  self.data["fingerprint_version"] = self.fingerprint_version
414
+ self.data["analysis_profile"] = self.analysis_profile
374
415
 
375
416
  except OSError as e:
376
417
  raise CacheError(f"Failed to save cache: {e}") from e
@@ -508,11 +549,13 @@ def _empty_cache_data(
508
549
  version: str,
509
550
  python_tag: str,
510
551
  fingerprint_version: str,
552
+ analysis_profile: AnalysisProfile,
511
553
  ) -> CacheData:
512
554
  return {
513
555
  "version": version,
514
556
  "python_tag": python_tag,
515
557
  "fingerprint_version": fingerprint_version,
558
+ "analysis_profile": analysis_profile,
516
559
  "files": {},
517
560
  }
518
561
 
@@ -542,6 +585,22 @@ def _as_str_dict(value: object) -> dict[str, object] | None:
542
585
  return value
543
586
 
544
587
 
588
+ def _as_analysis_profile(value: object) -> AnalysisProfile | None:
589
+ obj = _as_str_dict(value)
590
+ if obj is None:
591
+ return None
592
+
593
+ if set(obj.keys()) != {"min_loc", "min_stmt"}:
594
+ return None
595
+
596
+ min_loc = _as_int(obj.get("min_loc"))
597
+ min_stmt = _as_int(obj.get("min_stmt"))
598
+ if min_loc is None or min_stmt is None:
599
+ return None
600
+
601
+ return {"min_loc": min_loc, "min_stmt": min_stmt}
602
+
603
+
545
604
  def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None:
546
605
  obj = _as_str_dict(value)
547
606
  if obj is None:
@@ -310,6 +310,8 @@ def _main_impl() -> None:
310
310
  cache_path,
311
311
  root=root_path,
312
312
  max_size_bytes=args.max_cache_size_mb * 1024 * 1024,
313
+ min_loc=args.min_loc,
314
+ min_stmt=args.min_stmt,
313
315
  )
314
316
  cache.load()
315
317
  if cache.load_warning:
@@ -14,7 +14,7 @@ from typing import Final
14
14
  BASELINE_SCHEMA_VERSION: Final = "1.0"
15
15
  BASELINE_FINGERPRINT_VERSION: Final = "1"
16
16
 
17
- CACHE_VERSION: Final = "1.2"
17
+ CACHE_VERSION: Final = "1.3"
18
18
  REPORT_SCHEMA_VERSION: Final = "1.1"
19
19
 
20
20
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.4.2
3
+ Version: 1.4.4
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -49,7 +49,7 @@ Dynamic: license-file
49
49
  ![Baseline](https://img.shields.io/badge/baseline-versioned-green?style=flat-square)
50
50
  [![License](https://img.shields.io/pypi/l/codeclone.svg?style=flat-square)](LICENSE)
51
51
 
52
- **CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
52
+ **CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**.
53
53
  It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI.
54
54
 
55
55
  ---
@@ -75,13 +75,13 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak
75
75
 
76
76
  **Three Detection Levels:**
77
77
 
78
- 1. **Function clones (CFG fingerprint)**
78
+ 1. **Function clones (CFG fingerprint)**
79
79
  Strong structural signal for cross-layer duplication
80
80
 
81
- 2. **Block clones (statement windows)**
81
+ 2. **Block clones (statement windows)**
82
82
  Detects repeated local logic patterns
83
83
 
84
- 3. **Segment clones (report-only)**
84
+ 3. **Segment clones (report-only)**
85
85
  Internal function repetition for explainability; not used for baseline gating
86
86
 
87
87
  **CI-Ready Features:**
@@ -158,12 +158,12 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
158
158
 
159
159
  CodeClone uses a deterministic exit code contract:
160
160
 
161
- | Code | Meaning |
162
- |------|-----------------------------------------------------------------------------|
163
- | `0` | Success — run completed without gating failures |
161
+ | Code | Meaning |
162
+ |------|-------------------------------------------------------------------------------------------------------------------------------------|
163
+ | `0` | Success — run completed without gating failures |
164
164
  | `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating |
165
- | `3` | Gating failure — new clones detected or threshold exceeded |
166
- | `5` | Internal error — unexpected exception |
165
+ | `3` | Gating failure — new clones detected or threshold exceeded |
166
+ | `5` | Internal error — unexpected exception |
167
167
 
168
168
  **Priority:** Contract errors (`2`) override gating failures (`3`) when both occur.
169
169
 
@@ -223,7 +223,7 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
223
223
  "cache_path": "/path/to/.cache/codeclone/cache.json",
224
224
  "cache_used": true,
225
225
  "cache_status": "ok",
226
- "cache_schema_version": "1.2",
226
+ "cache_schema_version": "1.3",
227
227
  "files_skipped_source_io": 0,
228
228
  "groups_counts": {
229
229
  "functions": {
@@ -304,7 +304,8 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md)
304
304
  Cache is an optimization layer only and is never a source of truth.
305
305
 
306
306
  - Default path: `<root>/.cache/codeclone/cache.json`
307
- - Schema version: **v1.2**
307
+ - Schema version: **v1.3**
308
+ - Compatibility includes analysis profile (`min_loc`, `min_stmt`)
308
309
  - Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
309
310
 
310
311
  Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codeclone"
7
- version = "1.4.2"
7
+ version = "1.4.4"
8
8
  description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  license = { text = "MIT" }
@@ -50,6 +50,15 @@ def _make_segment(filepath: str) -> SegmentUnit:
50
50
  )
51
51
 
52
52
 
53
+ def _analysis_payload(cache: Cache, *, files: object) -> dict[str, object]:
54
+ return {
55
+ "py": cache.data["python_tag"],
56
+ "fp": cache.data["fingerprint_version"],
57
+ "ap": cache.data["analysis_profile"],
58
+ "files": files,
59
+ }
60
+
61
+
53
62
  def test_cache_roundtrip(tmp_path: Path) -> None:
54
63
  cache_path = tmp_path / "cache.json"
55
64
  cache = Cache(cache_path)
@@ -97,7 +106,7 @@ def test_get_file_entry_missing_after_fallback_returns_none(tmp_path: Path) -> N
97
106
  assert cache.get_file_entry(str(root / "pkg" / "missing.py")) is None
98
107
 
99
108
 
100
- def test_cache_v12_uses_relpaths_when_root_set(tmp_path: Path) -> None:
109
+ def test_cache_v13_uses_relpaths_when_root_set(tmp_path: Path) -> None:
101
110
  project_root = tmp_path / "project"
102
111
  target = project_root / "pkg" / "module.py"
103
112
  target.parent.mkdir(parents=True, exist_ok=True)
@@ -121,14 +130,10 @@ def test_cache_v12_uses_relpaths_when_root_set(tmp_path: Path) -> None:
121
130
  assert str(target) not in files
122
131
 
123
132
 
124
- def test_cache_v12_missing_optional_sections_default_empty(tmp_path: Path) -> None:
133
+ def test_cache_v13_missing_optional_sections_default_empty(tmp_path: Path) -> None:
125
134
  cache_path = tmp_path / "cache.json"
126
135
  cache = Cache(cache_path)
127
- payload = {
128
- "py": cache.data["python_tag"],
129
- "fp": cache.data["fingerprint_version"],
130
- "files": {"x.py": {"st": [1, 2]}},
131
- }
136
+ payload = _analysis_payload(cache, files={"x.py": {"st": [1, 2]}})
132
137
  signature = cache._sign_data(payload)
133
138
  cache_path.write_text(
134
139
  json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}),
@@ -201,11 +206,7 @@ def test_cache_version_mismatch_warns(tmp_path: Path) -> None:
201
206
  def test_cache_v_field_version_mismatch_warns(tmp_path: Path) -> None:
202
207
  cache_path = tmp_path / "cache.json"
203
208
  cache = Cache(cache_path)
204
- payload = {
205
- "py": cache.data["python_tag"],
206
- "fp": cache.data["fingerprint_version"],
207
- "files": {},
208
- }
209
+ payload = _analysis_payload(cache, files={})
209
210
  signature = cache._sign_data(payload)
210
211
  cache_path.write_text(
211
212
  json.dumps({"v": "0.0", "payload": payload, "sig": signature}), "utf-8"
@@ -527,11 +528,7 @@ def test_cache_load_unreadable_read_graceful_ignore(
527
528
  def test_cache_load_invalid_files_type(tmp_path: Path) -> None:
528
529
  cache_path = tmp_path / "cache.json"
529
530
  cache = Cache(cache_path)
530
- payload = {
531
- "py": cache.data["python_tag"],
532
- "fp": cache.data["fingerprint_version"],
533
- "files": [],
534
- }
531
+ payload = _analysis_payload(cache, files=[])
535
532
  signature = cache._sign_data(payload)
536
533
  cache_path.write_text(
537
534
  json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}),
@@ -644,11 +641,7 @@ def test_cache_load_invalid_top_level_type(tmp_path: Path) -> None:
644
641
  def test_cache_load_missing_v_field(tmp_path: Path) -> None:
645
642
  cache_path = tmp_path / "cache.json"
646
643
  cache = Cache(cache_path)
647
- payload = {
648
- "py": cache.data["python_tag"],
649
- "fp": cache.data["fingerprint_version"],
650
- "files": {},
651
- }
644
+ payload = _analysis_payload(cache, files={})
652
645
  sig = cache._sign_data(payload)
653
646
  cache_path.write_text(json.dumps({"payload": payload, "sig": sig}), "utf-8")
654
647
  cache.load()
@@ -683,7 +676,12 @@ def test_cache_load_missing_python_tag_in_payload(tmp_path: Path) -> None:
683
676
  def test_cache_load_python_tag_mismatch(tmp_path: Path) -> None:
684
677
  cache_path = tmp_path / "cache.json"
685
678
  cache = Cache(cache_path)
686
- payload = {"py": "cp999", "fp": cache.data["fingerprint_version"], "files": {}}
679
+ payload = {
680
+ "py": "cp999",
681
+ "fp": cache.data["fingerprint_version"],
682
+ "ap": cache.data["analysis_profile"],
683
+ "files": {},
684
+ }
687
685
  sig = cache._sign_data(payload)
688
686
  cache_path.write_text(
689
687
  json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
@@ -709,7 +707,12 @@ def test_cache_load_missing_fingerprint_version(tmp_path: Path) -> None:
709
707
  def test_cache_load_fingerprint_version_mismatch(tmp_path: Path) -> None:
710
708
  cache_path = tmp_path / "cache.json"
711
709
  cache = Cache(cache_path)
712
- payload = {"py": cache.data["python_tag"], "fp": "old", "files": {}}
710
+ payload = {
711
+ "py": cache.data["python_tag"],
712
+ "fp": "old",
713
+ "ap": cache.data["analysis_profile"],
714
+ "files": {},
715
+ }
713
716
  sig = cache._sign_data(payload)
714
717
  cache_path.write_text(
715
718
  json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
@@ -719,18 +722,82 @@ def test_cache_load_fingerprint_version_mismatch(tmp_path: Path) -> None:
719
722
  assert "fingerprint version mismatch" in cache.load_warning
720
723
 
721
724
 
722
- def test_cache_load_invalid_wire_file_entry(tmp_path: Path) -> None:
725
+ def test_cache_load_analysis_profile_mismatch(tmp_path: Path) -> None:
726
+ cache_path = tmp_path / "cache.json"
727
+ cache = Cache(cache_path, min_loc=1, min_stmt=1)
728
+ cache.put_file_entry("x.py", {"mtime_ns": 1, "size": 10}, [], [], [])
729
+ cache.save()
730
+
731
+ loaded = Cache(cache_path, min_loc=15, min_stmt=6)
732
+ loaded.load()
733
+
734
+ assert loaded.load_warning is not None
735
+ assert "analysis profile mismatch" in loaded.load_warning
736
+ assert loaded.data["files"] == {}
737
+ assert loaded.load_status == CacheStatus.ANALYSIS_PROFILE_MISMATCH
738
+ assert loaded.cache_schema_version == Cache._CACHE_VERSION
739
+
740
+
741
+ def test_cache_load_missing_analysis_profile_in_payload(tmp_path: Path) -> None:
742
+ cache_path = tmp_path / "cache.json"
743
+ cache = Cache(cache_path)
744
+ payload = {
745
+ "py": cache.data["python_tag"],
746
+ "fp": cache.data["fingerprint_version"],
747
+ "files": {},
748
+ }
749
+ sig = cache._sign_data(payload)
750
+ cache_path.write_text(
751
+ json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
752
+ )
753
+
754
+ cache.load()
755
+ assert cache.load_warning is not None
756
+ assert "format invalid" in cache.load_warning
757
+ assert cache.load_status == CacheStatus.INVALID_TYPE
758
+ assert cache.cache_schema_version == Cache._CACHE_VERSION
759
+ assert cache.data["files"] == {}
760
+
761
+
762
+ @pytest.mark.parametrize(
763
+ "bad_analysis_profile",
764
+ [
765
+ {"min_loc": 15},
766
+ {"min_loc": "15", "min_stmt": 6},
767
+ ],
768
+ )
769
+ def test_cache_load_invalid_analysis_profile_payload(
770
+ tmp_path: Path, bad_analysis_profile: object
771
+ ) -> None:
723
772
  cache_path = tmp_path / "cache.json"
724
773
  cache = Cache(cache_path)
725
774
  payload = {
726
775
  "py": cache.data["python_tag"],
727
776
  "fp": cache.data["fingerprint_version"],
728
- "files": {"x.py": {"st": "bad"}},
777
+ "ap": bad_analysis_profile,
778
+ "files": {},
729
779
  }
730
780
  sig = cache._sign_data(payload)
731
781
  cache_path.write_text(
732
782
  json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
733
783
  )
784
+
785
+ cache.load()
786
+ assert cache.load_warning is not None
787
+ assert "format invalid" in cache.load_warning
788
+ assert cache.load_status == CacheStatus.INVALID_TYPE
789
+ assert cache.cache_schema_version == Cache._CACHE_VERSION
790
+ assert cache.data["files"] == {}
791
+
792
+
793
+ def test_cache_load_invalid_wire_file_entry(tmp_path: Path) -> None:
794
+ cache_path = tmp_path / "cache.json"
795
+ cache = Cache(cache_path)
796
+ payload = _analysis_payload(cache, files={"x.py": {"st": "bad"}})
797
+ sig = cache._sign_data(payload)
798
+ cache_path.write_text(
799
+ json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8"
800
+ )
734
801
  cache.load()
735
802
  assert cache.load_warning is not None
736
803
  assert "format invalid" in cache.load_warning
@@ -708,7 +708,7 @@ def test_cli_cache_status_string_fallback(
708
708
  def __init__(self, _path: Path, **_kwargs: object) -> None:
709
709
  self.load_warning = load_warning
710
710
  self.load_status = "not-a-cache-status"
711
- self.cache_schema_version = "1.2"
711
+ self.cache_schema_version = CACHE_VERSION
712
712
 
713
713
  def load(self) -> None:
714
714
  return None
@@ -1716,6 +1716,122 @@ def test_cli_reports_cache_meta_when_cache_missing(
1716
1716
  assert meta["cache_schema_version"] is None
1717
1717
 
1718
1718
 
1719
+ @pytest.mark.parametrize(
1720
+ (
1721
+ "first_min_loc",
1722
+ "first_min_stmt",
1723
+ "second_min_loc",
1724
+ "second_min_stmt",
1725
+ "expected_cache_used",
1726
+ "expected_cache_status",
1727
+ "expected_functions_total",
1728
+ "expected_warning",
1729
+ ),
1730
+ [
1731
+ (
1732
+ 1,
1733
+ 1,
1734
+ 15,
1735
+ 6,
1736
+ False,
1737
+ "analysis_profile_mismatch",
1738
+ 0,
1739
+ "analysis profile mismatch",
1740
+ ),
1741
+ (
1742
+ 15,
1743
+ 6,
1744
+ 1,
1745
+ 1,
1746
+ False,
1747
+ "analysis_profile_mismatch",
1748
+ 1,
1749
+ "analysis profile mismatch",
1750
+ ),
1751
+ (1, 1, 1, 1, True, "ok", 1, None),
1752
+ ],
1753
+ )
1754
+ def test_cli_cache_analysis_profile_compatibility(
1755
+ tmp_path: Path,
1756
+ monkeypatch: pytest.MonkeyPatch,
1757
+ capsys: pytest.CaptureFixture[str],
1758
+ first_min_loc: int,
1759
+ first_min_stmt: int,
1760
+ second_min_loc: int,
1761
+ second_min_stmt: int,
1762
+ expected_cache_used: bool,
1763
+ expected_cache_status: str,
1764
+ expected_functions_total: int,
1765
+ expected_warning: str | None,
1766
+ ) -> None:
1767
+ src = tmp_path / "a.py"
1768
+ src.write_text(
1769
+ """
1770
+ def f1():
1771
+ x = 1
1772
+ return x
1773
+
1774
+ def f2():
1775
+ y = 1
1776
+ return y
1777
+ """,
1778
+ "utf-8",
1779
+ )
1780
+ baseline_path = _write_baseline(
1781
+ tmp_path / "baseline.json",
1782
+ python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
1783
+ )
1784
+ cache_path = tmp_path / "cache.json"
1785
+ json_first = tmp_path / "report-first.json"
1786
+ json_second = tmp_path / "report-second.json"
1787
+ _patch_parallel(monkeypatch)
1788
+
1789
+ _run_main(
1790
+ monkeypatch,
1791
+ [
1792
+ str(tmp_path),
1793
+ "--baseline",
1794
+ str(baseline_path),
1795
+ "--cache-path",
1796
+ str(cache_path),
1797
+ "--json",
1798
+ str(json_first),
1799
+ "--min-loc",
1800
+ str(first_min_loc),
1801
+ "--min-stmt",
1802
+ str(first_min_stmt),
1803
+ "--no-progress",
1804
+ ],
1805
+ )
1806
+ capsys.readouterr()
1807
+
1808
+ _run_main(
1809
+ monkeypatch,
1810
+ [
1811
+ str(tmp_path),
1812
+ "--baseline",
1813
+ str(baseline_path),
1814
+ "--cache-path",
1815
+ str(cache_path),
1816
+ "--json",
1817
+ str(json_second),
1818
+ "--min-loc",
1819
+ str(second_min_loc),
1820
+ "--min-stmt",
1821
+ str(second_min_stmt),
1822
+ "--no-progress",
1823
+ ],
1824
+ )
1825
+ out = capsys.readouterr().out
1826
+ payload = json.loads(json_second.read_text("utf-8"))
1827
+ meta = payload["meta"]
1828
+ if expected_warning is not None:
1829
+ assert expected_warning in out
1830
+ assert meta["cache_used"] is expected_cache_used
1831
+ assert meta["cache_status"] == expected_cache_status
1832
+ assert meta["groups_counts"]["functions"]["total"] == expected_functions_total
1833
+
1834
+
1719
1835
  @pytest.mark.parametrize(
1720
1836
  ("flag", "bad_name", "label", "expected"),
1721
1837
  [
@@ -6,7 +6,7 @@ from typing import Any
6
6
 
7
7
  import pytest
8
8
 
9
- from codeclone.contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL
9
+ from codeclone.contracts import CACHE_VERSION, DOCS_URL, ISSUES_URL, REPOSITORY_URL
10
10
  from codeclone.errors import FileProcessingError
11
11
  from codeclone.html_report import (
12
12
  _FileCache,
@@ -507,7 +507,7 @@ def test_html_report_includes_provenance_metadata(
507
507
  'data-cache-used="true"',
508
508
  "Cache schema",
509
509
  "Cache status",
510
- 'data-cache-schema-version="1.2"',
510
+ f'data-cache-schema-version="{CACHE_VERSION}"',
511
511
  'data-cache-status="ok"',
512
512
  'data-files-skipped-source-io="0"',
513
513
  "Source IO skipped",
@@ -7,7 +7,7 @@ from typing import cast
7
7
  import pytest
8
8
 
9
9
  import codeclone.report as report_mod
10
- from codeclone.contracts import REPORT_SCHEMA_VERSION
10
+ from codeclone.contracts import CACHE_VERSION, REPORT_SCHEMA_VERSION
11
11
  from codeclone.report import (
12
12
  GroupMap,
13
13
  build_block_group_facts,
@@ -276,7 +276,7 @@ def test_report_output_formats(
276
276
  '"baseline_schema_version": 1',
277
277
  f'"baseline_payload_sha256": "{"a" * 64}"',
278
278
  '"baseline_payload_sha256_verified": true',
279
- '"cache_schema_version": "1.2"',
279
+ f'"cache_schema_version": "{CACHE_VERSION}"',
280
280
  '"cache_status": "ok"',
281
281
  '"files_skipped_source_io": 0',
282
282
  ]
@@ -288,7 +288,7 @@ def test_report_output_formats(
288
288
  "Baseline generator name: codeclone",
289
289
  f"Baseline payload sha256: {'a' * 64}",
290
290
  "Baseline payload verified: true",
291
- "Cache schema version: 1.2",
291
+ f"Cache schema version: {CACHE_VERSION}",
292
292
  "Cache status: ok",
293
293
  "Source IO skipped: 0",
294
294
  "FUNCTION CLONES (NEW) (groups=2)",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes