codeclone 1.4.0__tar.gz → 1.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66):
  1. {codeclone-1.4.0 → codeclone-1.4.2}/PKG-INFO +1 -1
  2. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_cli_summary.py +11 -1
  3. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_html_snippets.py +9 -2
  4. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/blocks.py +19 -2
  5. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/cache.py +5 -5
  6. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/cli.py +46 -63
  7. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/extractor.py +38 -23
  8. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/html_report.py +1 -2
  9. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/scanner.py +12 -8
  10. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/templates.py +205 -110
  11. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/ui_messages.py +3 -3
  12. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone.egg-info/PKG-INFO +1 -1
  13. {codeclone-1.4.0 → codeclone-1.4.2}/pyproject.toml +1 -1
  14. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cli_unit.py +8 -4
  15. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_extractor.py +105 -0
  16. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_html_report.py +2 -2
  17. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_security.py +22 -6
  18. {codeclone-1.4.0 → codeclone-1.4.2}/LICENSE +0 -0
  19. {codeclone-1.4.0 → codeclone-1.4.2}/README.md +0 -0
  20. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/__init__.py +0 -0
  21. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_cli_args.py +0 -0
  22. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_cli_meta.py +0 -0
  23. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_cli_paths.py +0 -0
  24. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_html_escape.py +0 -0
  25. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_blocks.py +0 -0
  26. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_explain.py +0 -0
  27. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_explain_contract.py +0 -0
  28. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_grouping.py +0 -0
  29. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_segments.py +0 -0
  30. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_serialize.py +0 -0
  31. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/_report_types.py +0 -0
  32. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/baseline.py +0 -0
  33. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/blockhash.py +0 -0
  34. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/cfg.py +0 -0
  35. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/cfg_model.py +0 -0
  36. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/contracts.py +0 -0
  37. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/errors.py +0 -0
  38. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/fingerprint.py +0 -0
  39. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/meta_markers.py +0 -0
  40. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/normalize.py +0 -0
  41. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/py.typed +0 -0
  42. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone/report.py +0 -0
  43. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone.egg-info/SOURCES.txt +0 -0
  44. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone.egg-info/dependency_links.txt +0 -0
  45. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone.egg-info/entry_points.txt +0 -0
  46. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone.egg-info/requires.txt +0 -0
  47. {codeclone-1.4.0 → codeclone-1.4.2}/codeclone.egg-info/top_level.txt +0 -0
  48. {codeclone-1.4.0 → codeclone-1.4.2}/setup.cfg +0 -0
  49. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_baseline.py +0 -0
  50. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_blockhash.py +0 -0
  51. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_blocks.py +0 -0
  52. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cache.py +0 -0
  53. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cfg.py +0 -0
  54. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cfg_model.py +0 -0
  55. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cli_inprocess.py +0 -0
  56. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cli_main_guard.py +0 -0
  57. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cli_main_guard_runpy.py +0 -0
  58. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_cli_smoke.py +0 -0
  59. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_detector_golden.py +0 -0
  60. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_fingerprint.py +0 -0
  61. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_init.py +0 -0
  62. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_normalize.py +0 -0
  63. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_report.py +0 -0
  64. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_report_explain.py +0 -0
  65. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_scanner_extra.py +0 -0
  66. {codeclone-1.4.0 → codeclone-1.4.2}/tests/test_segments.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeclone
3
- Version: 1.4.0
3
+ Version: 1.4.2
4
4
  Summary: AST and CFG-based code clone detector for Python focused on architectural duplication
5
5
  Author-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
6
6
  Maintainer-email: Den Rozhnovskiy <pytelemonbot@mail.ru>
@@ -14,6 +14,14 @@ from rich.text import Text
14
14
 
15
15
  from . import ui_messages as ui
16
16
 
17
+ _CLONE_LABELS = frozenset(
18
+ {
19
+ ui.SUMMARY_LABEL_FUNCTION,
20
+ ui.SUMMARY_LABEL_BLOCK,
21
+ ui.SUMMARY_LABEL_SEGMENT,
22
+ }
23
+ )
24
+
17
25
 
18
26
  def _summary_value_style(*, label: str, value: int) -> str:
19
27
  if value == 0:
@@ -22,7 +30,9 @@ def _summary_value_style(*, label: str, value: int) -> str:
22
30
  return "bold red"
23
31
  if label == ui.SUMMARY_LABEL_SUPPRESSED:
24
32
  return "yellow"
25
- return "bold green"
33
+ if label in _CLONE_LABELS:
34
+ return "bold yellow"
35
+ return "bold"
26
36
 
27
37
 
28
38
  def _build_summary_rows(
@@ -196,9 +196,16 @@ def _render_code_block(
196
196
  rendered.append(
197
197
  f'<div class="{cls}">{html.escape(text, quote=False)}</div>'
198
198
  )
199
- body = "\n".join(rendered)
199
+ body = "".join(rendered)
200
200
  else:
201
- body = highlighted
201
+ hit_flags = [hit for hit, _ in numbered]
202
+ pyg_lines = highlighted.split("\n")
203
+ rendered_pyg: list[str] = []
204
+ for i, pyg_line in enumerate(pyg_lines):
205
+ hit = hit_flags[i] if i < len(hit_flags) else False
206
+ cls = "hitline" if hit else "line"
207
+ rendered_pyg.append(f'<div class="{cls}">{pyg_line}</div>')
208
+ body = "".join(rendered_pyg)
202
209
 
203
210
  return _Snippet(
204
211
  filepath=filepath,
@@ -9,6 +9,7 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
+ from collections.abc import Sequence
12
13
  from dataclasses import dataclass
13
14
 
14
15
  from .blockhash import stmt_hash
@@ -45,12 +46,20 @@ def extract_blocks(
45
46
  cfg: NormalizationConfig,
46
47
  block_size: int,
47
48
  max_blocks: int,
49
+ precomputed_hashes: Sequence[str] | None = None,
48
50
  ) -> list[BlockUnit]:
49
51
  body = getattr(func_node, "body", None)
50
52
  if not isinstance(body, list) or len(body) < block_size:
51
53
  return []
52
54
 
53
- stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
55
+ if precomputed_hashes is not None:
56
+ assert len(precomputed_hashes) == len(body), (
57
+ f"precomputed_hashes length {len(precomputed_hashes)} "
58
+ f"!= body length {len(body)}"
59
+ )
60
+ stmt_hashes = precomputed_hashes
61
+ else:
62
+ stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
54
63
 
55
64
  blocks: list[BlockUnit] = []
56
65
  last_start: int | None = None
@@ -94,12 +103,20 @@ def extract_segments(
94
103
  cfg: NormalizationConfig,
95
104
  window_size: int,
96
105
  max_segments: int,
106
+ precomputed_hashes: Sequence[str] | None = None,
97
107
  ) -> list[SegmentUnit]:
98
108
  body = getattr(func_node, "body", None)
99
109
  if not isinstance(body, list) or len(body) < window_size:
100
110
  return []
101
111
 
102
- stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
112
+ if precomputed_hashes is not None:
113
+ assert len(precomputed_hashes) == len(body), (
114
+ f"precomputed_hashes length {len(precomputed_hashes)} "
115
+ f"!= body length {len(body)}"
116
+ )
117
+ stmt_hashes = precomputed_hashes
118
+ else:
119
+ stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
103
120
 
104
121
  segments: list[SegmentUnit] = []
105
122
 
@@ -344,14 +344,14 @@ class Cache:
344
344
  try:
345
345
  self.path.parent.mkdir(parents=True, exist_ok=True)
346
346
  wire_files: dict[str, object] = {}
347
- for runtime_path in sorted(
348
- self.data["files"], key=self._wire_filepath_from_runtime
349
- ):
347
+ wire_map = {
348
+ rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"]
349
+ }
350
+ for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
350
351
  entry = self.get_file_entry(runtime_path)
351
352
  if entry is None:
352
353
  continue
353
- wire_path = self._wire_filepath_from_runtime(runtime_path)
354
- wire_files[wire_path] = _encode_wire_file_entry(entry)
354
+ wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry)
355
355
 
356
356
  payload: dict[str, object] = {
357
357
  "py": current_python_tag(),
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  import sys
5
+ import time
5
6
  from collections.abc import Mapping, Sequence
6
7
  from concurrent.futures import Future, ProcessPoolExecutor, as_completed
7
8
  from dataclasses import asdict, dataclass
@@ -71,7 +72,6 @@ custom_theme = Theme(
71
72
  }
72
73
  )
73
74
 
74
-
75
75
  LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser()
76
76
 
77
77
 
@@ -122,14 +122,14 @@ def process_file(
122
122
  """
123
123
 
124
124
  try:
125
- # Check file size
125
+ # Single os.stat() for both size check and cache signature
126
126
  try:
127
- st_size = os.path.getsize(filepath)
128
- if st_size > MAX_FILE_SIZE:
127
+ st = os.stat(filepath)
128
+ if st.st_size > MAX_FILE_SIZE:
129
129
  return ProcessingResult(
130
130
  filepath=filepath,
131
131
  success=False,
132
- error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
132
+ error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})",
133
133
  error_kind="file_too_large",
134
134
  )
135
135
  except OSError as e:
@@ -140,6 +140,8 @@ def process_file(
140
140
  error_kind="stat_error",
141
141
  )
142
142
 
143
+ stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size}
144
+
143
145
  try:
144
146
  source = Path(filepath).read_text("utf-8")
145
147
  except UnicodeDecodeError as e:
@@ -157,7 +159,6 @@ def process_file(
157
159
  error_kind="source_read_error",
158
160
  )
159
161
 
160
- stat = file_stat_signature(filepath)
161
162
  module_name = module_name_from_path(root, filepath)
162
163
 
163
164
  units, blocks, segments = extract_units_from_source(
@@ -238,6 +239,8 @@ def _main_impl() -> None:
238
239
  )
239
240
  sys.exit(ExitCode.CONTRACT_ERROR)
240
241
 
242
+ t0 = time.monotonic()
243
+
241
244
  if not args.quiet:
242
245
  print_banner()
243
246
 
@@ -353,68 +356,44 @@ def _main_impl() -> None:
353
356
  return None, str(e)
354
357
 
355
358
  # Discovery phase
356
- try:
357
- if args.quiet:
358
- for fp in iter_py_files(str(root_path)):
359
- files_found += 1
360
- stat, cached, warn = _get_cached_entry(fp)
361
- if warn:
362
- console.print(warn)
363
- files_skipped += 1
364
- continue
365
- if cached and cached.get("stat") == stat:
366
- cache_hits += 1
367
- all_units.extend(
368
- cast(
369
- list[GroupItem],
370
- cast(object, cached.get("units", [])),
371
- )
359
+ def _discover_files() -> None:
360
+ nonlocal files_found, cache_hits, files_skipped
361
+ for fp in iter_py_files(str(root_path)):
362
+ files_found += 1
363
+ stat, cached, warn = _get_cached_entry(fp)
364
+ if warn:
365
+ console.print(warn)
366
+ files_skipped += 1
367
+ continue
368
+ if cached and cached.get("stat") == stat:
369
+ cache_hits += 1
370
+ all_units.extend(
371
+ cast(
372
+ list[GroupItem],
373
+ cast(object, cached.get("units", [])),
372
374
  )
373
- all_blocks.extend(
374
- cast(
375
- list[GroupItem],
376
- cast(object, cached.get("blocks", [])),
377
- )
375
+ )
376
+ all_blocks.extend(
377
+ cast(
378
+ list[GroupItem],
379
+ cast(object, cached.get("blocks", [])),
378
380
  )
379
- all_segments.extend(
380
- cast(
381
- list[GroupItem],
382
- cast(object, cached.get("segments", [])),
383
- )
381
+ )
382
+ all_segments.extend(
383
+ cast(
384
+ list[GroupItem],
385
+ cast(object, cached.get("segments", [])),
384
386
  )
385
- else:
386
- files_to_process.append(fp)
387
+ )
388
+ else:
389
+ files_to_process.append(fp)
390
+
391
+ try:
392
+ if args.quiet:
393
+ _discover_files()
387
394
  else:
388
395
  with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
389
- for fp in iter_py_files(str(root_path)):
390
- files_found += 1
391
- stat, cached, warn = _get_cached_entry(fp)
392
- if warn:
393
- console.print(warn)
394
- files_skipped += 1
395
- continue
396
- if cached and cached.get("stat") == stat:
397
- cache_hits += 1
398
- all_units.extend(
399
- cast(
400
- list[GroupItem],
401
- cast(object, cached.get("units", [])),
402
- )
403
- )
404
- all_blocks.extend(
405
- cast(
406
- list[GroupItem],
407
- cast(object, cached.get("blocks", [])),
408
- )
409
- )
410
- all_segments.extend(
411
- cast(
412
- list[GroupItem],
413
- cast(object, cached.get("segments", [])),
414
- )
415
- )
416
- else:
417
- files_to_process.append(fp)
396
+ _discover_files()
418
397
  except OSError as e:
419
398
  console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e)))
420
399
  sys.exit(ExitCode.CONTRACT_ERROR)
@@ -900,6 +879,10 @@ def _main_impl() -> None:
900
879
  if not args.update_baseline and not args.fail_on_new and new_clones_count > 0:
901
880
  console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL)
902
881
 
882
+ if not args.quiet:
883
+ elapsed = time.monotonic() - t0
884
+ console.print(f"\n[dim]Done in {elapsed:.1f}s[/dim]")
885
+
903
886
 
904
887
  def main() -> None:
905
888
  try:
@@ -16,6 +16,7 @@ from collections.abc import Iterator
16
16
  from contextlib import contextmanager
17
17
  from dataclasses import dataclass
18
18
 
19
+ from .blockhash import stmt_hash
19
20
  from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments
20
21
  from .cfg import CFGBuilder
21
22
  from .errors import ParseError
@@ -250,28 +251,42 @@ def extract_units_from_source(
250
251
  )
251
252
  )
252
253
 
253
- # Block-level units (exclude __init__)
254
- if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
255
- blocks = extract_blocks(
256
- node,
257
- filepath=filepath,
258
- qualname=qualname,
259
- cfg=cfg,
260
- block_size=4,
261
- max_blocks=15,
262
- )
263
- block_units.extend(blocks)
264
-
265
- # Segment-level units (windows within functions, for internal clones)
266
- if loc >= 30 and stmt_count >= 12:
267
- segments = extract_segments(
268
- node,
269
- filepath=filepath,
270
- qualname=qualname,
271
- cfg=cfg,
272
- window_size=6,
273
- max_segments=60,
274
- )
275
- segment_units.extend(segments)
254
+ # Block-level and segment-level units share statement hashes
255
+ needs_blocks = (
256
+ not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10
257
+ )
258
+ needs_segments = loc >= 30 and stmt_count >= 12
259
+
260
+ if needs_blocks or needs_segments:
261
+ body = getattr(node, "body", None)
262
+ hashes: list[str] | None = None
263
+ if isinstance(body, list):
264
+ hashes = [stmt_hash(stmt, cfg) for stmt in body]
265
+
266
+ if needs_blocks:
267
+ block_units.extend(
268
+ extract_blocks(
269
+ node,
270
+ filepath=filepath,
271
+ qualname=qualname,
272
+ cfg=cfg,
273
+ block_size=4,
274
+ max_blocks=15,
275
+ precomputed_hashes=hashes,
276
+ )
277
+ )
278
+
279
+ if needs_segments:
280
+ segment_units.extend(
281
+ extract_segments(
282
+ node,
283
+ filepath=filepath,
284
+ qualname=qualname,
285
+ cfg=cfg,
286
+ window_size=6,
287
+ max_segments=60,
288
+ precomputed_hashes=hashes,
289
+ )
290
+ )
276
291
 
277
292
  return units, block_units, segment_units
@@ -760,10 +760,9 @@ def build_html_report(
760
760
  f'<div class="meta-panel" id="report-meta" {meta_attrs}>'
761
761
  '<div class="meta-header">'
762
762
  '<div class="meta-title">'
763
- f"{chevron_icon}"
764
763
  "Report Provenance"
765
764
  "</div>"
766
- '<div class="meta-toggle collapsed">▸</div>'
765
+ f'<div class="meta-toggle collapsed">{chevron_icon}</div>'
767
766
  "</div>"
768
767
  '<div class="meta-content collapsed">'
769
768
  f'<div class="meta-grid">{meta_rows_html}</div>'
@@ -77,8 +77,9 @@ def iter_py_files(
77
77
  if root_str.startswith(sensitive + "/"):
78
78
  raise ValidationError(f"Cannot scan under sensitive directory: {root}")
79
79
 
80
- file_count = 0
81
- for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)):
80
+ # Collect and filter first, then sort — avoids sorting excluded paths
81
+ candidates: list[Path] = []
82
+ for p in rootp.rglob("*.py"):
82
83
  # Verify path is actually under root (prevent symlink attacks)
83
84
  try:
84
85
  p.resolve().relative_to(rootp)
@@ -90,12 +91,15 @@ def iter_py_files(
90
91
  if any(ex in parts for ex in excludes):
91
92
  continue
92
93
 
93
- file_count += 1
94
- if file_count > max_files:
95
- raise ValidationError(
96
- f"File count exceeds limit of {max_files}. "
97
- "Use more specific root or increase limit."
98
- )
94
+ candidates.append(p)
95
+
96
+ if len(candidates) > max_files:
97
+ raise ValidationError(
98
+ f"File count exceeds limit of {max_files}. "
99
+ "Use more specific root or increase limit."
100
+ )
101
+
102
+ for p in sorted(candidates, key=lambda path: str(path)):
99
103
  yield str(p)
100
104
 
101
105