codeclone 1.2.0-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
codeclone/cli.py CHANGED
@@ -1,36 +1,32 @@
-"""
-CodeClone — AST and CFG-based code clone detector for Python
-focused on architectural duplication.
-
-Copyright (c) 2026 Den Rozhnovskiy
-Licensed under the MIT License.
-"""
-
 from __future__ import annotations
 
 import argparse
+import os
 import sys
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import asdict, dataclass
 from pathlib import Path
+from typing import Any, cast
 
 from rich.console import Console
 from rich.panel import Panel
 from rich.progress import (
+    BarColumn,
     Progress,
     SpinnerColumn,
     TextColumn,
-    BarColumn,
     TimeElapsedColumn,
 )
 from rich.table import Table
 from rich.theme import Theme
 
 from .baseline import Baseline
-from .cache import Cache, file_stat_signature
+from .cache import Cache, CacheEntry, FileStat, file_stat_signature
+from .errors import CacheError
 from .extractor import extract_units_from_source
 from .html_report import build_html_report
 from .normalize import NormalizationConfig
-from .report import build_groups, build_block_groups, to_json, to_text
+from .report import build_block_groups, build_groups, to_json_report, to_text
 from .scanner import iter_py_files, module_name_from_path
 
 # Custom theme for Rich
@@ -45,6 +41,21 @@ custom_theme = Theme(
 )
 console = Console(theme=custom_theme, width=200)
 
+MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+BATCH_SIZE = 100
+
+
+@dataclass(slots=True)
+class ProcessingResult:
+    """Result of processing a single file."""
+
+    filepath: str
+    success: bool
+    error: str | None = None
+    units: list[Any] | None = None
+    blocks: list[Any] | None = None
+    stat: FileStat | None = None
+
 
 def expand_path(p: str) -> Path:
     return Path(p).expanduser().resolve()
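
Review note on the hunk above: `@dataclass(slots=True)` stores fields in fixed slots instead of a per-instance `__dict__`, which keeps these objects cheap when one is created per scanned file, and the `asdict` imported earlier turns them into plain dicts for caching and reporting. A minimal, self-contained sketch of that round-trip (Python 3.10+, values illustrative):

    from dataclasses import asdict, dataclass

    @dataclass(slots=True)
    class ProcessingResult:  # abbreviated copy of the class in the hunk above
        filepath: str
        success: bool
        error: str | None = None

    r = ProcessingResult("pkg/mod.py", success=False, error="Encoding error")
    assert asdict(r) == {"filepath": "pkg/mod.py", "success": False, "error": "Encoding error"}
    # slots=True also rejects stray attributes: r.extra = 1 raises AttributeError.
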
@@ -56,31 +67,76 @@ def process_file(
     cfg: NormalizationConfig,
     min_loc: int,
     min_stmt: int,
-) -> tuple[str, dict, list, list] | None:
+) -> ProcessingResult:
+    """
+    Process a single Python file with comprehensive error handling.
+
+    Args:
+        filepath: Absolute path to the file
+        root: Root directory of the scan
+        cfg: Normalization configuration
+        min_loc: Minimum lines of code to consider a function
+        min_stmt: Minimum statements to consider a function
+
+    Returns:
+        ProcessingResult object indicating success/failure and containing
+        extracted units/blocks if successful.
+    """
+
     try:
-        source = Path(filepath).read_text("utf-8")
-    except UnicodeDecodeError:
-        return None
-
-    stat = file_stat_signature(filepath)
-    module_name = module_name_from_path(root, filepath)
-
-    units, blocks = extract_units_from_source(
-        source=source,
-        filepath=filepath,
-        module_name=module_name,
-        cfg=cfg,
-        min_loc=min_loc,
-        min_stmt=min_stmt,
-    )
+        # Check file size
+        try:
+            st_size = os.path.getsize(filepath)
+            if st_size > MAX_FILE_SIZE:
+                return ProcessingResult(
+                    filepath=filepath,
+                    success=False,
+                    error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
+                )
+        except OSError as e:
+            return ProcessingResult(
+                filepath=filepath, success=False, error=f"Cannot stat file: {e}"
+            )
 
-    return filepath, stat, units, blocks
+        try:
+            source = Path(filepath).read_text("utf-8")
+        except UnicodeDecodeError as e:
+            return ProcessingResult(
+                filepath=filepath, success=False, error=f"Encoding error: {e}"
+            )
+
+        stat = file_stat_signature(filepath)
+        module_name = module_name_from_path(root, filepath)
+
+        units, blocks = extract_units_from_source(
+            source=source,
+            filepath=filepath,
+            module_name=module_name,
+            cfg=cfg,
+            min_loc=min_loc,
+            min_stmt=min_stmt,
+        )
+
+        return ProcessingResult(
+            filepath=filepath,
+            success=True,
+            units=units,
+            blocks=blocks,
+            stat=stat,
+        )
+
+    except Exception as e:
+        return ProcessingResult(
+            filepath=filepath,
+            success=False,
+            error=f"Unexpected error: {type(e).__name__}: {e}",
+        )
 
 
-def print_banner():
+def print_banner() -> None:
     console.print(
         Panel.fit(
-            "[bold white]CodeClone[/bold white] [dim]v1.2.0[/dim]\n"
+            "[bold white]CodeClone[/bold white] [dim]v1.2.1[/dim]\n"
             "[italic]Architectural duplication detector[/italic]",
             border_style="blue",
             padding=(0, 2),
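
The key shift in `process_file` is that every failure, up to the final blanket `except Exception`, becomes data on the returned `ProcessingResult` instead of an exception. That matters under `ProcessPoolExecutor`: an exception re-raised from `future.result()` no longer knows which file it belonged to (the old code could only log `Failed to process file: {e}`), whereas the result object carries `filepath` and `error` together, which is what feeds the new `failed_files` summary further down. A sketch of the caller-side contract (path and thresholds hypothetical; assumes `NormalizationConfig()` constructs with defaults):

    cfg = NormalizationConfig()
    result = process_file("/repo/pkg/mod.py", "/repo", cfg, min_loc=5, min_stmt=3)
    if result.success:
        units = result.units or []  # populated only on success
    else:
        print(f"skipped {result.filepath}: {result.error}")  # failure as data
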
@@ -185,9 +241,13 @@ def main() -> None:
 
     print_banner()
 
-    root_path = Path(args.root).resolve()
-    if not root_path.exists():
-        console.print(f"[error]Root path does not exist: {root_path}[/error]")
+    try:
+        root_path = Path(args.root).resolve()
+        if not root_path.exists():
+            console.print(f"[error]Root path does not exist: {root_path}[/error]")
+            sys.exit(1)
+    except Exception as e:
+        console.print(f"[error]Invalid root path: {e}[/error]")
         sys.exit(1)
 
     console.print(f"[info]Scanning root:[/info] {root_path}")
@@ -197,101 +257,213 @@ def main() -> None:
     cache_path = Path(args.cache_dir).expanduser()
     cache = Cache(cache_path)
     cache.load()
+    if cache.load_warning:
+        console.print(f"[warning]{cache.load_warning}[/warning]")
 
-    all_units: list[dict] = []
-    all_blocks: list[dict] = []
+    all_units: list[dict[str, Any]] = []
+    all_blocks: list[dict[str, Any]] = []
     changed_files_count = 0
     files_to_process: list[str] = []
 
+    def _get_cached_entry(
+        fp: str,
+    ) -> tuple[FileStat | None, CacheEntry | None, str | None]:
+        try:
+            stat = file_stat_signature(fp)
+        except OSError as e:
+            return None, None, f"[warning]Skipping file {fp}: {e}[/warning]"
+        cached = cache.get_file_entry(fp)
+        return stat, cached, None
+
+    def _safe_process_file(fp: str) -> ProcessingResult | None:
+        try:
+            return process_file(
+                fp,
+                str(root_path),
+                cfg,
+                args.min_loc,
+                args.min_stmt,
+            )
+        except Exception as e:
+            console.print(f"[warning]Worker failed: {e}[/warning]")
+            return None
+
+    def _safe_future_result(future: Any) -> tuple[ProcessingResult | None, str | None]:
+        try:
+            return future.result(), None
+        except Exception as e:
+            return None, str(e)
+
     # Discovery phase
     with console.status("[bold green]Discovering Python files...", spinner="dots"):
-        for fp in iter_py_files(str(root_path)):
-            stat = file_stat_signature(fp)
-            cached = cache.get_file_entry(fp)
-            if cached and cached.get("stat") == stat:
-                all_units.extend(cached.get("units", []))
-                all_blocks.extend(cached.get("blocks", []))
-            else:
-                files_to_process.append(fp)
+        try:
+            for fp in iter_py_files(str(root_path)):
+                stat, cached, warn = _get_cached_entry(fp)
+                if warn:
+                    console.print(warn)
+                    continue
+                if cached and cached.get("stat") == stat:
+                    all_units.extend(
+                        cast(
+                            list[dict[str, Any]],
+                            cast(object, cached.get("units", [])),
+                        )
+                    )
+                    all_blocks.extend(
+                        cast(
+                            list[dict[str, Any]],
+                            cast(object, cached.get("blocks", [])),
+                        )
+                    )
+                else:
+                    files_to_process.append(fp)
+        except Exception as e:
+            console.print(f"[error]Scan failed: {e}[/error]")
+            sys.exit(1)
 
     total_files = len(files_to_process)
+    failed_files = []
 
     # Processing phase
     if total_files > 0:
-        if args.no_progress:
-            console.print(f"[info]Processing {total_files} changed files...[/info]")
+
+        def handle_result(result: ProcessingResult) -> None:
+            nonlocal changed_files_count
+            if result.success and result.stat:
+                cache.put_file_entry(
+                    result.filepath,
+                    result.stat,
+                    result.units or [],
+                    result.blocks or [],
+                )
+                changed_files_count += 1
+                if result.units:
+                    all_units.extend([asdict(u) for u in result.units])
+                if result.blocks:
+                    all_blocks.extend([asdict(b) for b in result.blocks])
+            else:
+                failed_files.append(f"{result.filepath}: {result.error}")
+
+        def process_sequential(with_progress: bool) -> None:
+            if with_progress:
+                with Progress(
+                    SpinnerColumn(),
+                    TextColumn("[progress.description]{task.description}"),
+                    BarColumn(),
+                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                    TimeElapsedColumn(),
+                    console=console,
+                ) as progress:
+                    task = progress.add_task(
+                        f"Analyzing {total_files} files...", total=total_files
+                    )
+                    for fp in files_to_process:
+                        result = _safe_process_file(fp)
+                        if result is not None:
+                            handle_result(result)
+                        progress.advance(task)
+            else:
+                console.print(f"[info]Processing {total_files} changed files...[/info]")
+                for fp in files_to_process:
+                    result = _safe_process_file(fp)
+                    if result is not None:
+                        handle_result(result)
+
+        try:
             with ProcessPoolExecutor(max_workers=args.processes) as executor:
-                futures = [
-                    executor.submit(
-                        process_file,
-                        fp,
-                        str(root_path),
-                        cfg,
-                        args.min_loc,
-                        args.min_stmt,
+                if args.no_progress:
+                    console.print(
+                        f"[info]Processing {total_files} changed files...[/info]"
                     )
-                    for fp in files_to_process
-                ]
-                for future in as_completed(futures):
-                    try:
-                        result = future.result()
-                    except Exception as e:
-                        console.print(f"[warning]Failed to process file: {e}[/warning]")
-                        continue
-
-                    if result:
-                        fp, stat, units, blocks = result
-                        cache.put_file_entry(fp, stat, units, blocks)
-                        changed_files_count += 1
-                        all_units.extend([u.__dict__ for u in units])
-                        all_blocks.extend([b.__dict__ for b in blocks])
-        else:
-            with Progress(
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                BarColumn(),
-                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
-                TimeElapsedColumn(),
-                console=console,
-            ) as progress:
-                task = progress.add_task(
-                    f"Analyzing {total_files} files...", total=total_files
-                )
-                with ProcessPoolExecutor(max_workers=args.processes) as executor:
-                    futures = [
-                        executor.submit(
-                            process_file,
-                            fp,
-                            str(root_path),
-                            cfg,
-                            args.min_loc,
-                            args.min_stmt,
+
+                    # Process in batches to manage memory
+                    for i in range(0, total_files, BATCH_SIZE):
+                        batch = files_to_process[i : i + BATCH_SIZE]
+                        futures = [
+                            executor.submit(
+                                process_file,
+                                fp,
+                                str(root_path),
+                                cfg,
+                                args.min_loc,
+                                args.min_stmt,
+                            )
+                            for fp in batch
+                        ]
+
+                        for future in as_completed(futures):
+                            result, err = _safe_future_result(future)
+                            if result is not None:
+                                handle_result(result)
+                            elif err is not None:
+                                console.print(
+                                    "[warning]Failed to process batch item: "
+                                    f"{err}[/warning]"
+                                )
+
+                else:
+                    with Progress(
+                        SpinnerColumn(),
+                        TextColumn("[progress.description]{task.description}"),
+                        BarColumn(),
+                        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                        TimeElapsedColumn(),
+                        console=console,
+                    ) as progress:
+                        task = progress.add_task(
+                            f"Analyzing {total_files} files...", total=total_files
                         )
-                        for fp in files_to_process
-                    ]
-                    for future in as_completed(futures):
-                        try:
-                            result = future.result()
-                        except Exception:
-                            # Log error but keep progress bar moving?
-                            # console.print might break progress bar layout, better to rely on rich logging or just skip
-                            # console.print(f"[warning]Failed to process file: {e}[/warning]")
-                            continue
-                        finally:
-                            progress.advance(task)
-
-                        if result:
-                            fp, stat, units, blocks = result
-                            cache.put_file_entry(fp, stat, units, blocks)
-                            changed_files_count += 1
-                            all_units.extend([u.__dict__ for u in units])
-                            all_blocks.extend([b.__dict__ for b in blocks])
+
+                        # Process in batches
+                        for i in range(0, total_files, BATCH_SIZE):
+                            batch = files_to_process[i : i + BATCH_SIZE]
+                            futures = [
+                                executor.submit(
+                                    process_file,
+                                    fp,
+                                    str(root_path),
+                                    cfg,
+                                    args.min_loc,
+                                    args.min_stmt,
+                                )
+                                for fp in batch
+                            ]
+
+                            for future in as_completed(futures):
+                                result, err = _safe_future_result(future)
+                                if result is not None:
+                                    handle_result(result)
+                                elif err is not None:
+                                    # Should rarely happen due to try/except
+                                    # in process_file.
+                                    console.print(
+                                        f"[warning]Worker failed: {err}[/warning]"
+                                    )
+                                progress.advance(task)
+        except (OSError, RuntimeError, PermissionError) as e:
+            console.print(
+                "[warning]Parallel processing unavailable, "
+                f"falling back to sequential: {e}[/warning]"
+            )
+            process_sequential(with_progress=not args.no_progress)
+
+    if failed_files:
+        console.print(
+            f"\n[warning]⚠ {len(failed_files)} files failed to process:[/warning]"
+        )
+        for failure in failed_files[:10]:
+            console.print(f"  • {failure}")
+        if len(failed_files) > 10:
+            console.print(f"  ... and {len(failed_files) - 10} more")
 
     # Analysis phase
     with console.status("[bold green]Grouping clones...", spinner="dots"):
         func_groups = build_groups(all_units)
         block_groups = build_block_groups(all_blocks)
-    cache.save()
+    try:
+        cache.save()
+    except CacheError as e:
+        console.print(f"[warning]Failed to save cache: {e}[/warning]")
 
     # Reporting
     func_clones_count = len(func_groups)
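
The load-bearing change in this long hunk is batching: futures are now submitted `BATCH_SIZE` files at a time rather than all at once, so pending futures and their pickled results are bounded by one batch instead of by the whole repository. The pattern in isolation, with generic names (none of this is codeclone API):

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def run_batched(worker, items, batch_size=100, max_workers=None):
        """Submit work in fixed-size waves; at most batch_size futures live at once."""
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for i in range(0, len(items), batch_size):
                futures = [executor.submit(worker, it) for it in items[i : i + batch_size]]
                for future in as_completed(futures):
                    yield future.result()  # re-raises any worker exception

The trade-off is mild: workers can idle at a batch boundary while the last stragglers finish, which is usually acceptable against unbounded memory growth. As with `process_file` above, `worker` must be a picklable top-level function.
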
@@ -300,24 +472,45 @@ def main() -> None:
     # Baseline Logic
     baseline_path = Path(args.baseline).expanduser().resolve()
 
-    # If user didn't specify path, and default logic applies, baseline_path is now ./codeclone_baseline.json
+    # If user didn't specify path and default logic applies, baseline_path
+    # is now ./codeclone_baseline.json
 
     baseline = Baseline(baseline_path)
     baseline_exists = baseline_path.exists()
 
     if baseline_exists:
         baseline.load()
+        if not args.update_baseline and baseline.python_version:
+            current_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            if baseline.python_version != current_version:
+                console.print(
+                    "[warning]Baseline Python version mismatch.[/warning]\n"
+                    f"Baseline was generated with Python {baseline.python_version}.\n"
+                    f"Current interpreter: Python {current_version}."
+                )
+                if args.fail_on_new:
+                    console.print(
+                        "[error]Baseline checks require the same Python version to "
+                        "ensure deterministic results. Please regenerate the baseline "
+                        "using the current interpreter.[/error]"
+                    )
+                    sys.exit(2)
     else:
         if not args.update_baseline:
             console.print(
-                f"[warning]Baseline file not found at: [bold]{baseline_path}[/bold][/warning]\n"
+                "[warning]Baseline file not found at: [bold]"
+                f"{baseline_path}"
+                "[/bold][/warning]\n"
                 "[dim]Comparing against an empty baseline. "
                 "Use --update-baseline to create it.[/dim]"
             )
 
     if args.update_baseline:
         new_baseline = Baseline.from_groups(
-            func_groups, block_groups, path=baseline_path
+            func_groups,
+            block_groups,
+            path=baseline_path,
+            python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
         )
         new_baseline.save()
         console.print(f"[success]✔ Baseline updated:[/success] {baseline_path}")
@@ -365,7 +558,7 @@ def main() -> None:
         out = Path(args.json_out).expanduser().resolve()
         out.parent.mkdir(parents=True, exist_ok=True)
         out.write_text(
-            to_json({"functions": func_groups, "blocks": block_groups}),
+            to_json_report(func_groups, block_groups),
             "utf-8",
         )
         console.print(f"[info]JSON report saved:[/info] {out}")
@@ -392,8 +585,9 @@ def main() -> None:
         sys.exit(3)
 
     if 0 <= args.fail_threshold < (func_clones_count + block_clones_count):
+        total = func_clones_count + block_clones_count
         console.print(
-            f"\n[error]❌ FAILED: Total clones ({func_clones_count + block_clones_count}) "
+            f"\n[error]❌ FAILED: Total clones ({total}) "
             f"exceed threshold ({args.fail_threshold})![/error]"
         )
         sys.exit(2)
codeclone/errors.py ADDED
@@ -0,0 +1,27 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
+
+
+class CodeCloneError(Exception):
+    """Base exception for CodeClone."""
+
+
+class FileProcessingError(CodeCloneError):
+    """Error processing a source file."""
+
+
+class ParseError(FileProcessingError):
+    """AST parsing failed."""
+
+
+class ValidationError(CodeCloneError):
+    """Input validation failed."""
+
+
+class CacheError(CodeCloneError):
+    """Cache operation failed."""