ai-codeindex 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeindex/tech_debt.py ADDED
@@ -0,0 +1,619 @@
+ """Technical debt detection for code analysis.
+
+ This module provides tools to detect and analyze technical debt in codebases,
+ including file size issues, God Classes, and code quality metrics.
+ """
+
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from enum import IntEnum
+ from pathlib import Path
+
+ from codeindex.config import Config
+ from codeindex.parser import ParseResult
+ from codeindex.symbol_scorer import SymbolImportanceScorer
+
+
+ class DebtSeverity(IntEnum):
+     """Severity levels for technical debt issues.
+
+     Lower values indicate higher severity (CRITICAL is most severe).
+     """
+
+     CRITICAL = 1  # Must fix: super large files, God Classes
+     HIGH = 2  # Should fix: large files, complex methods
+     MEDIUM = 3  # Consider fixing: moderate issues
+     LOW = 4  # Nice to fix: minor issues
+
+
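Because DebtSeverity is an IntEnum whose lower values mean higher severity, a plain ascending sort orders issues most-severe-first. A minimal standalone sketch (re-declaring the enum so it runs on its own):

from enum import IntEnum

class DebtSeverity(IntEnum):
    CRITICAL = 1
    HIGH = 2
    MEDIUM = 3
    LOW = 4

# Ascending sort puts the most severe entries first.
found = [DebtSeverity.LOW, DebtSeverity.CRITICAL, DebtSeverity.HIGH]
print(sorted(found))  # CRITICAL, HIGH, LOW
assert DebtSeverity.CRITICAL < DebtSeverity.LOW  # lower value == more severe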
+ @dataclass
+ class DebtIssue:
+     """Represents a technical debt issue detected in code.
+
+     Attributes:
+         severity: The severity level of the issue
+         category: Category identifier (e.g., "super_large_file", "god_class")
+         file_path: Path to the file containing the issue
+         metric_value: Actual measured value (e.g., line count, method count)
+         threshold: The threshold value that was exceeded
+         description: Human-readable description of the issue
+         suggestion: Actionable suggestion for fixing the issue
+     """
+
+     severity: DebtSeverity
+     category: str
+     file_path: Path
+     metric_value: float
+     threshold: float
+     description: str
+     suggestion: str
+
+
+ @dataclass
+ class DebtAnalysisResult:
+     """Result of analyzing a file for technical debt.
+
+     Attributes:
+         issues: List of detected technical debt issues
+         quality_score: Overall code quality score (0-100, higher is better)
+         file_path: Path to the analyzed file
+         file_lines: Number of lines in the file
+         total_symbols: Total number of symbols in the file
+     """
+
+     issues: list[DebtIssue] = field(default_factory=list)
+     quality_score: float = 100.0
+     file_path: Path = Path()
+     file_lines: int = 0
+     total_symbols: int = 0
+
+
+ @dataclass
+ class SymbolOverloadAnalysis:
+     """Analysis result of symbol overload detection.
+
+     Attributes:
+         total_symbols: Total number of symbols in the file
+         filtered_symbols: Number of high-quality symbols kept after filtering
+         filter_ratio: Fraction of symbols filtered out as noise (0.0 to 1.0)
+         noise_breakdown: Dictionary categorizing noise sources
+             Keys: "getters_setters", "private_methods", "magic_methods", "other"
+             Values: Count of symbols in each category
+         quality_score: Symbol quality score (0-100, higher is better)
+             Based on filter ratio and noise breakdown
+     """
+
+     total_symbols: int = 0
+     filtered_symbols: int = 0
+     filter_ratio: float = 0.0
+     noise_breakdown: dict[str, int] = field(default_factory=dict)
+     quality_score: float = 100.0
+
+
+ @dataclass
+ class FileReport:
+     """Report for a single file's technical debt analysis.
+
+     Attributes:
+         file_path: Path to the analyzed file
+         debt_analysis: DebtAnalysisResult for the file
+         symbol_analysis: Optional SymbolOverloadAnalysis for the file
+         total_issues: Total number of issues detected (computed property)
+     """
+
+     file_path: Path
+     debt_analysis: DebtAnalysisResult
+     symbol_analysis: SymbolOverloadAnalysis | None = None
+
+     @property
+     def total_issues(self) -> int:
+         """Calculate total number of issues."""
+         return len(self.debt_analysis.issues)
+
+
+ @dataclass
+ class TechDebtReport:
+     """Aggregate report for technical debt across multiple files.
+
+     Attributes:
+         file_reports: List of FileReport for each analyzed file
+         total_files: Total number of files analyzed
+         total_issues: Total number of issues across all files
+         critical_issues: Count of CRITICAL severity issues
+         high_issues: Count of HIGH severity issues
+         medium_issues: Count of MEDIUM severity issues
+         low_issues: Count of LOW severity issues
+         average_quality_score: Average quality score across all files
+     """
+
+     file_reports: list[FileReport] = field(default_factory=list)
+     total_files: int = 0
+     total_issues: int = 0
+     critical_issues: int = 0
+     high_issues: int = 0
+     medium_issues: int = 0
+     low_issues: int = 0
+     average_quality_score: float = 100.0
+
+
+ class TechDebtReporter:
+     """Reporter for aggregating technical debt analysis across multiple files.
+
+     This class collects analysis results from multiple files and generates
+     aggregate reports with overall statistics.
+     """
+
+     def __init__(self):
+         """Initialize the reporter."""
+         self._file_reports: list[FileReport] = []
+
+     def add_file_result(
+         self,
+         file_path: Path,
+         debt_analysis: DebtAnalysisResult,
+         symbol_analysis: SymbolOverloadAnalysis | None = None,
+     ):
+         """Add a file analysis result to the reporter.
+
+         Args:
+             file_path: Path to the analyzed file
+             debt_analysis: DebtAnalysisResult for the file
+             symbol_analysis: Optional SymbolOverloadAnalysis for the file
+         """
+         file_report = FileReport(
+             file_path=file_path,
+             debt_analysis=debt_analysis,
+             symbol_analysis=symbol_analysis,
+         )
+         self._file_reports.append(file_report)
+
+     def generate_report(self) -> TechDebtReport:
+         """Generate aggregate report from all collected file results.
+
+         Returns:
+             TechDebtReport with aggregated statistics
+         """
+         if not self._file_reports:
+             return TechDebtReport()
+
+         # Aggregate statistics
+         total_files = len(self._file_reports)
+         total_issues = 0
+         critical_issues = 0
+         high_issues = 0
+         medium_issues = 0
+         low_issues = 0
+         total_quality_score = 0.0
+
+         for file_report in self._file_reports:
+             total_issues += file_report.total_issues
+             total_quality_score += file_report.debt_analysis.quality_score
+
+             # Count issues by severity
+             for issue in file_report.debt_analysis.issues:
+                 if issue.severity == DebtSeverity.CRITICAL:
+                     critical_issues += 1
+                 elif issue.severity == DebtSeverity.HIGH:
+                     high_issues += 1
+                 elif issue.severity == DebtSeverity.MEDIUM:
+                     medium_issues += 1
+                 elif issue.severity == DebtSeverity.LOW:
+                     low_issues += 1
+
+         average_quality_score = total_quality_score / total_files
+
+         return TechDebtReport(
+             file_reports=self._file_reports,
+             total_files=total_files,
+             total_issues=total_issues,
+             critical_issues=critical_issues,
+             high_issues=high_issues,
+             medium_issues=medium_issues,
+             low_issues=low_issues,
+             average_quality_score=average_quality_score,
+         )
+
+
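A minimal usage sketch for the reporter; the file paths and scores below are hypothetical stand-ins for results that would normally come from TechDebtDetector.analyze_file():

from pathlib import Path

reporter = TechDebtReporter()
reporter.add_file_result(
    file_path=Path("src/models.py"),  # hypothetical file
    debt_analysis=DebtAnalysisResult(quality_score=70.0, file_lines=2400),
)
reporter.add_file_result(
    file_path=Path("src/utils.py"),  # hypothetical file
    debt_analysis=DebtAnalysisResult(quality_score=100.0, file_lines=150),
)

report = reporter.generate_report()
print(report.total_files)            # 2
print(report.average_quality_score)  # (70.0 + 100.0) / 2 == 85.0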
+ class TechDebtDetector:
+     """Detector for technical debt in code.
+
+     This class analyzes parsed code to identify technical debt issues
+     such as oversized files, God Classes, and code quality problems.
+
+     Attributes:
+         config: Configuration object
+         classifier: Unified file size classifier (Epic 4 refactoring)
+         GOD_CLASS_METHODS: Threshold for God Class detection (>50 methods)
+         MASSIVE_SYMBOL_COUNT: Threshold for massive symbol count (>100)
+         HIGH_NOISE_RATIO: High noise ratio threshold (>0.5)
+     """
+
+     GOD_CLASS_METHODS = 50  # Methods per class
+     MASSIVE_SYMBOL_COUNT = 100  # Total symbols
+     HIGH_NOISE_RATIO = 0.5  # 50% filter ratio
+
+     def __init__(self, config: Config):
+         """Initialize the technical debt detector.
+
+         Args:
+             config: Configuration object
+         """
+         self.config = config
+         # Use unified FileSizeClassifier (Epic 4 Story 4.2)
+         from codeindex.file_classifier import FileSizeClassifier
+
+         self.classifier = FileSizeClassifier(config)
+
+     def analyze_file(
+         self, parse_result: ParseResult, scorer: SymbolImportanceScorer
+     ) -> DebtAnalysisResult:
+         """Analyze a file for technical debt.
+
+         Args:
+             parse_result: The parsed file to analyze
+             scorer: Symbol importance scorer for quality analysis
+
+         Returns:
+             DebtAnalysisResult containing detected issues and quality score
+         """
+         issues: list[DebtIssue] = []
+
+         # Detect file-level issues
+         issues.extend(self._detect_file_size_issues(parse_result))
+
+         # Detect class-level issues (God Class)
+         issues.extend(self._detect_god_class(parse_result))
+
+         # Calculate quality score based on issues
+         quality_score = self._calculate_quality_score(parse_result, issues)
+
+         return DebtAnalysisResult(
+             issues=issues,
+             quality_score=quality_score,
+             file_path=parse_result.path,
+             file_lines=parse_result.file_lines,
+             total_symbols=len(parse_result.symbols),
+         )
+
+     def _detect_file_size_issues(self, parse_result: ParseResult) -> list[DebtIssue]:
+         """Detect file size related technical debt.
+
+         Uses the unified FileSizeClassifier for consistent detection (Epic 4 refactoring).
+
+         Args:
+             parse_result: The parsed file to analyze
+
+         Returns:
+             List of DebtIssue for file size problems
+         """
+         from codeindex.file_classifier import FileSizeCategory
+
+         issues: list[DebtIssue] = []
+         analysis = self.classifier.classify(parse_result)
+         lines = parse_result.file_lines
+
+         if analysis.category == FileSizeCategory.SUPER_LARGE:
+             # Use classifier thresholds
+             threshold = self.classifier.super_large_lines
+             issues.append(
+                 DebtIssue(
+                     severity=DebtSeverity.CRITICAL,
+                     category="super_large_file",
+                     file_path=parse_result.path,
+                     metric_value=lines,
+                     threshold=threshold,
+                     description=f"File has {lines} lines (threshold: {threshold})",
+                     suggestion="Split into 3-5 smaller files by responsibility",
+                 )
+             )
+         elif analysis.category == FileSizeCategory.LARGE:
+             # Large-file threshold (mirrors the classifier's 2000-line cutoff)
+             threshold = 2000
+             issues.append(
+                 DebtIssue(
+                     severity=DebtSeverity.HIGH,
+                     category="large_file",
+                     file_path=parse_result.path,
+                     metric_value=lines,
+                     threshold=threshold,
+                     description=f"File has {lines} lines (threshold: {threshold})",
+                     suggestion="Consider splitting into 2-3 smaller modules",
+                 )
+             )
+
+         return issues
+
+     def _detect_god_class(self, parse_result: ParseResult) -> list[DebtIssue]:
+         """Detect God Class anti-pattern.
+
+         A God Class is a class with too many responsibilities, indicated by
+         having an excessive number of methods.
+
+         Args:
+             parse_result: The parsed file to analyze
+
+         Returns:
+             List of DebtIssue for God Class problems
+         """
+         issues: list[DebtIssue] = []
+
+         # Group methods by class name
+         class_methods: dict[str, list] = defaultdict(list)
+         for symbol in parse_result.symbols:
+             if symbol.kind == "method":
+                 # Extract class name from method name.
+                 # Supports both PHP (ClassName::methodName) and Python (ClassName.methodName)
+                 if "::" in symbol.name:
+                     class_name = symbol.name.split("::")[0]
+                 elif "." in symbol.name and not symbol.name.startswith("_"):
+                     class_name = symbol.name.split(".")[0]
+                 else:
+                     continue  # Not a class method
+
+                 class_methods[class_name].append(symbol)
+
+         # Check each class for too many methods
+         for class_name, methods in class_methods.items():
+             method_count = len(methods)
+             if method_count > self.GOD_CLASS_METHODS:
+                 suggested_split_count = max(3, method_count // 20)
+                 issues.append(
+                     DebtIssue(
+                         severity=DebtSeverity.CRITICAL,
+                         category="god_class",
+                         file_path=parse_result.path,
+                         metric_value=method_count,
+                         threshold=self.GOD_CLASS_METHODS,
+                         description=f"Class '{class_name}' has {method_count} methods "
+                         f"(threshold: {self.GOD_CLASS_METHODS})",
+                         suggestion=f"Extract {suggested_split_count} smaller classes "
+                         "by responsibility",
+                     )
+                 )
+
+         return issues
+
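The grouping above relies on the parser emitting class-qualified method names; a small illustration of that assumption using hypothetical symbol names:

# "Order::save" (PHP style) and "Order.total" (Python style) both group
# under "Order"; unqualified names such as "helper" are skipped.
names = ["Order::save", "Order::load", "Order.total", "helper", "_internal.run"]
groups: dict[str, int] = {}
for name in names:
    if "::" in name:
        cls = name.split("::")[0]
    elif "." in name and not name.startswith("_"):
        cls = name.split(".")[0]
    else:
        continue
    groups[cls] = groups.get(cls, 0) + 1
print(groups)  # {'Order': 3}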
+     def _calculate_quality_score(
+         self, parse_result: ParseResult, issues: list[DebtIssue]
+     ) -> float:
+         """Calculate overall code quality score.
+
+         Starts with 100 points and deducts based on issue severity:
+         - CRITICAL: -30 points
+         - HIGH: -15 points
+         - MEDIUM: -5 points
+         - LOW: -2 points
+
+         Args:
+             parse_result: The parsed file
+             issues: List of detected issues
+
+         Returns:
+             Quality score (0-100, higher is better)
+         """
+         score = 100.0
+
+         for issue in issues:
+             if issue.severity == DebtSeverity.CRITICAL:
+                 score -= 30
+             elif issue.severity == DebtSeverity.HIGH:
+                 score -= 15
+             elif issue.severity == DebtSeverity.MEDIUM:
+                 score -= 5
+             elif issue.severity == DebtSeverity.LOW:
+                 score -= 2
+
+         # Ensure score doesn't go below 0
+         return max(0.0, score)
+
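Worked example of the deduction scheme: a file with one CRITICAL and one HIGH issue scores 100 - 30 - 15 = 55.0. The same arithmetic in isolation:

DEDUCTIONS = {
    DebtSeverity.CRITICAL: 30,
    DebtSeverity.HIGH: 15,
    DebtSeverity.MEDIUM: 5,
    DebtSeverity.LOW: 2,
}
found = [DebtSeverity.CRITICAL, DebtSeverity.HIGH]
score = max(0.0, 100.0 - sum(DEDUCTIONS[s] for s in found))
print(score)  # 55.0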
+     def analyze_symbol_overload(
+         self, parse_result: ParseResult, scorer: SymbolImportanceScorer
+     ) -> tuple[list[DebtIssue], SymbolOverloadAnalysis]:
+         """Analyze symbol overload issues.
+
+         Detects:
+         - Massive symbol count (>100 symbols)
+         - High noise ratio (>50% filtered)
+         - Data Class smell (>66% getters/setters)
+
+         Args:
+             parse_result: The parsed file to analyze
+             scorer: Symbol importance scorer for quality analysis
+
+         Returns:
+             Tuple of (issues list, SymbolOverloadAnalysis)
+         """
+         issues: list[DebtIssue] = []
+         total_symbols = len(parse_result.symbols)
+
+         # Detect massive symbol count
+         if total_symbols > self.MASSIVE_SYMBOL_COUNT:
+             issues.append(
+                 DebtIssue(
+                     severity=DebtSeverity.CRITICAL,
+                     category="massive_symbol_count",
+                     file_path=parse_result.path,
+                     metric_value=total_symbols,
+                     threshold=self.MASSIVE_SYMBOL_COUNT,
+                     description=f"File has {total_symbols} symbols "
+                     f"(threshold: {self.MASSIVE_SYMBOL_COUNT})",
+                     suggestion="Split into multiple modules to reduce cognitive load",
+                 )
+             )
+
+         # Score all symbols and filter
+         scored_symbols = []
+         for symbol in parse_result.symbols:
+             score = scorer.score(symbol)
+             scored_symbols.append((symbol, score))
+
+         # Use standard threshold for filtering (30.0 is a reasonable cutoff)
+         threshold = 30.0
+         filtered_symbols = [s for s, score in scored_symbols if score >= threshold]
+
+         # Calculate metrics
+         filtered_count = len(filtered_symbols)
+         filter_ratio = (
+             1.0 - (filtered_count / total_symbols) if total_symbols > 0 else 0.0
+         )
+
+         # Analyze noise breakdown
+         noise_breakdown = self._analyze_noise_breakdown(
+             parse_result.symbols, filtered_symbols
+         )
+
+         # Detect high noise ratio
+         if filter_ratio > self.HIGH_NOISE_RATIO:
+             noise_description = self._format_noise_description(noise_breakdown)
+             issues.append(
+                 DebtIssue(
+                     severity=DebtSeverity.HIGH,
+                     category="low_quality_symbols",
+                     file_path=parse_result.path,
+                     metric_value=filter_ratio,
+                     threshold=self.HIGH_NOISE_RATIO,
+                     description=f"High symbol noise ratio: {filter_ratio * 100:.1f}% "
+                     f"({total_symbols - filtered_count}/{total_symbols} symbols filtered). "
+                     f"{noise_description}",
+                     suggestion=self._suggest_noise_reduction(noise_breakdown),
+                 )
+             )
+
+         # Calculate quality score
+         quality_score = self._calculate_symbol_quality_score(
+             total_symbols, filtered_count, noise_breakdown
+         )
+
+         analysis = SymbolOverloadAnalysis(
+             total_symbols=total_symbols,
+             filtered_symbols=filtered_count,
+             filter_ratio=filter_ratio,
+             noise_breakdown=noise_breakdown,
+             quality_score=quality_score,
+         )
+
+         return issues, analysis
+
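Worked example of the noise-ratio check, with hypothetical counts: given 100 symbols of which 40 score at or above the 30.0 cutoff, filter_ratio = 1.0 - 40/100 = 0.6, which exceeds HIGH_NOISE_RATIO (0.5) and raises a "low_quality_symbols" issue:

total_symbols = 100   # hypothetical
filtered_count = 40   # symbols scoring >= 30.0
filter_ratio = 1.0 - (filtered_count / total_symbols)
print(filter_ratio)        # 0.6
print(filter_ratio > 0.5)  # True -> HIGH severity "low_quality_symbols"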
+     def _analyze_noise_breakdown(
+         self, all_symbols: list, filtered_symbols: list
+     ) -> dict[str, int]:
+         """Analyze and categorize noise sources.
+
+         Args:
+             all_symbols: All symbols in the file
+             filtered_symbols: High-quality symbols after filtering
+
+         Returns:
+             Dictionary with noise categories and counts
+         """
+         # Get filtered symbol names for quick lookup
+         filtered_names = {s.name for s in filtered_symbols}
+
+         # Categorize noise
+         breakdown = {
+             "getters_setters": 0,
+             "private_methods": 0,
+             "magic_methods": 0,
+             "other": 0,
+         }
+
+         for symbol in all_symbols:
+             if symbol.name in filtered_names:
+                 continue  # Skip high-quality symbols
+
+             # Categorize this noise symbol
+             if symbol.name.startswith(("get", "set")) and len(symbol.name) > 3:
+                 # Simple getter/setter pattern
+                 breakdown["getters_setters"] += 1
+             elif symbol.name.startswith("_") and not symbol.name.startswith("__"):
+                 # Private method (single underscore)
+                 breakdown["private_methods"] += 1
+             elif symbol.name.startswith("__") and symbol.name.endswith("__"):
+                 # Magic method
+                 breakdown["magic_methods"] += 1
+             else:
+                 breakdown["other"] += 1
+
+         return breakdown
+
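A standalone copy of the classification rules above, applied to a few hypothetical names to show which bucket each lands in:

def categorize(name: str) -> str:
    # Same branch order as _analyze_noise_breakdown above.
    if name.startswith(("get", "set")) and len(name) > 3:
        return "getters_setters"
    if name.startswith("_") and not name.startswith("__"):
        return "private_methods"
    if name.startswith("__") and name.endswith("__"):
        return "magic_methods"
    return "other"

for n in ["getName", "setAge", "_cache", "__init__", "run"]:
    print(n, "->", categorize(n))
# getName -> getters_setters, setAge -> getters_setters,
# _cache -> private_methods, __init__ -> magic_methods, run -> other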
+     def _format_noise_description(self, noise_breakdown: dict[str, int]) -> str:
+         """Format noise breakdown into readable description.
+
+         Args:
+             noise_breakdown: Dictionary of noise categories and counts
+
+         Returns:
+             Human-readable description
+         """
+         parts = []
+         if noise_breakdown.get("getters_setters", 0) > 0:
+             parts.append(f"{noise_breakdown['getters_setters']} getters/setters")
+         if noise_breakdown.get("private_methods", 0) > 0:
+             parts.append(f"{noise_breakdown['private_methods']} private methods")
+         if noise_breakdown.get("magic_methods", 0) > 0:
+             parts.append(f"{noise_breakdown['magic_methods']} magic methods")
+         if noise_breakdown.get("other", 0) > 0:
+             parts.append(f"{noise_breakdown['other']} other low-quality symbols")
+
+         return ("Breakdown: " + ", ".join(parts)) if parts else "No breakdown available"
+
+     def _suggest_noise_reduction(self, noise_breakdown: dict[str, int]) -> str:
+         """Generate suggestions for reducing symbol noise.
+
+         Args:
+             noise_breakdown: Dictionary of noise categories and counts
+
+         Returns:
+             Actionable suggestion
+         """
+         getters_setters = noise_breakdown.get("getters_setters", 0)
+         total_noise = sum(noise_breakdown.values())
+
+         if getters_setters > total_noise * 0.66:
+             # Data Class smell
+             return (
+                 "Data Class smell detected: >66% getters/setters. "
+                 "Consider using DTOs, value objects, or applying Tell Don't Ask principle"
+             )
+         elif getters_setters > 10:
+             return (
+                 "High number of getters/setters. "
+                 "Consider encapsulating data with behavior or using data classes"
+             )
+         else:
+             return (
+                 "Reduce low-quality symbols: improve method naming, "
+                 "merge helpers, or extract utilities"
+             )
+
+     def _calculate_symbol_quality_score(
+         self, total: int, filtered: int, noise_breakdown: dict[str, int]
+     ) -> float:
+         """Calculate symbol quality score.
+
+         Args:
+             total: Total symbol count
+             filtered: Filtered (high-quality) symbol count
+             noise_breakdown: Noise categorization
+
+         Returns:
+             Quality score (0-100, higher is better)
+         """
+         if total == 0:
+             return 100.0
+
+         # Base score from retention ratio
+         retention_ratio = filtered / total
+         score = retention_ratio * 100
+
+         # Penalty for getters/setters (Data Class smell)
+         getters_setters = noise_breakdown.get("getters_setters", 0)
+         if getters_setters > total * 0.5:
+             score -= 20  # Heavy penalty for Data Class
+         elif getters_setters > total * 0.3:
+             score -= 10  # Moderate penalty
+
+         # Penalty for many private methods (poor encapsulation)
+         private_methods = noise_breakdown.get("private_methods", 0)
+         if private_methods > total * 0.3:
+             score -= 10
+
+         return max(0.0, score)
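Worked example of this scoring: 100 symbols with 40 retained gives a base of 40.0; if 55 of the symbols are getters/setters (more than half), the heavy Data Class penalty lands the score at 20.0. The arithmetic in isolation, with hypothetical counts:

total, filtered = 100, 40
noise = {"getters_setters": 55, "private_methods": 5, "magic_methods": 0, "other": 0}

score = (filtered / total) * 100       # base from retention ratio: 40.0
if noise["getters_setters"] > total * 0.5:
    score -= 20                        # heavy Data Class penalty
elif noise["getters_setters"] > total * 0.3:
    score -= 10
if noise["private_methods"] > total * 0.3:
    score -= 10
print(max(0.0, score))                 # 20.0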