thailint 0.1.5__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. src/__init__.py +7 -2
  2. src/analyzers/__init__.py +23 -0
  3. src/analyzers/typescript_base.py +148 -0
  4. src/api.py +1 -1
  5. src/cli.py +1111 -144
  6. src/config.py +12 -33
  7. src/core/base.py +102 -5
  8. src/core/cli_utils.py +206 -0
  9. src/core/config_parser.py +126 -0
  10. src/core/linter_utils.py +168 -0
  11. src/core/registry.py +17 -92
  12. src/core/rule_discovery.py +132 -0
  13. src/core/violation_builder.py +122 -0
  14. src/linter_config/ignore.py +112 -40
  15. src/linter_config/loader.py +3 -13
  16. src/linters/dry/__init__.py +23 -0
  17. src/linters/dry/base_token_analyzer.py +76 -0
  18. src/linters/dry/block_filter.py +265 -0
  19. src/linters/dry/block_grouper.py +59 -0
  20. src/linters/dry/cache.py +172 -0
  21. src/linters/dry/cache_query.py +61 -0
  22. src/linters/dry/config.py +134 -0
  23. src/linters/dry/config_loader.py +44 -0
  24. src/linters/dry/deduplicator.py +120 -0
  25. src/linters/dry/duplicate_storage.py +63 -0
  26. src/linters/dry/file_analyzer.py +90 -0
  27. src/linters/dry/inline_ignore.py +140 -0
  28. src/linters/dry/linter.py +163 -0
  29. src/linters/dry/python_analyzer.py +668 -0
  30. src/linters/dry/storage_initializer.py +42 -0
  31. src/linters/dry/token_hasher.py +169 -0
  32. src/linters/dry/typescript_analyzer.py +592 -0
  33. src/linters/dry/violation_builder.py +74 -0
  34. src/linters/dry/violation_filter.py +94 -0
  35. src/linters/dry/violation_generator.py +174 -0
  36. src/linters/file_header/__init__.py +24 -0
  37. src/linters/file_header/atemporal_detector.py +87 -0
  38. src/linters/file_header/config.py +66 -0
  39. src/linters/file_header/field_validator.py +69 -0
  40. src/linters/file_header/linter.py +313 -0
  41. src/linters/file_header/python_parser.py +86 -0
  42. src/linters/file_header/violation_builder.py +78 -0
  43. src/linters/file_placement/config_loader.py +86 -0
  44. src/linters/file_placement/directory_matcher.py +80 -0
  45. src/linters/file_placement/linter.py +262 -471
  46. src/linters/file_placement/path_resolver.py +61 -0
  47. src/linters/file_placement/pattern_matcher.py +55 -0
  48. src/linters/file_placement/pattern_validator.py +106 -0
  49. src/linters/file_placement/rule_checker.py +229 -0
  50. src/linters/file_placement/violation_factory.py +177 -0
  51. src/linters/magic_numbers/__init__.py +48 -0
  52. src/linters/magic_numbers/config.py +82 -0
  53. src/linters/magic_numbers/context_analyzer.py +247 -0
  54. src/linters/magic_numbers/linter.py +516 -0
  55. src/linters/magic_numbers/python_analyzer.py +76 -0
  56. src/linters/magic_numbers/typescript_analyzer.py +218 -0
  57. src/linters/magic_numbers/violation_builder.py +98 -0
  58. src/linters/nesting/__init__.py +6 -2
  59. src/linters/nesting/config.py +17 -4
  60. src/linters/nesting/linter.py +81 -168
  61. src/linters/nesting/typescript_analyzer.py +39 -102
  62. src/linters/nesting/typescript_function_extractor.py +130 -0
  63. src/linters/nesting/violation_builder.py +139 -0
  64. src/linters/print_statements/__init__.py +53 -0
  65. src/linters/print_statements/config.py +83 -0
  66. src/linters/print_statements/linter.py +430 -0
  67. src/linters/print_statements/python_analyzer.py +155 -0
  68. src/linters/print_statements/typescript_analyzer.py +135 -0
  69. src/linters/print_statements/violation_builder.py +98 -0
  70. src/linters/srp/__init__.py +99 -0
  71. src/linters/srp/class_analyzer.py +113 -0
  72. src/linters/srp/config.py +82 -0
  73. src/linters/srp/heuristics.py +89 -0
  74. src/linters/srp/linter.py +234 -0
  75. src/linters/srp/metrics_evaluator.py +47 -0
  76. src/linters/srp/python_analyzer.py +72 -0
  77. src/linters/srp/typescript_analyzer.py +75 -0
  78. src/linters/srp/typescript_metrics_calculator.py +90 -0
  79. src/linters/srp/violation_builder.py +117 -0
  80. src/orchestrator/core.py +54 -9
  81. src/templates/thailint_config_template.yaml +158 -0
  82. src/utils/__init__.py +4 -0
  83. src/utils/project_root.py +203 -0
  84. thailint-0.5.0.dist-info/METADATA +1286 -0
  85. thailint-0.5.0.dist-info/RECORD +96 -0
  86. {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info}/WHEEL +1 -1
  87. src/.ai/layout.yaml +0 -48
  88. thailint-0.1.5.dist-info/METADATA +0 -629
  89. thailint-0.1.5.dist-info/RECORD +0 -28
  90. {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info}/entry_points.txt +0 -0
  91. {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info/licenses}/LICENSE +0 -0
@@ -5,18 +5,18 @@ Scope: Multi-level ignore system across repository, directory, file, method, and
5
5
 
6
6
  Overview: Implements a sophisticated ignore directive system that allows developers to suppress
7
7
  linting violations at five different granularity levels, from entire repository patterns down
8
- to individual lines of code. Repository level uses .thailintignore file with gitignore-style
9
- glob patterns for excluding files like build artifacts and dependencies. File level scans the
10
- first 10 lines for ignore-file directives (performance optimization). Method level supports
11
- ignore-next-line directives placed before functions. Line level enables inline ignore comments
12
- at the end of code lines. All levels support rule-specific ignores using bracket syntax
13
- [rule-id] and wildcard rule matching (literals.* matches literals.magic-number). The
14
- should_ignore_violation() method provides unified checking across all levels, integrating
8
+ to individual lines of code. Repository level uses global ignore patterns from .thailint.yaml
9
+ with gitignore-style glob patterns for excluding files like build artifacts and dependencies.
10
+ File level scans the first 10 lines for ignore-file directives (performance optimization).
11
+ Method level supports ignore-next-line directives placed before functions. Line level enables
12
+ inline ignore comments at the end of code lines. All levels support rule-specific ignores
13
+ using bracket syntax [rule-id] and wildcard rule matching (literals.* matches literals.magic-number).
14
+ The should_ignore_violation() method provides unified checking across all levels, integrating
15
15
  with the violation reporting system to filter out suppressed violations before displaying
16
16
  results to users.
17
17
 
18
18
  Dependencies: fnmatch for gitignore-style pattern matching, re for regex-based directive parsing,
19
- pathlib for file operations, Violation type for violation checking
19
+ pathlib for file operations, Violation type for violation checking, yaml for config loading
20
20
 
21
21
  Exports: IgnoreDirectiveParser class
22
22
 
@@ -25,9 +25,9 @@ Interfaces: is_ignored(file_path: Path) -> bool for repo-level checking,
25
25
  has_line_ignore(code: str, line_num: int, rule_id: str | None) -> bool for line-level,
26
26
  should_ignore_violation(violation: Violation, file_content: str) -> bool for unified checking
27
27
 
28
- Implementation: Gitignore-style pattern matching with fnmatch, first-10-lines scanning for
29
- performance, regex-based directive parsing with rule ID extraction, wildcard rule matching
30
- with prefix comparison, graceful error handling for malformed directives
28
+ Implementation: Gitignore-style pattern matching with fnmatch, YAML config loading for global patterns,
29
+ first-10-lines scanning for performance, regex-based directive parsing with rule ID extraction,
30
+ wildcard rule matching with prefix comparison, graceful error handling for malformed directives
31
31
  """
32
32
 
33
33
  import fnmatch
@@ -35,6 +35,8 @@ import re
35
35
  from pathlib import Path
36
36
  from typing import TYPE_CHECKING
37
37
 
38
+ import yaml
39
+
38
40
  if TYPE_CHECKING:
39
41
  from src.core.types import Violation
40
42
 
@@ -56,22 +58,58 @@ class IgnoreDirectiveParser:
56
58
  self.repo_patterns = self._load_repo_ignores()
57
59
 
58
60
  def _load_repo_ignores(self) -> list[str]:
59
- """Load .thailintignore file patterns.
61
+ """Load global ignore patterns from .thailintignore or .thailint.yaml."""
62
+ # First, try to load from .thailintignore (gitignore-style)
63
+ thailintignore = self.project_root / ".thailintignore"
64
+ if thailintignore.exists():
65
+ return self._parse_thailintignore_file(thailintignore)
66
+
67
+ # Fall back to .thailint.yaml
68
+ config_file = self.project_root / ".thailint.yaml"
69
+ if config_file.exists():
70
+ return self._parse_config_file(config_file)
71
+
72
+ return []
73
+
74
+ def _parse_thailintignore_file(self, ignore_file: Path) -> list[str]:
75
+ """Parse .thailintignore file (gitignore-style).
76
+
77
+ Args:
78
+ ignore_file: Path to .thailintignore file
60
79
 
61
80
  Returns:
62
- List of gitignore-style patterns.
81
+ List of ignore patterns
63
82
  """
64
- ignore_file = self.project_root / ".thailintignore"
65
- if not ignore_file.exists():
83
+ try:
84
+ content = ignore_file.read_text(encoding="utf-8")
85
+ patterns = []
86
+ for line in content.splitlines():
87
+ line = line.strip()
88
+ # Skip empty lines and comments
89
+ if line and not line.startswith("#"):
90
+ patterns.append(line)
91
+ return patterns
92
+ except (OSError, UnicodeDecodeError):
66
93
  return []
67
94
 
68
- patterns = []
69
- for line in ignore_file.read_text(encoding="utf-8").splitlines():
70
- line = line.strip()
71
- # Skip comments and blank lines
72
- if line and not line.startswith("#"):
73
- patterns.append(line)
74
- return patterns
95
+ def _parse_config_file(self, config_file: Path) -> list[str]:
96
+ """Parse YAML config file and extract ignore patterns."""
97
+ try:
98
+ config = yaml.safe_load(config_file.read_text(encoding="utf-8"))
99
+ return self._extract_ignore_patterns(config)
100
+ except (yaml.YAMLError, OSError, UnicodeDecodeError):
101
+ return []
102
+
103
+ @staticmethod
104
+ def _extract_ignore_patterns(config: dict | None) -> list[str]:
105
+ """Extract ignore patterns from config dict."""
106
+ if not config or not isinstance(config, dict):
107
+ return []
108
+
109
+ ignore_patterns = config.get("ignore", [])
110
+ if isinstance(ignore_patterns, list):
111
+ return [str(pattern) for pattern in ignore_patterns]
112
+ return []
75
113
 
76
114
  def is_ignored(self, file_path: Path) -> bool:
77
115
  """Check if file matches repository-level ignore patterns.
@@ -122,13 +160,33 @@ class IgnoreDirectiveParser:
122
160
 
123
161
  def _has_ignore_directive_marker(self, line: str) -> bool:
124
162
  """Check if line contains an ignore directive marker."""
125
- return "# thailint: ignore-file" in line or "# design-lint: ignore-file" in line
163
+ line_lower = line.lower()
164
+ return "# thailint: ignore-file" in line_lower or "# design-lint: ignore-file" in line_lower
126
165
 
127
166
  def _check_specific_rule_ignore(self, line: str, rule_id: str) -> bool:
128
167
  """Check if line ignores a specific rule."""
129
- match = re.search(r"ignore-file\[([^\]]+)\]", line)
130
- if match:
131
- ignored_rules = [r.strip() for r in match.group(1).split(",")]
168
+ # Check for bracket syntax: # thailint: ignore-file[rule1, rule2]
169
+ if self._check_bracket_syntax_file_ignore(line, rule_id):
170
+ return True
171
+
172
+ # Check for space-separated syntax: # thailint: ignore-file rule1 rule2
173
+ return self._check_space_syntax_file_ignore(line, rule_id)
174
+
175
+ def _check_bracket_syntax_file_ignore(self, line: str, rule_id: str) -> bool:
176
+ """Check bracket syntax for file-level ignore."""
177
+ bracket_match = re.search(r"ignore-file\[([^\]]+)\]", line, re.IGNORECASE)
178
+ if bracket_match:
179
+ ignored_rules = [r.strip() for r in bracket_match.group(1).split(",")]
180
+ return any(self._rule_matches(rule_id, r) for r in ignored_rules)
181
+ return False
182
+
183
+ def _check_space_syntax_file_ignore(self, line: str, rule_id: str) -> bool:
184
+ """Check space-separated syntax for file-level ignore."""
185
+ space_match = re.search(r"ignore-file\s+([^\s#]+(?:\s+[^\s#]+)*)", line, re.IGNORECASE)
186
+ if space_match:
187
+ ignored_rules = [
188
+ r.strip() for r in re.split(r"[,\s]+", space_match.group(1)) if r.strip()
189
+ ]
132
190
  return any(self._rule_matches(rule_id, r) for r in ignored_rules)
133
191
  return False
134
192
 
@@ -171,27 +229,28 @@ class IgnoreDirectiveParser:
171
229
 
172
230
  def _has_line_ignore_marker(self, code: str) -> bool:
173
231
  """Check if code line has ignore marker."""
232
+ code_lower = code.lower()
174
233
  return (
175
- "# thailint: ignore" in code
176
- or "# design-lint: ignore" in code
177
- or "// thailint: ignore" in code
178
- or "// design-lint: ignore" in code
234
+ "# thailint: ignore" in code_lower
235
+ or "# design-lint: ignore" in code_lower
236
+ or "// thailint: ignore" in code_lower
237
+ or "// design-lint: ignore" in code_lower
179
238
  )
180
239
 
181
240
  def _check_specific_rule_in_line(self, code: str, rule_id: str) -> bool:
182
241
  """Check if line's ignore directive matches specific rule."""
183
242
  # Check for bracket syntax: # thailint: ignore[rule1, rule2]
184
- bracket_match = re.search(r"ignore\[([^\]]+)\]", code)
243
+ bracket_match = re.search(r"ignore\[([^\]]+)\]", code, re.IGNORECASE)
185
244
  if bracket_match:
186
245
  return self._check_bracket_rules(bracket_match.group(1), rule_id)
187
246
 
188
247
  # Check for space-separated syntax: # thailint: ignore rule1 rule2
189
- space_match = re.search(r"ignore\s+([^\s#]+(?:\s+[^\s#]+)*)", code)
248
+ space_match = re.search(r"ignore\s+([^\s#]+(?:\s+[^\s#]+)*)", code, re.IGNORECASE)
190
249
  if space_match:
191
250
  return self._check_space_separated_rules(space_match.group(1), rule_id)
192
251
 
193
252
  # No specific rules - check for "ignore-all"
194
- return "ignore-all" in code
253
+ return "ignore-all" in code.lower()
195
254
 
196
255
  def _check_bracket_rules(self, rules_text: str, rule_id: str) -> bool:
197
256
  """Check if bracketed rules match the rule ID."""
@@ -231,17 +290,21 @@ class IgnoreDirectiveParser:
231
290
  Returns:
232
291
  True if rule matches pattern.
233
292
  """
234
- if pattern.endswith("*"):
293
+ # Case-insensitive comparison
294
+ rule_id_lower = rule_id.lower()
295
+ pattern_lower = pattern.lower()
296
+
297
+ if pattern_lower.endswith("*"):
235
298
  # Wildcard match: literals.* matches literals.magic-number
236
- prefix = pattern[:-1]
237
- return rule_id.startswith(prefix)
299
+ prefix = pattern_lower[:-1]
300
+ return rule_id_lower.startswith(prefix)
238
301
 
239
302
  # Exact match
240
- if rule_id == pattern:
303
+ if rule_id_lower == pattern_lower:
241
304
  return True
242
305
 
243
306
  # Prefix match: "nesting" matches "nesting.excessive-depth"
244
- if rule_id.startswith(pattern + "."):
307
+ if rule_id_lower.startswith(pattern_lower + "."):
245
308
  return True
246
309
 
247
310
  return False
@@ -293,18 +356,27 @@ class IgnoreDirectiveParser:
293
356
  file_path = Path(violation.file_path)
294
357
 
295
358
  # Repository and file level checks
296
- if self._is_ignored_at_file_level(file_path, violation.rule_id):
359
+ if self._is_ignored_at_file_level(file_path, violation.rule_id, file_content):
297
360
  return True
298
361
 
299
362
  # Line-based checks
300
363
  return self._is_ignored_in_content(file_content, violation)
301
364
 
302
- def _is_ignored_at_file_level(self, file_path: Path, rule_id: str) -> bool:
365
+ def _is_ignored_at_file_level(self, file_path: Path, rule_id: str, file_content: str) -> bool:
303
366
  """Check repository and file level ignores."""
304
367
  if self.is_ignored(file_path):
305
368
  return True
369
+ # Check content first (for tests with in-memory content)
370
+ if self._has_file_ignore_in_content(file_content, rule_id):
371
+ return True
372
+ # Fall back to reading from disk if file exists
306
373
  return self.has_file_ignore(file_path, rule_id)
307
374
 
375
+ def _has_file_ignore_in_content(self, file_content: str, rule_id: str | None) -> bool:
376
+ """Check if file content has ignore-file directive."""
377
+ lines = file_content.splitlines()[:10] # Check first 10 lines
378
+ return any(self._check_line_for_ignore(line, rule_id) for line in lines)
379
+
308
380
  def _is_ignored_in_content(self, file_content: str, violation: "Violation") -> bool:
309
381
  """Check content-based ignores (block, line, method level)."""
310
382
  lines = file_content.splitlines()
@@ -25,11 +25,10 @@ Implementation: Extension-based format detection (.yaml/.yml vs .json), yaml.saf
25
25
  for security, empty dict handling for null YAML, ValueError for unsupported formats
26
26
  """
27
27
 
28
- import json
29
28
  from pathlib import Path
30
29
  from typing import Any
31
30
 
32
- import yaml
31
+ from src.core.config_parser import parse_config_file
33
32
 
34
33
 
35
34
  class LinterConfigLoader:
@@ -49,21 +48,12 @@ class LinterConfigLoader:
49
48
  Configuration dictionary.
50
49
 
51
50
  Raises:
52
- ValueError: If file format is unsupported.
53
- yaml.YAMLError: If YAML is malformed.
54
- json.JSONDecodeError: If JSON is malformed.
51
+ ConfigParseError: If file format is unsupported or parsing fails.
55
52
  """
56
53
  if not config_path.exists():
57
54
  return self.get_defaults()
58
55
 
59
- suffix = config_path.suffix.lower()
60
-
61
- with config_path.open(encoding="utf-8") as f:
62
- if suffix in [".yaml", ".yml"]:
63
- return yaml.safe_load(f) or {}
64
- if suffix == ".json":
65
- return json.load(f)
66
- raise ValueError(f"Unsupported config format: {suffix}")
56
+ return parse_config_file(config_path)
67
57
 
68
58
  def get_defaults(self) -> dict[str, Any]:
69
59
  """Get default configuration.
@@ -0,0 +1,23 @@
1
+ """
2
+ Purpose: DRY (Don't Repeat Yourself) linter module exports
3
+
4
+ Scope: Module-level exports for DRY linter components
5
+
6
+ Overview: Provides centralized exports for the DRY linter module components. Exposes the main
7
+ DRYRule class for duplicate code detection, configuration dataclass, and analyzer components.
8
+ Simplifies imports for consumers by providing a single import point for all DRY linter
9
+ functionality. Follows the established pattern from nesting and SRP linters.
10
+
11
+ Dependencies: linter.DRYRule, config.DRYConfig
12
+
13
+ Exports: DRYRule (main rule class), DRYConfig (configuration)
14
+
15
+ Interfaces: Module-level __all__ list defining public API
16
+
17
+ Implementation: Standard Python module with explicit exports via __all__
18
+ """
19
+
20
+ from .config import DRYConfig
21
+ from .linter import DRYRule
22
+
23
+ __all__ = ["DRYRule", "DRYConfig"]
@@ -0,0 +1,76 @@
1
+ """
2
+ Purpose: Base class for token-based duplicate code analysis
3
+
4
+ Scope: Common duplicate detection workflow for Python and TypeScript analyzers
5
+
6
+ Overview: Provides shared infrastructure for token-based duplicate code detection across different
7
+ programming languages. Implements common workflow of tokenization, rolling hash window generation,
8
+ and CodeBlock creation. Subclasses provide language-specific filtering (e.g., interface filtering
9
+ for TypeScript). Eliminates duplication between PythonDuplicateAnalyzer and TypeScriptDuplicateAnalyzer
10
+ by extracting shared analyze() method pattern and CodeBlock creation logic.
11
+
12
+ Dependencies: TokenHasher, CodeBlock, DRYConfig, pathlib.Path
13
+
14
+ Exports: BaseTokenAnalyzer class
15
+
16
+ Interfaces: BaseTokenAnalyzer.analyze(file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]
17
+
18
+ Implementation: Template method pattern with extension point for language-specific block filtering
19
+ """
20
+
21
+ from pathlib import Path
22
+
23
+ from .cache import CodeBlock
24
+ from .config import DRYConfig
25
+ from .token_hasher import TokenHasher
26
+
27
+
28
class BaseTokenAnalyzer:
    """Shared workflow for token-based duplicate detection.

    Template-method base: analyze() runs tokenization, rolling-hash window
    generation, and CodeBlock construction; language-specific subclasses
    override _should_include_block() to drop windows they consider noise
    (e.g. TypeScript interface bodies).
    """

    def __init__(self) -> None:
        """Create the analyzer with its token hasher."""
        self._hasher = TokenHasher()

    def analyze(self, file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]:
        """Produce hashed code blocks for every window that passes filtering.

        Args:
            file_path: Path to the source file being analyzed
            content: Full text of the file
            config: DRY configuration (supplies min_duplicate_lines)

        Returns:
            CodeBlock instances, one per accepted rolling-hash window
        """
        tokenized = self._hasher.tokenize(content)
        windows = self._hasher.rolling_hash(tokenized, config.min_duplicate_lines)
        return [
            CodeBlock(
                file_path=file_path,
                start_line=first_line,
                end_line=last_line,
                snippet=text,
                hash_value=digest,
            )
            for digest, first_line, last_line, text in windows
            if self._should_include_block(content, first_line, last_line)
        ]

    def _should_include_block(self, content: str, start_line: int, end_line: int) -> bool:
        """Extension point for language-specific filtering.

        The base implementation accepts every block; subclasses return False
        to exclude a window from duplicate reporting.

        Args:
            content: Full file content
            start_line: Block start line
            end_line: Block end line

        Returns:
            True to keep the block, False to filter it out
        """
        return True
@@ -0,0 +1,265 @@
1
+ """
2
+ Purpose: Extensible filter system for DRY duplicate detection
3
+
4
+ Scope: Filters out false positive duplications (API boilerplate, keyword arguments, etc.)
5
+
6
+ Overview: Provides an extensible architecture for filtering duplicate code blocks that are
7
+ not meaningful duplications. Includes base filter interface and built-in filters for
8
+ common false positive patterns like keyword-only function arguments, import groups,
9
+ and API call boilerplate. New filters can be added by subclassing BaseBlockFilter.
10
+
11
+ Dependencies: ast, re, typing
12
+
13
+ Exports: BaseBlockFilter, BlockFilterRegistry, KeywordArgumentFilter, ImportGroupFilter
14
+
15
+ Interfaces: BaseBlockFilter.should_filter(code_block, file_content) -> bool
16
+
17
+ Implementation: Strategy pattern with filter registry for extensibility
18
+ """
19
+
20
+ import ast
21
+ import re
22
+ from abc import ABC, abstractmethod
23
+ from pathlib import Path
24
+ from typing import Protocol
25
+
26
+ # Default filter threshold constants
27
+ DEFAULT_KEYWORD_ARG_THRESHOLD = 0.8
28
+
29
+
30
class CodeBlock(Protocol):
    """Structural type for hashed code blocks (mirrors cache.CodeBlock)."""

    # Where the block lives on disk.
    file_path: Path
    # 1-based inclusive line span of the block.
    start_line: int
    end_line: int
    # Raw text of the block and its rolling-hash value.
    snippet: str
    hash_value: int
38
+
39
+
40
class BaseBlockFilter(ABC):
    """Abstract interface for filters that suppress false-positive duplicates."""

    @abstractmethod
    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
        """Decide whether *block* must be dropped from duplicate reporting.

        Args:
            block: Code block to evaluate
            file_content: Full file content, for surrounding context

        Returns:
            True if the block should be filtered (not reported as duplicate)
        """

    @abstractmethod
    def get_name(self) -> str:
        """Return the filter's unique name, used for configuration and logging."""
60
+
61
+
62
class KeywordArgumentFilter(BaseBlockFilter):
    """Drops blocks that are mostly keyword arguments inside a function call.

    Targets boilerplate such as:
        message=message,
        severity=Severity.ERROR,
        suggestion=suggestion,

    which recurs naturally in builder patterns and API calls without being
    meaningful duplication.
    """

    def __init__(self, threshold: float = DEFAULT_KEYWORD_ARG_THRESHOLD):
        """Configure the filter.

        Args:
            threshold: Minimum fraction of lines (0.0-1.0) that must look
                like keyword arguments before the block is considered.
        """
        self.threshold = threshold
        # Matches: optional indent, identifier, '=', any value, optional trailing comma.
        self._kwarg_pattern = re.compile(r"^\s*\w+\s*=\s*.+,?\s*$")

    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
        """Return True when the block is keyword-arg-dominated AND sits in a call.

        Args:
            block: Code block to evaluate
            file_content: Full file content for context

        Returns:
            True if the block should be filtered
        """
        block_lines = file_content.split("\n")[block.start_line - 1 : block.end_line]
        if not block_lines:
            return False

        kwarg_count = sum(1 for candidate in block_lines if self._kwarg_pattern.match(candidate))
        if kwarg_count / len(block_lines) < self.threshold:
            return False

        # Regex alone is not enough: require the lines to be call arguments.
        return self._is_inside_function_call(block, file_content)

    def _is_inside_function_call(self, block: CodeBlock, file_content: str) -> bool:
        """Confirm the block lies inside a multiline Call node, not standalone code."""
        try:
            tree = ast.parse(file_content)
        except SyntaxError:
            return False

        return any(
            isinstance(node, ast.Call) and self._check_multiline_containment(node, block)
            for node in ast.walk(tree)
        )

    @staticmethod
    def _check_multiline_containment(node: ast.Call, block: CodeBlock) -> bool:
        """Check that the call spans multiple lines and fully encloses the block."""
        if not KeywordArgumentFilter._has_valid_line_info(node):
            return False

        # Validation above guarantees both line numbers are present;
        # type: ignore silences MyPy, which cannot see the runtime check.
        spans_lines = node.lineno < node.end_lineno  # type: ignore[operator]
        encloses_block = (
            node.lineno <= block.start_line and node.end_lineno >= block.end_line  # type: ignore[operator]
        )
        return spans_lines and encloses_block

    @staticmethod
    def _has_valid_line_info(node: ast.Call) -> bool:
        """Report whether both line-number attributes exist and are set.

        Args:
            node: AST Call node to check

        Returns:
            True if the node carries usable lineno/end_lineno values
        """
        return (
            getattr(node, "lineno", None) is not None
            and getattr(node, "end_lineno", None) is not None
        )

    def get_name(self) -> str:
        """Get filter name."""
        return "keyword_argument_filter"
158
+
159
+
160
class ImportGroupFilter(BaseBlockFilter):
    """Drops blocks consisting solely of import statements.

    Import organization often produces near-identical runs of lines that are
    not meaningful duplication.
    """

    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
        """Return True when every non-blank line in the block is an import.

        Args:
            block: Code block to evaluate
            file_content: Full file content

        Returns:
            True if the block should be filtered
        """
        selected = file_content.split("\n")[block.start_line - 1 : block.end_line]
        stripped = (raw.strip() for raw in selected)
        # Blank lines are ignored; any other non-import line keeps the block.
        return all(
            text.startswith(("import ", "from "))
            for text in stripped
            if text
        )

    def get_name(self) -> str:
        """Get filter name."""
        return "import_group_filter"
190
+
191
+
192
class BlockFilterRegistry:
    """Holds block filters and dispatches should-filter queries to enabled ones."""

    def __init__(self) -> None:
        """Start with no registered filters and nothing enabled."""
        self._filters: list[BaseBlockFilter] = []
        self._enabled_filters: set[str] = set()

    def register(self, filter_instance: BaseBlockFilter) -> None:
        """Add a filter to the registry and enable it under its own name.

        Args:
            filter_instance: Filter to register
        """
        self._filters.append(filter_instance)
        self._enabled_filters.add(filter_instance.get_name())

    def enable_filter(self, filter_name: str) -> None:
        """Turn on the named filter.

        Args:
            filter_name: Name of filter to enable
        """
        self._enabled_filters.add(filter_name)

    def disable_filter(self, filter_name: str) -> None:
        """Turn off the named filter (no-op if it was not enabled).

        Args:
            filter_name: Name of filter to disable
        """
        self._enabled_filters.discard(filter_name)

    def should_filter_block(self, block: CodeBlock, file_content: str) -> bool:
        """Ask every enabled filter whether this block should be dropped.

        Args:
            block: Code block to evaluate
            file_content: Full file content

        Returns:
            True if any enabled filter votes to filter the block out
        """
        return any(
            candidate.should_filter(block, file_content)
            for candidate in self._filters
            if candidate.get_name() in self._enabled_filters
        )

    def get_enabled_filters(self) -> list[str]:
        """List the enabled filter names.

        Returns:
            Enabled filter names in sorted order
        """
        return sorted(self._enabled_filters)
251
+
252
+
253
def create_default_registry() -> BlockFilterRegistry:
    """Build a registry pre-loaded with the built-in filters.

    Returns:
        BlockFilterRegistry with the keyword-argument and import-group
        filters registered and enabled
    """
    defaults = BlockFilterRegistry()
    defaults.register(KeywordArgumentFilter(threshold=DEFAULT_KEYWORD_ARG_THRESHOLD))
    defaults.register(ImportGroupFilter())
    return defaults