thailint-0.1.6-py3-none-any.whl → thailint-0.2.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registries, and is provided for informational purposes only.
- src/__init__.py +7 -2
- src/analyzers/__init__.py +23 -0
- src/analyzers/typescript_base.py +148 -0
- src/api.py +1 -1
- src/cli.py +498 -141
- src/config.py +6 -31
- src/core/base.py +12 -0
- src/core/cli_utils.py +206 -0
- src/core/config_parser.py +99 -0
- src/core/linter_utils.py +168 -0
- src/core/registry.py +17 -92
- src/core/rule_discovery.py +132 -0
- src/core/violation_builder.py +122 -0
- src/linter_config/ignore.py +112 -40
- src/linter_config/loader.py +3 -13
- src/linters/dry/__init__.py +23 -0
- src/linters/dry/base_token_analyzer.py +76 -0
- src/linters/dry/block_filter.py +262 -0
- src/linters/dry/block_grouper.py +59 -0
- src/linters/dry/cache.py +218 -0
- src/linters/dry/cache_query.py +61 -0
- src/linters/dry/config.py +130 -0
- src/linters/dry/config_loader.py +44 -0
- src/linters/dry/deduplicator.py +120 -0
- src/linters/dry/duplicate_storage.py +126 -0
- src/linters/dry/file_analyzer.py +127 -0
- src/linters/dry/inline_ignore.py +140 -0
- src/linters/dry/linter.py +170 -0
- src/linters/dry/python_analyzer.py +517 -0
- src/linters/dry/storage_initializer.py +51 -0
- src/linters/dry/token_hasher.py +115 -0
- src/linters/dry/typescript_analyzer.py +590 -0
- src/linters/dry/violation_builder.py +74 -0
- src/linters/dry/violation_filter.py +91 -0
- src/linters/dry/violation_generator.py +174 -0
- src/linters/file_placement/config_loader.py +86 -0
- src/linters/file_placement/directory_matcher.py +80 -0
- src/linters/file_placement/linter.py +252 -472
- src/linters/file_placement/path_resolver.py +61 -0
- src/linters/file_placement/pattern_matcher.py +55 -0
- src/linters/file_placement/pattern_validator.py +106 -0
- src/linters/file_placement/rule_checker.py +229 -0
- src/linters/file_placement/violation_factory.py +177 -0
- src/linters/nesting/config.py +13 -3
- src/linters/nesting/linter.py +76 -152
- src/linters/nesting/typescript_analyzer.py +38 -102
- src/linters/nesting/typescript_function_extractor.py +130 -0
- src/linters/nesting/violation_builder.py +139 -0
- src/linters/srp/__init__.py +99 -0
- src/linters/srp/class_analyzer.py +113 -0
- src/linters/srp/config.py +76 -0
- src/linters/srp/heuristics.py +89 -0
- src/linters/srp/linter.py +225 -0
- src/linters/srp/metrics_evaluator.py +47 -0
- src/linters/srp/python_analyzer.py +72 -0
- src/linters/srp/typescript_analyzer.py +75 -0
- src/linters/srp/typescript_metrics_calculator.py +90 -0
- src/linters/srp/violation_builder.py +117 -0
- src/orchestrator/core.py +42 -7
- src/utils/__init__.py +4 -0
- src/utils/project_root.py +84 -0
- {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/METADATA +414 -63
- thailint-0.2.0.dist-info/RECORD +75 -0
- src/.ai/layout.yaml +0 -48
- thailint-0.1.6.dist-info/RECORD +0 -28
- {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/LICENSE +0 -0
- {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/WHEEL +0 -0
- {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/entry_points.txt +0 -0
src/linter_config/loader.py
CHANGED
@@ -25,11 +25,10 @@ Implementation: Extension-based format detection (.yaml/.yml vs .json), yaml.safe_load
 for security, empty dict handling for null YAML, ValueError for unsupported formats
 """
 
-import json
 from pathlib import Path
 from typing import Any
 
-import yaml
+from src.core.config_parser import parse_config_file
 
 
 class LinterConfigLoader:
@@ -49,21 +48,12 @@ class LinterConfigLoader:
             Configuration dictionary.
 
         Raises:
-            ValueError: If file format is unsupported.
-            yaml.YAMLError: If YAML is malformed.
-            json.JSONDecodeError: If JSON is malformed.
+            ConfigParseError: If file format is unsupported or parsing fails.
         """
         if not config_path.exists():
             return self.get_defaults()
 
-        suffix = config_path.suffix.lower()
-
-        with config_path.open(encoding="utf-8") as f:
-            if suffix in [".yaml", ".yml"]:
-                return yaml.safe_load(f) or {}
-            if suffix == ".json":
-                return json.load(f)
-            raise ValueError(f"Unsupported config format: {suffix}")
+        return parse_config_file(config_path)
 
     def get_defaults(self) -> dict[str, Any]:
         """Get default configuration.
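The new load() delegates all format detection and error handling to parse_config_file from src/core/config_parser.py (+99 lines in the file list, not shown in this diff). A minimal sketch of the contract the rewritten load() appears to rely on, assuming ConfigParseError is defined in that module and wraps the YAML/JSON errors the old code raised directly:

# Hypothetical sketch of src/core/config_parser.py; the shipped module is not
# shown in this diff, so everything beyond the parse_config_file and
# ConfigParseError names is an assumption.
import json
from pathlib import Path
from typing import Any

import yaml


class ConfigParseError(Exception):
    """Raised when a config file cannot be parsed or has an unsupported format."""


def parse_config_file(config_path: Path) -> dict[str, Any]:
    suffix = config_path.suffix.lower()
    try:
        with config_path.open(encoding="utf-8") as f:
            if suffix in (".yaml", ".yml"):
                return yaml.safe_load(f) or {}  # empty dict for null YAML
            if suffix == ".json":
                return json.load(f)
    except (yaml.YAMLError, json.JSONDecodeError) as exc:
        raise ConfigParseError(f"Failed to parse {config_path}: {exc}") from exc
    raise ConfigParseError(f"Unsupported config format: {suffix}")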
src/linters/dry/__init__.py
ADDED
@@ -0,0 +1,23 @@
"""
Purpose: DRY (Don't Repeat Yourself) linter module exports

Scope: Module-level exports for DRY linter components

Overview: Provides centralized exports for the DRY linter module components. Exposes the main
    DRYRule class for duplicate code detection, configuration dataclass, and analyzer components.
    Simplifies imports for consumers by providing a single import point for all DRY linter
    functionality. Follows the established pattern from nesting and SRP linters.

Dependencies: linter.DRYRule, config.DRYConfig

Exports: DRYRule (main rule class), DRYConfig (configuration)

Interfaces: Module-level __all__ list defining public API

Implementation: Standard Python module with explicit exports via __all__
"""

from .config import DRYConfig
from .linter import DRYRule

__all__ = ["DRYRule", "DRYConfig"]
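Consumer-side usage is then a single import. A short sketch; it assumes DRYConfig is a dataclass whose fields include min_duplicate_lines (the field BaseTokenAnalyzer reads below) and that DRYRule needs no constructor arguments:

# Illustrative only: DRYConfig's full field set and DRYRule's constructor are
# defined in config.py / linter.py, which this section does not show.
from src.linters.dry import DRYConfig, DRYRule

config = DRYConfig(min_duplicate_lines=5)
rule = DRYRule()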
src/linters/dry/base_token_analyzer.py
ADDED
@@ -0,0 +1,76 @@
"""
Purpose: Base class for token-based duplicate code analysis

Scope: Common duplicate detection workflow for Python and TypeScript analyzers

Overview: Provides shared infrastructure for token-based duplicate code detection across different
    programming languages. Implements common workflow of tokenization, rolling hash window generation,
    and CodeBlock creation. Subclasses provide language-specific filtering (e.g., interface filtering
    for TypeScript). Eliminates duplication between PythonDuplicateAnalyzer and TypeScriptDuplicateAnalyzer
    by extracting shared analyze() method pattern and CodeBlock creation logic.

Dependencies: TokenHasher, CodeBlock, DRYConfig, pathlib.Path

Exports: BaseTokenAnalyzer class

Interfaces: BaseTokenAnalyzer.analyze(file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]

Implementation: Template method pattern with extension point for language-specific block filtering
"""

from pathlib import Path

from .cache import CodeBlock
from .config import DRYConfig
from .token_hasher import TokenHasher


class BaseTokenAnalyzer:
    """Base analyzer for token-based duplicate detection."""

    def __init__(self) -> None:
        """Initialize analyzer with token hasher."""
        self._hasher = TokenHasher()

    def analyze(self, file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]:
        """Analyze file for duplicate code blocks.

        Args:
            file_path: Path to source file
            content: File content
            config: DRY configuration

        Returns:
            List of CodeBlock instances with hash values
        """
        lines = self._hasher.tokenize(content)
        windows = self._hasher.rolling_hash(lines, config.min_duplicate_lines)

        blocks = []
        for hash_val, start_line, end_line, snippet in windows:
            if self._should_include_block(content, start_line, end_line):
                block = CodeBlock(
                    file_path=file_path,
                    start_line=start_line,
                    end_line=end_line,
                    snippet=snippet,
                    hash_value=hash_val,
                )
                blocks.append(block)

        return blocks

    def _should_include_block(self, content: str, start_line: int, end_line: int) -> bool:
        """Determine if block should be included.

        Extension point for language-specific filtering.

        Args:
            content: File content
            start_line: Block start line
            end_line: Block end line

        Returns:
            True if block should be included, False to filter out
        """
        return True
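_should_include_block is the template-method extension point: the base class keeps every hashed window, and a subclass narrows that. A hypothetical subclass for illustration (not one of the shipped analyzers), dropping windows that are mostly comment lines:

import re

from src.linters.dry.base_token_analyzer import BaseTokenAnalyzer


class CommentSkippingAnalyzer(BaseTokenAnalyzer):
    """Hypothetical subclass: exclude windows dominated by comment lines."""

    _comment_line = re.compile(r"^\s*#")

    def _should_include_block(self, content: str, start_line: int, end_line: int) -> bool:
        # Same 1-based line-slicing convention the DRY filters use
        lines = content.split("\n")[start_line - 1 : end_line]
        comment_count = sum(1 for line in lines if self._comment_line.match(line))
        return comment_count <= len(lines) // 2  # keep blocks that are mostly code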
src/linters/dry/block_filter.py
ADDED
@@ -0,0 +1,262 @@
"""
Purpose: Extensible filter system for DRY duplicate detection

Scope: Filters out false positive duplications (API boilerplate, keyword arguments, etc.)

Overview: Provides an extensible architecture for filtering duplicate code blocks that are
    not meaningful duplications. Includes base filter interface and built-in filters for
    common false positive patterns like keyword-only function arguments, import groups,
    and API call boilerplate. New filters can be added by subclassing BaseBlockFilter.

Dependencies: ast, re, typing

Exports: BaseBlockFilter, BlockFilterRegistry, KeywordArgumentFilter, ImportGroupFilter

Interfaces: BaseBlockFilter.should_filter(code_block, file_content) -> bool

Implementation: Strategy pattern with filter registry for extensibility
"""

import ast
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Protocol


class CodeBlock(Protocol):
    """Protocol for code blocks (matches cache.CodeBlock)."""

    file_path: Path
    start_line: int
    end_line: int
    snippet: str
    hash_value: int


class BaseBlockFilter(ABC):
    """Base class for duplicate block filters."""

    @abstractmethod
    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
        """Determine if a code block should be filtered out.

        Args:
            block: Code block to evaluate
            file_content: Full file content for context

        Returns:
            True if block should be filtered (not reported as duplicate)
        """
        pass

    @abstractmethod
    def get_name(self) -> str:
        """Get filter name for configuration and logging."""
        pass


class KeywordArgumentFilter(BaseBlockFilter):
    """Filters blocks that are primarily keyword arguments in function calls.

    Detects patterns like:
        message=message,
        severity=Severity.ERROR,
        suggestion=suggestion,

    These are common in builder patterns and API calls.
    """

    def __init__(self, threshold: float = 0.8):
        """Initialize filter.

        Args:
            threshold: Minimum percentage of lines that must be keyword args (0.0-1.0)
        """
        self.threshold = threshold
        # Pattern: optional whitespace, identifier, =, value, optional comma
        self._kwarg_pattern = re.compile(r"^\s*\w+\s*=\s*.+,?\s*$")

    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
        """Check if block is primarily keyword arguments.

        Args:
            block: Code block to evaluate
            file_content: Full file content for context

        Returns:
            True if block should be filtered
        """
        lines = file_content.split("\n")[block.start_line - 1 : block.end_line]

        if not lines:
            return False

        # Count lines that match keyword argument pattern
        kwarg_lines = sum(1 for line in lines if self._kwarg_pattern.match(line))

        # Filter if most lines are keyword arguments
        ratio = kwarg_lines / len(lines)
        if ratio >= self.threshold:
            return self._is_inside_function_call(block, file_content)

        return False

    def _is_inside_function_call(self, block: CodeBlock, file_content: str) -> bool:
        """Verify the block is inside a function call, not standalone code."""
        try:
            tree = ast.parse(file_content)
        except SyntaxError:
            return False

        # Find if any Call node contains the block
        for node in ast.walk(tree):
            if isinstance(node, ast.Call) and self._check_multiline_containment(node, block):
                return True
        return False

    @staticmethod
    def _check_multiline_containment(node: ast.Call, block: CodeBlock) -> bool:
        """Check if Call node is multiline and contains block."""
        if not KeywordArgumentFilter._has_valid_line_info(node):
            return False

        # After validation, these are guaranteed to be non-None integers
        # Use type: ignore to suppress MyPy's inability to understand runtime validation
        is_multiline = node.lineno < node.end_lineno  # type: ignore[operator]
        contains_block = (
            node.lineno <= block.start_line and node.end_lineno >= block.end_line  # type: ignore[operator]
        )
        return is_multiline and contains_block

    @staticmethod
    def _has_valid_line_info(node: ast.Call) -> bool:
        """Check if node has valid line information.

        Args:
            node: AST Call node to check

        Returns:
            True if node has valid line number attributes
        """
        if not hasattr(node, "lineno"):
            return False
        if not hasattr(node, "end_lineno"):
            return False
        if node.lineno is None:
            return False
        if node.end_lineno is None:
            return False
        return True

    def get_name(self) -> str:
        """Get filter name."""
        return "keyword_argument_filter"


class ImportGroupFilter(BaseBlockFilter):
    """Filters blocks that are just import statements.

    Import organization often creates similar patterns that aren't meaningful duplication.
    """

    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
        """Check if block is only import statements.

        Args:
            block: Code block to evaluate
            file_content: Full file content

        Returns:
            True if block should be filtered
        """
        lines = file_content.split("\n")[block.start_line - 1 : block.end_line]

        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue
            if not (stripped.startswith("import ") or stripped.startswith("from ")):
                return False

        return True

    def get_name(self) -> str:
        """Get filter name."""
        return "import_group_filter"


class BlockFilterRegistry:
    """Registry for managing duplicate block filters."""

    def __init__(self) -> None:
        """Initialize empty registry."""
        self._filters: list[BaseBlockFilter] = []
        self._enabled_filters: set[str] = set()

    def register(self, filter_instance: BaseBlockFilter) -> None:
        """Register a filter.

        Args:
            filter_instance: Filter to register
        """
        self._filters.append(filter_instance)
        self._enabled_filters.add(filter_instance.get_name())

    def enable_filter(self, filter_name: str) -> None:
        """Enable a specific filter by name.

        Args:
            filter_name: Name of filter to enable
        """
        self._enabled_filters.add(filter_name)

    def disable_filter(self, filter_name: str) -> None:
        """Disable a specific filter by name.

        Args:
            filter_name: Name of filter to disable
        """
        self._enabled_filters.discard(filter_name)

    def should_filter_block(self, block: CodeBlock, file_content: str) -> bool:
        """Check if any enabled filter wants to filter this block.

        Args:
            block: Code block to evaluate
            file_content: Full file content

        Returns:
            True if block should be filtered out
        """
        for filter_instance in self._filters:
            if filter_instance.get_name() not in self._enabled_filters:
                continue

            if filter_instance.should_filter(block, file_content):
                return True

        return False

    def get_enabled_filters(self) -> list[str]:
        """Get list of enabled filter names.

        Returns:
            List of enabled filter names
        """
        return sorted(self._enabled_filters)


def create_default_registry() -> BlockFilterRegistry:
    """Create registry with default filters.

    Returns:
        BlockFilterRegistry with common filters registered
    """
    registry = BlockFilterRegistry()

    # Register built-in filters
    registry.register(KeywordArgumentFilter(threshold=0.8))
    registry.register(ImportGroupFilter())

    return registry
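A usage sketch for the registry; the file content, block coordinates, and hash are invented for illustration, and CodeBlock is the concrete dataclass from cache.py (which satisfies the Protocol above):

from pathlib import Path

from src.linters.dry.block_filter import create_default_registry
from src.linters.dry.cache import CodeBlock

registry = create_default_registry()
registry.disable_filter("import_group_filter")  # e.g. keep import blocks reportable

content = "result = build(\n    message=message,\n    severity=severity,\n)\n"
block = CodeBlock(Path("example.py"), start_line=2, end_line=3, snippet="...", hash_value=0)

# Lines 2-3 are all keyword arguments inside a multiline call, so the
# KeywordArgumentFilter suppresses the block as API boilerplate.
assert registry.should_filter_block(block, content)
print(registry.get_enabled_filters())  # ['keyword_argument_filter']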
src/linters/dry/block_grouper.py
ADDED
@@ -0,0 +1,59 @@
"""
Purpose: Block grouping utilities for duplicate detection

Scope: Groups code blocks by file path

Overview: Provides grouping utilities for organizing code blocks by file. Used by ViolationDeduplicator
    to process blocks on a per-file basis for overlap detection. Separates grouping logic to maintain
    SRP compliance.

Dependencies: CodeBlock, Violation

Exports: BlockGrouper class

Interfaces: BlockGrouper.group_blocks_by_file(blocks), group_violations_by_file(violations)

Implementation: Simple dictionary-based grouping by file path
"""

from pathlib import Path

from src.core.types import Violation

from .cache import CodeBlock


class BlockGrouper:
    """Groups blocks and violations by file path."""

    def group_blocks_by_file(self, blocks: list[CodeBlock]) -> dict[Path, list[CodeBlock]]:
        """Group blocks by file path.

        Args:
            blocks: List of code blocks

        Returns:
            Dictionary mapping file paths to lists of blocks
        """
        grouped: dict[Path, list[CodeBlock]] = {}
        for block in blocks:
            if block.file_path not in grouped:
                grouped[block.file_path] = []
            grouped[block.file_path].append(block)
        return grouped

    def group_violations_by_file(self, violations: list[Violation]) -> dict[str, list[Violation]]:
        """Group violations by file path.

        Args:
            violations: List of violations

        Returns:
            Dictionary mapping file paths to lists of violations
        """
        grouped: dict[str, list[Violation]] = {}
        for violation in violations:
            if violation.file_path not in grouped:
                grouped[violation.file_path] = []
            grouped[violation.file_path].append(violation)
        return grouped
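Usage sketch (paths and hashes illustrative). Note the asymmetry in the two methods' key types: group_blocks_by_file keys by Path, while group_violations_by_file keys by the violation's string file_path, per the signatures above:

from pathlib import Path

from src.linters.dry.block_grouper import BlockGrouper
from src.linters.dry.cache import CodeBlock

grouper = BlockGrouper()
blocks = [
    CodeBlock(Path("a.py"), 1, 5, "snippet", 111),
    CodeBlock(Path("a.py"), 10, 14, "snippet", 222),
    CodeBlock(Path("b.py"), 3, 7, "snippet", 111),
]
by_file = grouper.group_blocks_by_file(blocks)
assert len(by_file[Path("a.py")]) == 2
assert len(by_file[Path("b.py")]) == 1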
src/linters/dry/cache.py
ADDED
@@ -0,0 +1,218 @@
"""
Purpose: SQLite cache manager for DRY linter with mtime-based invalidation

Scope: Code block storage, cache operations, and duplicate detection queries

Overview: Implements persistent caching layer for duplicate code detection using SQLite database.
    Stores code blocks with hash values, file locations, and metadata. Provides mtime-based cache
    invalidation to detect stale entries. Serves dual purpose as both cache (avoid re-hashing) and
    hash table (query duplicates across project). Includes indexes for fast hash lookups enabling
    cross-file duplicate detection with minimal overhead.

Dependencies: Python sqlite3 module (stdlib), pathlib.Path, dataclasses

Exports: CodeBlock dataclass, DRYCache class

Interfaces: DRYCache.__init__, is_fresh, load, save, find_duplicates_by_hash, get_blocks_for_file,
    add_blocks, cleanup_stale, close

Implementation: SQLite with two tables (files, code_blocks), indexed on hash_value for performance,
    ACID transactions for reliability, foreign key constraints for data integrity
"""

import sqlite3
from dataclasses import dataclass
from pathlib import Path

from .cache_query import CacheQueryService


@dataclass
class CodeBlock:
    """Represents a code block location with hash."""

    file_path: Path
    start_line: int
    end_line: int
    snippet: str
    hash_value: int


class DRYCache:
    """SQLite-backed cache for duplicate detection."""

    SCHEMA_VERSION = 1

    def __init__(self, cache_path: Path) -> None:
        """Initialize cache with SQLite database.

        Args:
            cache_path: Path to SQLite database file
        """
        # Ensure parent directory exists
        cache_path.parent.mkdir(parents=True, exist_ok=True)

        self.db = sqlite3.connect(str(cache_path))
        self._query_service = CacheQueryService()

        # Create schema
        self.db.execute(
            """CREATE TABLE IF NOT EXISTS files (
                file_path TEXT PRIMARY KEY,
                mtime REAL NOT NULL,
                hash_count INTEGER,
                last_scanned TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )"""
        )

        self.db.execute(
            """CREATE TABLE IF NOT EXISTS code_blocks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT NOT NULL,
                hash_value INTEGER NOT NULL,
                start_line INTEGER NOT NULL,
                end_line INTEGER NOT NULL,
                snippet TEXT NOT NULL,
                FOREIGN KEY (file_path) REFERENCES files(file_path) ON DELETE CASCADE
            )"""
        )

        self.db.execute("CREATE INDEX IF NOT EXISTS idx_hash_value ON code_blocks(hash_value)")
        self.db.execute("CREATE INDEX IF NOT EXISTS idx_file_path ON code_blocks(file_path)")

        self.db.commit()

    def is_fresh(self, file_path: Path, current_mtime: float) -> bool:
        """Check if cached data is fresh (mtime matches).

        Args:
            file_path: Path to file
            current_mtime: Current modification time

        Returns:
            True if cache is fresh, False if stale or missing
        """
        cursor = self.db.execute("SELECT mtime FROM files WHERE file_path = ?", (str(file_path),))
        row = cursor.fetchone()

        if not row:
            return False  # Not in cache

        cached_mtime = row[0]
        return cached_mtime == current_mtime

    def load(self, file_path: Path) -> list[CodeBlock]:
        """Load cached code blocks for file.

        Args:
            file_path: Path to file

        Returns:
            List of CodeBlock instances from cache
        """
        cursor = self.db.execute(
            """SELECT hash_value, start_line, end_line, snippet
               FROM code_blocks
               WHERE file_path = ?""",
            (str(file_path),),
        )

        blocks = []
        for hash_val, start, end, snippet in cursor:
            block = CodeBlock(
                file_path=file_path,
                start_line=start,
                end_line=end,
                snippet=snippet,
                hash_value=hash_val,
            )
            blocks.append(block)

        return blocks

    def save(self, file_path: Path, mtime: float, blocks: list[CodeBlock]) -> None:
        """Save code blocks to cache.

        Args:
            file_path: Path to file
            mtime: File modification time
            blocks: List of CodeBlock instances to cache
        """
        # Delete old data for this file
        self.db.execute("DELETE FROM files WHERE file_path = ?", (str(file_path),))

        # Insert file metadata
        self.db.execute(
            "INSERT INTO files (file_path, mtime, hash_count) VALUES (?, ?, ?)",
            (str(file_path), mtime, len(blocks)),
        )

        # Insert code blocks
        for block in blocks:
            self.db.execute(
                """INSERT INTO code_blocks
                   (file_path, hash_value, start_line, end_line, snippet)
                   VALUES (?, ?, ?, ?, ?)""",
                (
                    str(file_path),
                    block.hash_value,
                    block.start_line,
                    block.end_line,
                    block.snippet,
                ),
            )

        self.db.commit()

    def cleanup_stale(self, max_age_days: int) -> None:
        """Remove cache entries older than max_age_days.

        Args:
            max_age_days: Maximum age in days for cache entries
        """
        # Use parameterized query to prevent SQL injection
        self.db.execute(
            """DELETE FROM files
               WHERE last_scanned < datetime('now', ? || ' days')""",
            (f"-{max_age_days}",),
        )

        # Vacuum to reclaim space
        self.db.execute("VACUUM")
        self.db.commit()

    def find_duplicates_by_hash(self, hash_value: int) -> list[CodeBlock]:
        """Find all code blocks with the given hash value.

        Args:
            hash_value: Hash value to search for

        Returns:
            List of ALL CodeBlock instances with this hash (from all files)
        """
        rows = self._query_service.find_blocks_by_hash(self.db, hash_value)

        blocks = []
        for file_path_str, start, end, snippet, hash_val in rows:
            block = CodeBlock(
                file_path=Path(file_path_str),
                start_line=start,
                end_line=end,
                snippet=snippet,
                hash_value=hash_val,
            )
            blocks.append(block)

        return blocks

    def get_duplicate_hashes(self) -> list[int]:
        """Get all hash values that appear 2+ times.

        Returns:
            List of hash values with 2 or more occurrences
        """
        return self._query_service.get_duplicate_hashes(self.db)

    def close(self) -> None:
        """Close database connection."""
        self.db.close()
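An end-to-end lifecycle sketch (paths, block, and hash are invented; CacheQueryService's SQL lives in cache_query.py, which this section does not show). One caveat worth noting: Python's sqlite3 leaves foreign-key enforcement off unless PRAGMA foreign_keys = ON is issued, so the ON DELETE CASCADE in the schema above only takes effect if a caller enables that pragma.

from pathlib import Path

from src.linters.dry.cache import CodeBlock, DRYCache

source = Path("src/example.py")
cache = DRYCache(Path(".thailint-cache/dry.db"))

mtime = source.stat().st_mtime
if cache.is_fresh(source, mtime):
    blocks = cache.load(source)  # reuse hashed windows from a previous run
else:
    blocks = [CodeBlock(source, 1, 5, "def f(): ...", hash_value=12345)]  # re-analyzed
    cache.save(source, mtime, blocks)

for dup_hash in cache.get_duplicate_hashes():  # hashes seen 2+ times project-wide
    occurrences = cache.find_duplicates_by_hash(dup_hash)

cache.close()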