thailint-0.1.6-py3-none-any.whl → thailint-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. src/__init__.py +7 -2
  2. src/analyzers/__init__.py +23 -0
  3. src/analyzers/typescript_base.py +148 -0
  4. src/api.py +1 -1
  5. src/cli.py +498 -141
  6. src/config.py +6 -31
  7. src/core/base.py +12 -0
  8. src/core/cli_utils.py +206 -0
  9. src/core/config_parser.py +99 -0
  10. src/core/linter_utils.py +168 -0
  11. src/core/registry.py +17 -92
  12. src/core/rule_discovery.py +132 -0
  13. src/core/violation_builder.py +122 -0
  14. src/linter_config/ignore.py +112 -40
  15. src/linter_config/loader.py +3 -13
  16. src/linters/dry/__init__.py +23 -0
  17. src/linters/dry/base_token_analyzer.py +76 -0
  18. src/linters/dry/block_filter.py +262 -0
  19. src/linters/dry/block_grouper.py +59 -0
  20. src/linters/dry/cache.py +218 -0
  21. src/linters/dry/cache_query.py +61 -0
  22. src/linters/dry/config.py +130 -0
  23. src/linters/dry/config_loader.py +44 -0
  24. src/linters/dry/deduplicator.py +120 -0
  25. src/linters/dry/duplicate_storage.py +126 -0
  26. src/linters/dry/file_analyzer.py +127 -0
  27. src/linters/dry/inline_ignore.py +140 -0
  28. src/linters/dry/linter.py +170 -0
  29. src/linters/dry/python_analyzer.py +517 -0
  30. src/linters/dry/storage_initializer.py +51 -0
  31. src/linters/dry/token_hasher.py +115 -0
  32. src/linters/dry/typescript_analyzer.py +590 -0
  33. src/linters/dry/violation_builder.py +74 -0
  34. src/linters/dry/violation_filter.py +91 -0
  35. src/linters/dry/violation_generator.py +174 -0
  36. src/linters/file_placement/config_loader.py +86 -0
  37. src/linters/file_placement/directory_matcher.py +80 -0
  38. src/linters/file_placement/linter.py +252 -472
  39. src/linters/file_placement/path_resolver.py +61 -0
  40. src/linters/file_placement/pattern_matcher.py +55 -0
  41. src/linters/file_placement/pattern_validator.py +106 -0
  42. src/linters/file_placement/rule_checker.py +229 -0
  43. src/linters/file_placement/violation_factory.py +177 -0
  44. src/linters/nesting/config.py +13 -3
  45. src/linters/nesting/linter.py +76 -152
  46. src/linters/nesting/typescript_analyzer.py +38 -102
  47. src/linters/nesting/typescript_function_extractor.py +130 -0
  48. src/linters/nesting/violation_builder.py +139 -0
  49. src/linters/srp/__init__.py +99 -0
  50. src/linters/srp/class_analyzer.py +113 -0
  51. src/linters/srp/config.py +76 -0
  52. src/linters/srp/heuristics.py +89 -0
  53. src/linters/srp/linter.py +225 -0
  54. src/linters/srp/metrics_evaluator.py +47 -0
  55. src/linters/srp/python_analyzer.py +72 -0
  56. src/linters/srp/typescript_analyzer.py +75 -0
  57. src/linters/srp/typescript_metrics_calculator.py +90 -0
  58. src/linters/srp/violation_builder.py +117 -0
  59. src/orchestrator/core.py +42 -7
  60. src/utils/__init__.py +4 -0
  61. src/utils/project_root.py +84 -0
  62. {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/METADATA +414 -63
  63. thailint-0.2.0.dist-info/RECORD +75 -0
  64. src/.ai/layout.yaml +0 -48
  65. thailint-0.1.6.dist-info/RECORD +0 -28
  66. {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/LICENSE +0 -0
  67. {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/WHEEL +0 -0
  68. {thailint-0.1.6.dist-info → thailint-0.2.0.dist-info}/entry_points.txt +0 -0
src/linter_config/loader.py
@@ -25,11 +25,10 @@ Implementation: Extension-based format detection (.yaml/.yml vs .json), yaml.safe_load
 for security, empty dict handling for null YAML, ValueError for unsupported formats
 """
 
-import json
 from pathlib import Path
 from typing import Any
 
-import yaml
+from src.core.config_parser import parse_config_file
 
 
 class LinterConfigLoader:
@@ -49,21 +48,12 @@ class LinterConfigLoader:
             Configuration dictionary.
 
         Raises:
-            ValueError: If file format is unsupported.
-            yaml.YAMLError: If YAML is malformed.
-            json.JSONDecodeError: If JSON is malformed.
+            ConfigParseError: If file format is unsupported or parsing fails.
         """
         if not config_path.exists():
            return self.get_defaults()
 
-        suffix = config_path.suffix.lower()
-
-        with config_path.open(encoding="utf-8") as f:
-            if suffix in [".yaml", ".yml"]:
-                return yaml.safe_load(f) or {}
-            if suffix == ".json":
-                return json.load(f)
-            raise ValueError(f"Unsupported config format: {suffix}")
+        return parse_config_file(config_path)
 
     def get_defaults(self) -> dict[str, Any]:
        """Get default configuration.
src/linters/dry/__init__.py
@@ -0,0 +1,23 @@
+"""
+Purpose: DRY (Don't Repeat Yourself) linter module exports
+
+Scope: Module-level exports for DRY linter components
+
+Overview: Provides centralized exports for the DRY linter module components. Exposes the main
+DRYRule class for duplicate code detection, configuration dataclass, and analyzer components.
+Simplifies imports for consumers by providing a single import point for all DRY linter
+functionality. Follows the established pattern from nesting and SRP linters.
+
+Dependencies: linter.DRYRule, config.DRYConfig
+
+Exports: DRYRule (main rule class), DRYConfig (configuration)
+
+Interfaces: Module-level __all__ list defining public API
+
+Implementation: Standard Python module with explicit exports via __all__
+"""
+
+from .config import DRYConfig
+from .linter import DRYRule
+
+__all__ = ["DRYRule", "DRYConfig"]
src/linters/dry/base_token_analyzer.py
@@ -0,0 +1,76 @@
+"""
+Purpose: Base class for token-based duplicate code analysis
+
+Scope: Common duplicate detection workflow for Python and TypeScript analyzers
+
+Overview: Provides shared infrastructure for token-based duplicate code detection across different
+programming languages. Implements common workflow of tokenization, rolling hash window generation,
+and CodeBlock creation. Subclasses provide language-specific filtering (e.g., interface filtering
+for TypeScript). Eliminates duplication between PythonDuplicateAnalyzer and TypeScriptDuplicateAnalyzer
+by extracting shared analyze() method pattern and CodeBlock creation logic.
+
+Dependencies: TokenHasher, CodeBlock, DRYConfig, pathlib.Path
+
+Exports: BaseTokenAnalyzer class
+
+Interfaces: BaseTokenAnalyzer.analyze(file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]
+
+Implementation: Template method pattern with extension point for language-specific block filtering
+"""
+
+from pathlib import Path
+
+from .cache import CodeBlock
+from .config import DRYConfig
+from .token_hasher import TokenHasher
+
+
+class BaseTokenAnalyzer:
+    """Base analyzer for token-based duplicate detection."""
+
+    def __init__(self) -> None:
+        """Initialize analyzer with token hasher."""
+        self._hasher = TokenHasher()
+
+    def analyze(self, file_path: Path, content: str, config: DRYConfig) -> list[CodeBlock]:
+        """Analyze file for duplicate code blocks.
+
+        Args:
+            file_path: Path to source file
+            content: File content
+            config: DRY configuration
+
+        Returns:
+            List of CodeBlock instances with hash values
+        """
+        lines = self._hasher.tokenize(content)
+        windows = self._hasher.rolling_hash(lines, config.min_duplicate_lines)
+
+        blocks = []
+        for hash_val, start_line, end_line, snippet in windows:
+            if self._should_include_block(content, start_line, end_line):
+                block = CodeBlock(
+                    file_path=file_path,
+                    start_line=start_line,
+                    end_line=end_line,
+                    snippet=snippet,
+                    hash_value=hash_val,
+                )
+                blocks.append(block)
+
+        return blocks
+
+    def _should_include_block(self, content: str, start_line: int, end_line: int) -> bool:
+        """Determine if block should be included.
+
+        Extension point for language-specific filtering.
+
+        Args:
+            content: File content
+            start_line: Block start line
+            end_line: Block end line
+
+        Returns:
+            True if block should be included, False to filter out
+        """
+        return True
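The template method keeps analyze() shared while each language vetoes individual hash windows. A hedged sketch of how a subclass might use the extension point (the real language-specific filtering lives in python_analyzer.py and typescript_analyzer.py, not shown here; this subclass is illustrative only):

```python
from .base_token_analyzer import BaseTokenAnalyzer


class CommentSkippingAnalyzer(BaseTokenAnalyzer):
    """Illustrative subclass: drop windows that are mostly comment lines."""

    def _should_include_block(self, content: str, start_line: int, end_line: int) -> bool:
        # Same 1-based slicing convention as the filters in block_filter.py
        lines = content.split("\n")[start_line - 1 : end_line]
        if not lines:
            return True
        comment_lines = sum(1 for line in lines if line.lstrip().startswith("#"))
        # Keep the window only if fewer than half its lines are comments
        return comment_lines < len(lines) / 2
```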
src/linters/dry/block_filter.py
@@ -0,0 +1,262 @@
+"""
+Purpose: Extensible filter system for DRY duplicate detection
+
+Scope: Filters out false positive duplications (API boilerplate, keyword arguments, etc.)
+
+Overview: Provides an extensible architecture for filtering duplicate code blocks that are
+not meaningful duplications. Includes base filter interface and built-in filters for
+common false positive patterns like keyword-only function arguments, import groups,
+and API call boilerplate. New filters can be added by subclassing BaseBlockFilter.
+
+Dependencies: ast, re, typing
+
+Exports: BaseBlockFilter, BlockFilterRegistry, KeywordArgumentFilter, ImportGroupFilter
+
+Interfaces: BaseBlockFilter.should_filter(code_block, file_content) -> bool
+
+Implementation: Strategy pattern with filter registry for extensibility
+"""
+
+import ast
+import re
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Protocol
+
+
+class CodeBlock(Protocol):
+    """Protocol for code blocks (matches cache.CodeBlock)."""
+
+    file_path: Path
+    start_line: int
+    end_line: int
+    snippet: str
+    hash_value: int
+
+
+class BaseBlockFilter(ABC):
+    """Base class for duplicate block filters."""
+
+    @abstractmethod
+    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
+        """Determine if a code block should be filtered out.
+
+        Args:
+            block: Code block to evaluate
+            file_content: Full file content for context
+
+        Returns:
+            True if block should be filtered (not reported as duplicate)
+        """
+        pass
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Get filter name for configuration and logging."""
+        pass
+
+
+class KeywordArgumentFilter(BaseBlockFilter):
+    """Filters blocks that are primarily keyword arguments in function calls.
+
+    Detects patterns like:
+        message=message,
+        severity=Severity.ERROR,
+        suggestion=suggestion,
+
+    These are common in builder patterns and API calls.
+    """
+
+    def __init__(self, threshold: float = 0.8):
+        """Initialize filter.
+
+        Args:
+            threshold: Minimum percentage of lines that must be keyword args (0.0-1.0)
+        """
+        self.threshold = threshold
+        # Pattern: optional whitespace, identifier, =, value, optional comma
+        self._kwarg_pattern = re.compile(r"^\s*\w+\s*=\s*.+,?\s*$")
+
+    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
+        """Check if block is primarily keyword arguments.
+
+        Args:
+            block: Code block to evaluate
+            file_content: Full file content for context
+
+        Returns:
+            True if block should be filtered
+        """
+        lines = file_content.split("\n")[block.start_line - 1 : block.end_line]
+
+        if not lines:
+            return False
+
+        # Count lines that match keyword argument pattern
+        kwarg_lines = sum(1 for line in lines if self._kwarg_pattern.match(line))
+
+        # Filter if most lines are keyword arguments
+        ratio = kwarg_lines / len(lines)
+        if ratio >= self.threshold:
+            return self._is_inside_function_call(block, file_content)
+
+        return False
+
+    def _is_inside_function_call(self, block: CodeBlock, file_content: str) -> bool:
+        """Verify the block is inside a function call, not standalone code."""
+        try:
+            tree = ast.parse(file_content)
+        except SyntaxError:
+            return False
+
+        # Find if any Call node contains the block
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Call) and self._check_multiline_containment(node, block):
+                return True
+        return False
+
+    @staticmethod
+    def _check_multiline_containment(node: ast.Call, block: CodeBlock) -> bool:
+        """Check if Call node is multiline and contains block."""
+        if not KeywordArgumentFilter._has_valid_line_info(node):
+            return False
+
+        # After validation, these are guaranteed to be non-None integers
+        # Use type: ignore to suppress MyPy's inability to understand runtime validation
+        is_multiline = node.lineno < node.end_lineno  # type: ignore[operator]
+        contains_block = (
+            node.lineno <= block.start_line and node.end_lineno >= block.end_line  # type: ignore[operator]
+        )
+        return is_multiline and contains_block
+
+    @staticmethod
+    def _has_valid_line_info(node: ast.Call) -> bool:
+        """Check if node has valid line information.
+
+        Args:
+            node: AST Call node to check
+
+        Returns:
+            True if node has valid line number attributes
+        """
+        if not hasattr(node, "lineno"):
+            return False
+        if not hasattr(node, "end_lineno"):
+            return False
+        if node.lineno is None:
+            return False
+        if node.end_lineno is None:
+            return False
+        return True
+
+    def get_name(self) -> str:
+        """Get filter name."""
+        return "keyword_argument_filter"
+
+
+class ImportGroupFilter(BaseBlockFilter):
+    """Filters blocks that are just import statements.
+
+    Import organization often creates similar patterns that aren't meaningful duplication.
+    """
+
+    def should_filter(self, block: CodeBlock, file_content: str) -> bool:
+        """Check if block is only import statements.
+
+        Args:
+            block: Code block to evaluate
+            file_content: Full file content
+
+        Returns:
+            True if block should be filtered
+        """
+        lines = file_content.split("\n")[block.start_line - 1 : block.end_line]
+
+        for line in lines:
+            stripped = line.strip()
+            if not stripped:
+                continue
+            if not (stripped.startswith("import ") or stripped.startswith("from ")):
+                return False
+
+        return True
+
+    def get_name(self) -> str:
+        """Get filter name."""
+        return "import_group_filter"
+
+
+class BlockFilterRegistry:
+    """Registry for managing duplicate block filters."""
+
+    def __init__(self) -> None:
+        """Initialize empty registry."""
+        self._filters: list[BaseBlockFilter] = []
+        self._enabled_filters: set[str] = set()
+
+    def register(self, filter_instance: BaseBlockFilter) -> None:
+        """Register a filter.
+
+        Args:
+            filter_instance: Filter to register
+        """
+        self._filters.append(filter_instance)
+        self._enabled_filters.add(filter_instance.get_name())
+
+    def enable_filter(self, filter_name: str) -> None:
+        """Enable a specific filter by name.
+
+        Args:
+            filter_name: Name of filter to enable
+        """
+        self._enabled_filters.add(filter_name)
+
+    def disable_filter(self, filter_name: str) -> None:
+        """Disable a specific filter by name.
+
+        Args:
+            filter_name: Name of filter to disable
+        """
+        self._enabled_filters.discard(filter_name)
+
+    def should_filter_block(self, block: CodeBlock, file_content: str) -> bool:
+        """Check if any enabled filter wants to filter this block.
+
+        Args:
+            block: Code block to evaluate
+            file_content: Full file content
+
+        Returns:
+            True if block should be filtered out
+        """
+        for filter_instance in self._filters:
+            if filter_instance.get_name() not in self._enabled_filters:
+                continue
+
+            if filter_instance.should_filter(block, file_content):
+                return True
+
+        return False
+
+    def get_enabled_filters(self) -> list[str]:
+        """Get list of enabled filter names.
+
+        Returns:
+            List of enabled filter names
+        """
+        return sorted(self._enabled_filters)
+
+
+def create_default_registry() -> BlockFilterRegistry:
+    """Create registry with default filters.
+
+    Returns:
+        BlockFilterRegistry with common filters registered
+    """
+    registry = BlockFilterRegistry()
+
+    # Register built-in filters
+    registry.register(KeywordArgumentFilter(threshold=0.8))
+    registry.register(ImportGroupFilter())
+
+    return registry
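The registry is consulted once per candidate block. A usage sketch built only from the APIs above (the source text and hash_value are illustrative; CodeBlock is the dataclass from cache.py):

```python
from pathlib import Path

from src.linters.dry.block_filter import create_default_registry
from src.linters.dry.cache import CodeBlock

registry = create_default_registry()
registry.disable_filter("import_group_filter")  # keep import groups reportable

source = "import os\nimport sys\n"
block = CodeBlock(
    file_path=Path("example.py"),
    start_line=1,
    end_line=2,
    snippet=source,
    hash_value=0,  # arbitrary for this demo
)
# False: the import-group filter is disabled and the kwarg filter does not match
print(registry.should_filter_block(block, source))
```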
src/linters/dry/block_grouper.py
@@ -0,0 +1,59 @@
+"""
+Purpose: Block grouping utilities for duplicate detection
+
+Scope: Groups code blocks by file path
+
+Overview: Provides grouping utilities for organizing code blocks by file. Used by ViolationDeduplicator
+to process blocks on a per-file basis for overlap detection. Separates grouping logic to maintain
+SRP compliance.
+
+Dependencies: CodeBlock, Violation
+
+Exports: BlockGrouper class
+
+Interfaces: BlockGrouper.group_blocks_by_file(blocks), group_violations_by_file(violations)
+
+Implementation: Simple dictionary-based grouping by file path
+"""
+
+from pathlib import Path
+
+from src.core.types import Violation
+
+from .cache import CodeBlock
+
+
+class BlockGrouper:
+    """Groups blocks and violations by file path."""
+
+    def group_blocks_by_file(self, blocks: list[CodeBlock]) -> dict[Path, list[CodeBlock]]:
+        """Group blocks by file path.
+
+        Args:
+            blocks: List of code blocks
+
+        Returns:
+            Dictionary mapping file paths to lists of blocks
+        """
+        grouped: dict[Path, list[CodeBlock]] = {}
+        for block in blocks:
+            if block.file_path not in grouped:
+                grouped[block.file_path] = []
+            grouped[block.file_path].append(block)
+        return grouped
+
+    def group_violations_by_file(self, violations: list[Violation]) -> dict[str, list[Violation]]:
+        """Group violations by file path.
+
+        Args:
+            violations: List of violations
+
+        Returns:
+            Dictionary mapping file paths to lists of violations
+        """
+        grouped: dict[str, list[Violation]] = {}
+        for violation in violations:
+            if violation.file_path not in grouped:
+                grouped[violation.file_path] = []
+            grouped[violation.file_path].append(violation)
+        return grouped
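A small usage sketch showing the shape of the output (paths and blocks are illustrative; CodeBlock fields are positional in dataclass order):

```python
from pathlib import Path

from src.linters.dry.block_grouper import BlockGrouper
from src.linters.dry.cache import CodeBlock

blocks = [
    CodeBlock(Path("a.py"), 1, 5, "x = 1", 111),
    CodeBlock(Path("a.py"), 10, 14, "y = 2", 222),
    CodeBlock(Path("b.py"), 3, 7, "x = 1", 111),
]
grouped = BlockGrouper().group_blocks_by_file(blocks)
print({path: len(group) for path, group in grouped.items()})
# {PosixPath('a.py'): 2, PosixPath('b.py'): 1}
```

The same grouping could be written with dict.setdefault or collections.defaultdict; the explicit form above matches the released code.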
src/linters/dry/cache.py
@@ -0,0 +1,218 @@
+"""
+Purpose: SQLite cache manager for DRY linter with mtime-based invalidation
+
+Scope: Code block storage, cache operations, and duplicate detection queries
+
+Overview: Implements persistent caching layer for duplicate code detection using SQLite database.
+Stores code blocks with hash values, file locations, and metadata. Provides mtime-based cache
+invalidation to detect stale entries. Serves dual purpose as both cache (avoid re-hashing) and
+hash table (query duplicates across project). Includes indexes for fast hash lookups enabling
+cross-file duplicate detection with minimal overhead.
+
+Dependencies: Python sqlite3 module (stdlib), pathlib.Path, dataclasses
+
+Exports: CodeBlock dataclass, DRYCache class
+
+Interfaces: DRYCache.__init__, is_fresh, load, save, find_duplicates_by_hash, get_blocks_for_file,
+add_blocks, cleanup_stale, close
+
+Implementation: SQLite with two tables (files, code_blocks), indexed on hash_value for performance,
+ACID transactions for reliability, foreign key constraints for data integrity
+"""
+
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+
+from .cache_query import CacheQueryService
+
+
+@dataclass
+class CodeBlock:
+    """Represents a code block location with hash."""
+
+    file_path: Path
+    start_line: int
+    end_line: int
+    snippet: str
+    hash_value: int
+
+
+class DRYCache:
+    """SQLite-backed cache for duplicate detection."""
+
+    SCHEMA_VERSION = 1
+
+    def __init__(self, cache_path: Path) -> None:
+        """Initialize cache with SQLite database.
+
+        Args:
+            cache_path: Path to SQLite database file
+        """
+        # Ensure parent directory exists
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+
+        self.db = sqlite3.connect(str(cache_path))
+        self._query_service = CacheQueryService()
+
+        # Create schema
+        self.db.execute(
+            """CREATE TABLE IF NOT EXISTS files (
+                file_path TEXT PRIMARY KEY,
+                mtime REAL NOT NULL,
+                hash_count INTEGER,
+                last_scanned TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )"""
+        )
+
+        self.db.execute(
+            """CREATE TABLE IF NOT EXISTS code_blocks (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                file_path TEXT NOT NULL,
+                hash_value INTEGER NOT NULL,
+                start_line INTEGER NOT NULL,
+                end_line INTEGER NOT NULL,
+                snippet TEXT NOT NULL,
+                FOREIGN KEY (file_path) REFERENCES files(file_path) ON DELETE CASCADE
+            )"""
+        )
+
+        self.db.execute("CREATE INDEX IF NOT EXISTS idx_hash_value ON code_blocks(hash_value)")
+        self.db.execute("CREATE INDEX IF NOT EXISTS idx_file_path ON code_blocks(file_path)")
+
+        self.db.commit()
+
+    def is_fresh(self, file_path: Path, current_mtime: float) -> bool:
+        """Check if cached data is fresh (mtime matches).
+
+        Args:
+            file_path: Path to file
+            current_mtime: Current modification time
+
+        Returns:
+            True if cache is fresh, False if stale or missing
+        """
+        cursor = self.db.execute("SELECT mtime FROM files WHERE file_path = ?", (str(file_path),))
+        row = cursor.fetchone()
+
+        if not row:
+            return False  # Not in cache
+
+        cached_mtime = row[0]
+        return cached_mtime == current_mtime
+
+    def load(self, file_path: Path) -> list[CodeBlock]:
+        """Load cached code blocks for file.
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            List of CodeBlock instances from cache
+        """
+        cursor = self.db.execute(
+            """SELECT hash_value, start_line, end_line, snippet
+            FROM code_blocks
+            WHERE file_path = ?""",
+            (str(file_path),),
+        )
+
+        blocks = []
+        for hash_val, start, end, snippet in cursor:
+            block = CodeBlock(
+                file_path=file_path,
+                start_line=start,
+                end_line=end,
+                snippet=snippet,
+                hash_value=hash_val,
+            )
+            blocks.append(block)
+
+        return blocks
+
+    def save(self, file_path: Path, mtime: float, blocks: list[CodeBlock]) -> None:
+        """Save code blocks to cache.
+
+        Args:
+            file_path: Path to file
+            mtime: File modification time
+            blocks: List of CodeBlock instances to cache
+        """
+        # Delete old data for this file
+        self.db.execute("DELETE FROM files WHERE file_path = ?", (str(file_path),))
+
+        # Insert file metadata
+        self.db.execute(
+            "INSERT INTO files (file_path, mtime, hash_count) VALUES (?, ?, ?)",
+            (str(file_path), mtime, len(blocks)),
+        )
+
+        # Insert code blocks
+        for block in blocks:
+            self.db.execute(
+                """INSERT INTO code_blocks
+                (file_path, hash_value, start_line, end_line, snippet)
+                VALUES (?, ?, ?, ?, ?)""",
+                (
+                    str(file_path),
+                    block.hash_value,
+                    block.start_line,
+                    block.end_line,
+                    block.snippet,
+                ),
+            )
+
+        self.db.commit()
+
+    def cleanup_stale(self, max_age_days: int) -> None:
+        """Remove cache entries older than max_age_days.
+
+        Args:
+            max_age_days: Maximum age in days for cache entries
+        """
+        # Use parameterized query to prevent SQL injection
+        self.db.execute(
+            """DELETE FROM files
+            WHERE last_scanned < datetime('now', ? || ' days')""",
+            (f"-{max_age_days}",),
+        )
+
+        # Vacuum to reclaim space
+        self.db.execute("VACUUM")
+        self.db.commit()
+
+    def find_duplicates_by_hash(self, hash_value: int) -> list[CodeBlock]:
+        """Find all code blocks with the given hash value.
+
+        Args:
+            hash_value: Hash value to search for
+
+        Returns:
+            List of ALL CodeBlock instances with this hash (from all files)
+        """
+        rows = self._query_service.find_blocks_by_hash(self.db, hash_value)
+
+        blocks = []
+        for file_path_str, start, end, snippet, hash_val in rows:
+            block = CodeBlock(
+                file_path=Path(file_path_str),
+                start_line=start,
+                end_line=end,
+                snippet=snippet,
+                hash_value=hash_val,
+            )
+            blocks.append(block)
+
+        return blocks
+
+    def get_duplicate_hashes(self) -> list[int]:
+        """Get all hash values that appear 2+ times.
+
+        Returns:
+            List of hash values with 2 or more occurrences
+        """
+        return self._query_service.get_duplicate_hashes(self.db)
+
+    def close(self) -> None:
+        """Close database connection."""
+        self.db.close()
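End to end, the cache doubles as the cross-file hash table. A usage sketch of the mtime-gated workflow, built only from the methods above (paths, block values, and the fallback hash are illustrative; note also that SQLite enforces ON DELETE CASCADE only when PRAGMA foreign_keys is enabled on the connection, which the constructor above does not set):

```python
from pathlib import Path

from src.linters.dry.cache import CodeBlock, DRYCache

cache = DRYCache(Path(".thailint-cache/dry.db"))  # hypothetical cache location

source = Path("example.py")
mtime = source.stat().st_mtime if source.exists() else 0.0

if cache.is_fresh(source, mtime):
    blocks = cache.load(source)  # reuse cached hashes, skip re-tokenizing
else:
    blocks = [CodeBlock(source, 1, 4, "x = 1", 12345)]  # normally from an analyzer
    cache.save(source, mtime, blocks)  # replaces any stale rows for this file

# Query phase: every hash seen 2+ times across all cached files
for dup_hash in cache.get_duplicate_hashes():
    for block in cache.find_duplicates_by_hash(dup_hash):
        print(block.file_path, block.start_line, block.end_line)

cache.close()
```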