thailint 0.1.5__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. src/__init__.py +7 -2
  2. src/analyzers/__init__.py +23 -0
  3. src/analyzers/typescript_base.py +148 -0
  4. src/api.py +1 -1
  5. src/cli.py +1111 -144
  6. src/config.py +12 -33
  7. src/core/base.py +102 -5
  8. src/core/cli_utils.py +206 -0
  9. src/core/config_parser.py +126 -0
  10. src/core/linter_utils.py +168 -0
  11. src/core/registry.py +17 -92
  12. src/core/rule_discovery.py +132 -0
  13. src/core/violation_builder.py +122 -0
  14. src/linter_config/ignore.py +112 -40
  15. src/linter_config/loader.py +3 -13
  16. src/linters/dry/__init__.py +23 -0
  17. src/linters/dry/base_token_analyzer.py +76 -0
  18. src/linters/dry/block_filter.py +265 -0
  19. src/linters/dry/block_grouper.py +59 -0
  20. src/linters/dry/cache.py +172 -0
  21. src/linters/dry/cache_query.py +61 -0
  22. src/linters/dry/config.py +134 -0
  23. src/linters/dry/config_loader.py +44 -0
  24. src/linters/dry/deduplicator.py +120 -0
  25. src/linters/dry/duplicate_storage.py +63 -0
  26. src/linters/dry/file_analyzer.py +90 -0
  27. src/linters/dry/inline_ignore.py +140 -0
  28. src/linters/dry/linter.py +163 -0
  29. src/linters/dry/python_analyzer.py +668 -0
  30. src/linters/dry/storage_initializer.py +42 -0
  31. src/linters/dry/token_hasher.py +169 -0
  32. src/linters/dry/typescript_analyzer.py +592 -0
  33. src/linters/dry/violation_builder.py +74 -0
  34. src/linters/dry/violation_filter.py +94 -0
  35. src/linters/dry/violation_generator.py +174 -0
  36. src/linters/file_header/__init__.py +24 -0
  37. src/linters/file_header/atemporal_detector.py +87 -0
  38. src/linters/file_header/config.py +66 -0
  39. src/linters/file_header/field_validator.py +69 -0
  40. src/linters/file_header/linter.py +313 -0
  41. src/linters/file_header/python_parser.py +86 -0
  42. src/linters/file_header/violation_builder.py +78 -0
  43. src/linters/file_placement/config_loader.py +86 -0
  44. src/linters/file_placement/directory_matcher.py +80 -0
  45. src/linters/file_placement/linter.py +262 -471
  46. src/linters/file_placement/path_resolver.py +61 -0
  47. src/linters/file_placement/pattern_matcher.py +55 -0
  48. src/linters/file_placement/pattern_validator.py +106 -0
  49. src/linters/file_placement/rule_checker.py +229 -0
  50. src/linters/file_placement/violation_factory.py +177 -0
  51. src/linters/magic_numbers/__init__.py +48 -0
  52. src/linters/magic_numbers/config.py +82 -0
  53. src/linters/magic_numbers/context_analyzer.py +247 -0
  54. src/linters/magic_numbers/linter.py +516 -0
  55. src/linters/magic_numbers/python_analyzer.py +76 -0
  56. src/linters/magic_numbers/typescript_analyzer.py +218 -0
  57. src/linters/magic_numbers/violation_builder.py +98 -0
  58. src/linters/nesting/__init__.py +6 -2
  59. src/linters/nesting/config.py +17 -4
  60. src/linters/nesting/linter.py +81 -168
  61. src/linters/nesting/typescript_analyzer.py +39 -102
  62. src/linters/nesting/typescript_function_extractor.py +130 -0
  63. src/linters/nesting/violation_builder.py +139 -0
  64. src/linters/print_statements/__init__.py +53 -0
  65. src/linters/print_statements/config.py +83 -0
  66. src/linters/print_statements/linter.py +430 -0
  67. src/linters/print_statements/python_analyzer.py +155 -0
  68. src/linters/print_statements/typescript_analyzer.py +135 -0
  69. src/linters/print_statements/violation_builder.py +98 -0
  70. src/linters/srp/__init__.py +99 -0
  71. src/linters/srp/class_analyzer.py +113 -0
  72. src/linters/srp/config.py +82 -0
  73. src/linters/srp/heuristics.py +89 -0
  74. src/linters/srp/linter.py +234 -0
  75. src/linters/srp/metrics_evaluator.py +47 -0
  76. src/linters/srp/python_analyzer.py +72 -0
  77. src/linters/srp/typescript_analyzer.py +75 -0
  78. src/linters/srp/typescript_metrics_calculator.py +90 -0
  79. src/linters/srp/violation_builder.py +117 -0
  80. src/orchestrator/core.py +54 -9
  81. src/templates/thailint_config_template.yaml +158 -0
  82. src/utils/__init__.py +4 -0
  83. src/utils/project_root.py +203 -0
  84. thailint-0.5.0.dist-info/METADATA +1286 -0
  85. thailint-0.5.0.dist-info/RECORD +96 -0
  86. {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info}/WHEEL +1 -1
  87. src/.ai/layout.yaml +0 -48
  88. thailint-0.1.5.dist-info/METADATA +0 -629
  89. thailint-0.1.5.dist-info/RECORD +0 -28
  90. {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info}/entry_points.txt +0 -0
  91. {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info/licenses}/LICENSE +0 -0
src/linters/dry/block_grouper.py
@@ -0,0 +1,59 @@
+ """
+ Purpose: Block grouping utilities for duplicate detection
+
+ Scope: Groups code blocks by file path
+
+ Overview: Provides grouping utilities for organizing code blocks by file. Used by ViolationDeduplicator
+ to process blocks on a per-file basis for overlap detection. Separates grouping logic to maintain
+ SRP compliance.
+
+ Dependencies: CodeBlock, Violation
+
+ Exports: BlockGrouper class
+
+ Interfaces: BlockGrouper.group_blocks_by_file(blocks), group_violations_by_file(violations)
+
+ Implementation: Simple dictionary-based grouping by file path
+ """
+
+ from pathlib import Path
+
+ from src.core.types import Violation
+
+ from .cache import CodeBlock
+
+
+ class BlockGrouper:
+     """Groups blocks and violations by file path."""
+
+     def group_blocks_by_file(self, blocks: list[CodeBlock]) -> dict[Path, list[CodeBlock]]:
+         """Group blocks by file path.
+
+         Args:
+             blocks: List of code blocks
+
+         Returns:
+             Dictionary mapping file paths to lists of blocks
+         """
+         grouped: dict[Path, list[CodeBlock]] = {}
+         for block in blocks:
+             if block.file_path not in grouped:
+                 grouped[block.file_path] = []
+             grouped[block.file_path].append(block)
+         return grouped
+
+     def group_violations_by_file(self, violations: list[Violation]) -> dict[str, list[Violation]]:
+         """Group violations by file path.
+
+         Args:
+             violations: List of violations
+
+         Returns:
+             Dictionary mapping file paths to lists of violations
+         """
+         grouped: dict[str, list[Violation]] = {}
+         for violation in violations:
+             if violation.file_path not in grouped:
+                 grouped[violation.file_path] = []
+             grouped[violation.file_path].append(violation)
+         return grouped
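To make the grouping concrete, here is a minimal usage sketch. It assumes the CodeBlock dataclass from cache.py (next hunk) and the wheel's src.* import layout; the sample paths and hash values are invented.

```python
# Hypothetical usage of BlockGrouper; sample blocks are invented.
from pathlib import Path

from src.linters.dry.block_grouper import BlockGrouper
from src.linters.dry.cache import CodeBlock

blocks = [
    CodeBlock(Path("a.py"), start_line=1, end_line=3, snippet="x = 1", hash_value=111),
    CodeBlock(Path("b.py"), start_line=5, end_line=7, snippet="x = 1", hash_value=111),
    CodeBlock(Path("a.py"), start_line=9, end_line=11, snippet="y = 2", hash_value=222),
]

grouped = BlockGrouper().group_blocks_by_file(blocks)
assert len(grouped[Path("a.py")]) == 2  # both a.py blocks land in one bucket
assert len(grouped[Path("b.py")]) == 1
```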
src/linters/dry/cache.py
@@ -0,0 +1,172 @@
+ """
+ Purpose: SQLite storage manager for DRY linter duplicate detection
+
+ Scope: Code block storage and duplicate detection queries
+
+ Overview: Implements in-memory or temporary-file SQLite storage for duplicate code detection.
+ Stores code blocks with hash values, file locations, and metadata during a single linter run.
+ Supports both :memory: mode (fast, RAM-only) and tempfile mode (disk-backed for large projects).
+ No persistence between runs - storage is cleared when linter completes. Includes indexes for
+ fast hash lookups enabling cross-file duplicate detection with minimal overhead.
+
+ Dependencies: Python sqlite3 module (stdlib), tempfile module (stdlib), pathlib.Path, dataclasses
+
+ Exports: CodeBlock dataclass, DRYCache class
+
+ Interfaces: DRYCache.__init__(storage_mode), add_blocks(file_path, blocks),
+ find_duplicates_by_hash(hash_value), get_duplicate_hashes(), close()
+
+ Implementation: SQLite with two tables (files, code_blocks), indexed on hash_value for performance,
+ storage_mode determines :memory: vs tempfile location, ACID transactions for reliability
+ """
+
+ import sqlite3
+ import tempfile
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from .cache_query import CacheQueryService
+
+
+ @dataclass
+ class CodeBlock:
+     """Represents a code block location with hash."""
+
+     file_path: Path
+     start_line: int
+     end_line: int
+     snippet: str
+     hash_value: int
+
+
+ class DRYCache:
+     """SQLite-backed storage for duplicate detection."""
+
+     SCHEMA_VERSION = 1
+
+     def __init__(self, storage_mode: str = "memory") -> None:
+         """Initialize storage with SQLite database.
+
+         Args:
+             storage_mode: Storage mode - "memory" (default) or "tempfile"
+         """
+         self._storage_mode = storage_mode
+         self._tempfile = None
+
+         # Create SQLite connection based on storage mode
+         if storage_mode == "memory":
+             self.db = sqlite3.connect(":memory:")
+         elif storage_mode == "tempfile":
+             # Create temporary file that auto-deletes on close
+             # pylint: disable=consider-using-with
+             # Justification: tempfile must remain open for SQLite connection lifetime.
+             # It is explicitly closed in close() method when cache is finalized.
+             self._tempfile = tempfile.NamedTemporaryFile(suffix=".db", delete=True)
+             self.db = sqlite3.connect(self._tempfile.name)
+         else:
+             raise ValueError(f"Invalid storage_mode: {storage_mode}")
+
+         self._query_service = CacheQueryService()
+
+         # Create schema
+         self.db.execute(
+             """CREATE TABLE IF NOT EXISTS files (
+                 file_path TEXT PRIMARY KEY,
+                 mtime REAL NOT NULL,
+                 hash_count INTEGER,
+                 last_scanned TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+             )"""
+         )
+
+         self.db.execute(
+             """CREATE TABLE IF NOT EXISTS code_blocks (
+                 id INTEGER PRIMARY KEY AUTOINCREMENT,
+                 file_path TEXT NOT NULL,
+                 hash_value INTEGER NOT NULL,
+                 start_line INTEGER NOT NULL,
+                 end_line INTEGER NOT NULL,
+                 snippet TEXT NOT NULL,
+                 FOREIGN KEY (file_path) REFERENCES files(file_path) ON DELETE CASCADE
+             )"""
+         )
+
+         self.db.execute("CREATE INDEX IF NOT EXISTS idx_hash_value ON code_blocks(hash_value)")
+         self.db.execute("CREATE INDEX IF NOT EXISTS idx_file_path ON code_blocks(file_path)")
+
+         self.db.commit()
+
+     def add_blocks(self, file_path: Path, blocks: list[CodeBlock]) -> None:
+         """Add code blocks to storage.
+
+         Args:
+             file_path: Path to source file
+             blocks: List of CodeBlock instances to store
+         """
+         if not blocks:
+             return
+
+         # Insert file metadata
+         try:
+             mtime = file_path.stat().st_mtime
+         except OSError:
+             mtime = 0.0  # File doesn't exist, use placeholder
+
+         self.db.execute(
+             "INSERT OR REPLACE INTO files (file_path, mtime, hash_count) VALUES (?, ?, ?)",
+             (str(file_path), mtime, len(blocks)),
+         )
+
+         # Insert code blocks
+         for block in blocks:
+             self.db.execute(
+                 """INSERT INTO code_blocks
+                 (file_path, hash_value, start_line, end_line, snippet)
+                 VALUES (?, ?, ?, ?, ?)""",
+                 (
+                     str(file_path),
+                     block.hash_value,
+                     block.start_line,
+                     block.end_line,
+                     block.snippet,
+                 ),
+             )
+
+         self.db.commit()
+
+     def find_duplicates_by_hash(self, hash_value: int) -> list[CodeBlock]:
+         """Find all code blocks with the given hash value.
+
+         Args:
+             hash_value: Hash value to search for
+
+         Returns:
+             List of ALL CodeBlock instances with this hash (from all files)
+         """
+         rows = self._query_service.find_blocks_by_hash(self.db, hash_value)
+
+         blocks = []
+         for file_path_str, start, end, snippet, hash_val in rows:
+             block = CodeBlock(
+                 file_path=Path(file_path_str),
+                 start_line=start,
+                 end_line=end,
+                 snippet=snippet,
+                 hash_value=hash_val,
+             )
+             blocks.append(block)
+
+         return blocks
+
+     def get_duplicate_hashes(self) -> list[int]:
+         """Get all hash values that appear 2+ times.
+
+         Returns:
+             List of hash values with 2 or more occurrences
+         """
+         return self._query_service.get_duplicate_hashes(self.db)
+
+     def close(self) -> None:
+         """Close database connection and cleanup tempfile if used."""
+         self.db.close()
+         if self._tempfile:
+             self._tempfile.close()
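A usage sketch for the cache, again assuming the src.* layout; the paths and hash values are invented. Two blocks in different files that share a hash value surface as one cross-file duplicate:

```python
# Hypothetical DRYCache round trip; "a.py"/"b.py" and hash 42 are invented.
from pathlib import Path

from src.linters.dry.cache import CodeBlock, DRYCache

cache = DRYCache(storage_mode="memory")  # "tempfile" would back this with a temp .db file

cache.add_blocks(Path("a.py"), [CodeBlock(Path("a.py"), 1, 4, "total += n", 42)])
cache.add_blocks(Path("b.py"), [CodeBlock(Path("b.py"), 10, 13, "total += n", 42)])

for h in cache.get_duplicate_hashes():            # -> [42]
    for block in cache.find_duplicates_by_hash(h):
        print(block.file_path, block.start_line, block.end_line)

cache.close()  # closes the connection and deletes the tempfile, if one was used
```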
src/linters/dry/cache_query.py
@@ -0,0 +1,61 @@
+ """
+ Purpose: Query service for DRY cache database
+
+ Scope: Handles SQL queries for duplicate hash detection
+
+ Overview: Provides query methods for finding duplicate code blocks in the SQLite cache. Extracts
+ query logic from DRYCache to maintain SRP compliance. Handles queries for duplicate hashes
+ and blocks by hash value.
+
+ Dependencies: sqlite3.Connection
+
+ Exports: CacheQueryService class
+
+ Interfaces: CacheQueryService.get_duplicate_hashes(db), find_blocks_by_hash(db, hash_value)
+
+ Implementation: SQL queries for duplicate detection, returns hash values and block data
+ """
+
+ import sqlite3
+
+
+ class CacheQueryService:
+     """Handles cache database queries."""
+
+     def get_duplicate_hashes(self, db: sqlite3.Connection) -> list[int]:
+         """Get all hash values that appear 2+ times.
+
+         Args:
+             db: Database connection
+
+         Returns:
+             List of hash values with 2 or more occurrences
+         """
+         cursor = db.execute(
+             """SELECT hash_value
+             FROM code_blocks
+             GROUP BY hash_value
+             HAVING COUNT(*) >= 2"""
+         )
+
+         return [row[0] for row in cursor]
+
+     def find_blocks_by_hash(self, db: sqlite3.Connection, hash_value: int) -> list[tuple]:
+         """Find all blocks with given hash value.
+
+         Args:
+             db: Database connection
+             hash_value: Hash to search for
+
+         Returns:
+             List of tuples (file_path, start_line, end_line, snippet, hash_value)
+         """
+         cursor = db.execute(
+             """SELECT file_path, start_line, end_line, snippet, hash_value
+             FROM code_blocks
+             WHERE hash_value = ?
+             ORDER BY file_path, start_line""",
+             (hash_value,),
+         )
+
+         return cursor.fetchall()
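The service can also be exercised against a bare connection. This sketch hand-builds a code_blocks table that mirrors the schema created in cache.py; the rows are invented:

```python
# Hypothetical direct use of CacheQueryService against a throwaway database.
import sqlite3

from src.linters.dry.cache_query import CacheQueryService

db = sqlite3.connect(":memory:")
db.execute(
    """CREATE TABLE code_blocks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_path TEXT, hash_value INTEGER,
        start_line INTEGER, end_line INTEGER, snippet TEXT
    )"""
)
db.executemany(
    "INSERT INTO code_blocks (file_path, hash_value, start_line, end_line, snippet) "
    "VALUES (?, ?, ?, ?, ?)",
    [("a.py", 7, 1, 3, "x"), ("b.py", 7, 5, 7, "x"), ("c.py", 8, 1, 3, "y")],
)

service = CacheQueryService()
assert service.get_duplicate_hashes(db) == [7]       # hash 8 appears only once
assert len(service.find_blocks_by_hash(db, 7)) == 2  # one tuple per matching block
```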
src/linters/dry/config.py
@@ -0,0 +1,134 @@
+ """
+ Purpose: Configuration schema for DRY linter with caching support
+
+ Scope: DRYConfig dataclass with validation, defaults, and loading from dictionary
+
+ Overview: Defines configuration structure for the DRY linter including duplicate detection thresholds,
+ caching settings, and ignore patterns. Provides validation of configuration values to ensure
+ sensible defaults and prevent misconfiguration. Supports loading from YAML configuration files
+ through from_dict classmethod. Cache enabled by default for performance on large codebases.
+
+ Dependencies: Python dataclasses module
+
+ Exports: DRYConfig dataclass
+
+ Interfaces: DRYConfig.__init__, DRYConfig.from_dict(config: dict) -> DRYConfig
+
+ Implementation: Dataclass with field defaults, __post_init__ validation, and dict-based construction
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ # Default configuration constants
+ DEFAULT_MIN_DUPLICATE_LINES = 3
+ DEFAULT_MIN_DUPLICATE_TOKENS = 30
+
+
+ @dataclass
+ class DRYConfig:  # pylint: disable=too-many-instance-attributes
+     """Configuration for DRY linter.
+
+     Note: Pylint too-many-instance-attributes disabled. This is a configuration
+     dataclass serving as a data container for related DRY linter settings.
+     All attributes are cohesively related (detection thresholds, language
+     overrides, storage mode, filtering). Splitting would reduce cohesion and make
+     configuration loading more complex without meaningful benefit.
+     """
+
+     enabled: bool = False  # Must be explicitly enabled
+     min_duplicate_lines: int = DEFAULT_MIN_DUPLICATE_LINES
+     min_duplicate_tokens: int = DEFAULT_MIN_DUPLICATE_TOKENS
+     min_occurrences: int = 2  # Minimum occurrences to report (default: 2)
+
+     # Language-specific overrides
+     python_min_occurrences: int | None = None
+     typescript_min_occurrences: int | None = None
+     javascript_min_occurrences: int | None = None
+
+     # Storage settings
+     storage_mode: str = "memory"  # Options: "memory" (default) or "tempfile"
+
+     # Ignore patterns
+     ignore_patterns: list[str] = field(default_factory=lambda: ["tests/", "__init__.py"])
+
+     # Block filters (extensible false positive filtering)
+     filters: dict[str, bool] = field(
+         default_factory=lambda: {
+             "keyword_argument_filter": True,  # Filter keyword argument blocks
+             "import_group_filter": True,  # Filter import statement groups
+         }
+     )
+
+     def __post_init__(self) -> None:
+         """Validate configuration values."""
+         if self.min_duplicate_lines <= 0:
+             raise ValueError(
+                 f"min_duplicate_lines must be positive, got {self.min_duplicate_lines}"
+             )
+         if self.min_duplicate_tokens <= 0:
+             raise ValueError(
+                 f"min_duplicate_tokens must be positive, got {self.min_duplicate_tokens}"
+             )
+         if self.min_occurrences <= 0:
+             raise ValueError(f"min_occurrences must be positive, got {self.min_occurrences}")
+         if self.storage_mode not in ("memory", "tempfile"):
+             raise ValueError(
+                 f"storage_mode must be 'memory' or 'tempfile', got '{self.storage_mode}'"
+             )
+
+     def get_min_occurrences_for_language(self, language: str) -> int:
+         """Get minimum occurrences threshold for a specific language.
+
+         Args:
+             language: Language identifier (e.g., "python", "typescript", "javascript")
+
+         Returns:
+             Minimum occurrences threshold for the language, or global default
+         """
+         language_lower = language.lower()
+
+         language_overrides = {
+             "python": self.python_min_occurrences,
+             "typescript": self.typescript_min_occurrences,
+             "javascript": self.javascript_min_occurrences,
+         }
+
+         override = language_overrides.get(language_lower)
+         return override if override is not None else self.min_occurrences
+
+     @classmethod
+     def from_dict(cls, config: dict[str, Any]) -> "DRYConfig":
+         """Load configuration from dictionary.
+
+         Args:
+             config: Dictionary containing configuration values
+
+         Returns:
+             DRYConfig instance with values from dictionary
+         """
+         # Extract language-specific min_occurrences
+         python_config = config.get("python", {})
+         typescript_config = config.get("typescript", {})
+         javascript_config = config.get("javascript", {})
+
+         # Load filter configuration (merge with defaults)
+         default_filters = {
+             "keyword_argument_filter": True,
+             "import_group_filter": True,
+         }
+         custom_filters = config.get("filters", {})
+         filters = {**default_filters, **custom_filters}
+
+         return cls(
+             enabled=config.get("enabled", False),
+             min_duplicate_lines=config.get("min_duplicate_lines", DEFAULT_MIN_DUPLICATE_LINES),
+             min_duplicate_tokens=config.get("min_duplicate_tokens", DEFAULT_MIN_DUPLICATE_TOKENS),
+             min_occurrences=config.get("min_occurrences", 2),
+             python_min_occurrences=python_config.get("min_occurrences"),
+             typescript_min_occurrences=typescript_config.get("min_occurrences"),
+             javascript_min_occurrences=javascript_config.get("min_occurrences"),
+             storage_mode=config.get("storage_mode", "memory"),
+             ignore_patterns=config.get("ignore", []),
+             filters=filters,
+         )
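A sketch of the loading path, assuming a mapping already parsed from a YAML config file; the specific threshold values are invented:

```python
# Hypothetical DRYConfig loading; the values below are invented.
from src.linters.dry.config import DRYConfig

config = DRYConfig.from_dict(
    {
        "enabled": True,
        "min_duplicate_lines": 5,
        "python": {"min_occurrences": 3},           # language-specific override
        "filters": {"import_group_filter": False},  # merged over the defaults
    }
)

assert config.get_min_occurrences_for_language("python") == 3      # override wins
assert config.get_min_occurrences_for_language("typescript") == 2  # global default
assert config.filters == {"keyword_argument_filter": True, "import_group_filter": False}
```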
src/linters/dry/config_loader.py
@@ -0,0 +1,44 @@
+ """
+ Purpose: Configuration loading from lint context metadata
+
+ Scope: Extracts and validates DRY configuration from context
+
+ Overview: Handles extraction of DRY configuration from BaseLintContext metadata dictionary.
+ Validates configuration structure and converts to DRYConfig instance. Separates config
+ loading logic from main linter rule to maintain SRP compliance.
+
+ Dependencies: BaseLintContext, DRYConfig
+
+ Exports: ConfigLoader class
+
+ Interfaces: ConfigLoader.load_config(context) -> DRYConfig
+
+ Implementation: Extracts from context metadata, validates dict structure, uses DRYConfig.from_dict()
+ """
+
+ from src.core.base import BaseLintContext
+
+ from .config import DRYConfig
+
+
+ class ConfigLoader:
+     """Loads DRY configuration from lint context."""
+
+     def load_config(self, context: BaseLintContext) -> DRYConfig:
+         """Load configuration from context metadata.
+
+         Args:
+             context: Lint context containing metadata
+
+         Returns:
+             DRYConfig instance
+         """
+         metadata = getattr(context, "metadata", None)
+         if not isinstance(metadata, dict):
+             return DRYConfig()
+
+         config_dict = metadata.get("dry", {})
+         if not isinstance(config_dict, dict):
+             return DRYConfig()
+
+         return DRYConfig.from_dict(config_dict)
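Because load_config reads metadata through getattr, any object exposing a metadata attribute works at runtime. This sketch substitutes a SimpleNamespace for a real BaseLintContext, whose definition is not part of this diff:

```python
# Illustration only: SimpleNamespace stands in for BaseLintContext here.
from types import SimpleNamespace

from src.linters.dry.config_loader import ConfigLoader

ctx = SimpleNamespace(metadata={"dry": {"enabled": True, "min_occurrences": 3}})
config = ConfigLoader().load_config(ctx)
assert config.enabled and config.min_occurrences == 3

# Missing or malformed metadata falls back to the defaults.
assert ConfigLoader().load_config(SimpleNamespace(metadata=None)).enabled is False
```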
src/linters/dry/deduplicator.py
@@ -0,0 +1,120 @@
+ """
+ Purpose: Deduplication utility for overlapping code block violations
+
+ Scope: Handles filtering of overlapping duplicate code violations
+
+ Overview: Provides utilities to remove overlapping violations from duplicate code detection results.
+ Delegates grouping to BlockGrouper and filtering to ViolationFilter. Handles both block-level
+ deduplication (one block per file) and violation-level deduplication (removing overlaps).
+
+ Dependencies: CodeBlock, Violation, BlockGrouper, ViolationFilter
+
+ Exports: ViolationDeduplicator class
+
+ Interfaces: ViolationDeduplicator.deduplicate_blocks(blocks), deduplicate_violations(violations)
+
+ Implementation: Delegates to BlockGrouper and ViolationFilter for SRP compliance
+ """
+
+ from src.core.types import Violation
+
+ from .block_grouper import BlockGrouper
+ from .cache import CodeBlock
+ from .violation_filter import ViolationFilter
+
+
+ class ViolationDeduplicator:
+     """Removes overlapping duplicate code violations."""
+
+     def __init__(self) -> None:
+         """Initialize with helper components."""
+         self._grouper = BlockGrouper()
+         self._filter = ViolationFilter()
+
+     def deduplicate_blocks(self, blocks: list[CodeBlock]) -> list[CodeBlock]:
+         """Remove overlapping blocks from same file.
+
+         When rolling hash creates overlapping windows, keep non-overlapping blocks.
+         Blocks are overlapping if they share any line numbers in the same file.
+
+         Args:
+             blocks: List of code blocks (may have overlaps from rolling hash)
+
+         Returns:
+             Deduplicated list of blocks (non-overlapping blocks preserved)
+         """
+         if not blocks:
+             return []
+
+         grouped = self._grouper.group_blocks_by_file(blocks)
+         deduplicated = []
+
+         for file_blocks in grouped.values():
+             kept = self._remove_overlaps_from_file(file_blocks)
+             deduplicated.extend(kept)
+
+         return deduplicated
+
+     def _remove_overlaps_from_file(self, file_blocks: list[CodeBlock]) -> list[CodeBlock]:
+         """Remove overlapping blocks from single file.
+
+         Args:
+             file_blocks: Blocks from same file
+
+         Returns:
+             Non-overlapping blocks
+         """
+         sorted_blocks = sorted(file_blocks, key=lambda b: b.start_line)
+         kept_blocks: list[CodeBlock] = []
+
+         for block in sorted_blocks:
+             if not self._overlaps_any_kept(block, kept_blocks):
+                 kept_blocks.append(block)
+
+         return kept_blocks
+
+     def _overlaps_any_kept(self, block: CodeBlock, kept_blocks: list[CodeBlock]) -> bool:
+         """Check if block overlaps with any kept blocks.
+
+         Args:
+             block: Block to check
+             kept_blocks: Previously kept blocks
+
+         Returns:
+             True if block overlaps with any kept block
+         """
+         return any(self._blocks_overlap(block, kept) for kept in kept_blocks)
+
+     def _blocks_overlap(self, block1: CodeBlock, block2: CodeBlock) -> bool:
+         """Check if two blocks overlap (share any lines).
+
+         Args:
+             block1: First code block
+             block2: Second code block
+
+         Returns:
+             True if blocks overlap
+         """
+         return block1.start_line <= block2.end_line and block2.start_line <= block1.end_line
+
+     def deduplicate_violations(self, violations: list[Violation]) -> list[Violation]:
+         """Remove overlapping violations from same file.
+
+         Args:
+             violations: List of violations (may overlap)
+
+         Returns:
+             Deduplicated list of violations
+         """
+         if not violations:
+             return []
+
+         grouped = self._grouper.group_violations_by_file(violations)
+         deduplicated = []
+
+         for file_violations in grouped.values():
+             sorted_violations = sorted(file_violations, key=lambda v: v.line or 0)
+             kept = self._filter.filter_overlapping(sorted_violations)
+             deduplicated.extend(kept)
+
+         return deduplicated
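A sketch of the overlap rule with invented blocks: two blocks overlap when each starts at or before the other ends, so the second rolling-hash window below is dropped while the disjoint third block survives:

```python
# Hypothetical deduplication of overlapping rolling-hash windows.
from pathlib import Path

from src.linters.dry.cache import CodeBlock
from src.linters.dry.deduplicator import ViolationDeduplicator

blocks = [
    CodeBlock(Path("a.py"), 1, 5, "...", 42),
    CodeBlock(Path("a.py"), 3, 7, "...", 42),    # shares lines 3-5 with the first
    CodeBlock(Path("a.py"), 10, 14, "...", 42),  # disjoint, kept
]

kept = ViolationDeduplicator().deduplicate_blocks(blocks)
assert [(b.start_line, b.end_line) for b in kept] == [(1, 5), (10, 14)]
```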
src/linters/dry/duplicate_storage.py
@@ -0,0 +1,63 @@
+ """
+ Purpose: Storage management for duplicate code blocks in SQLite
+
+ Scope: Manages storage of code blocks in SQLite for duplicate detection
+
+ Overview: Provides storage interface for code blocks using SQLite (in-memory or tempfile mode).
+ Handles block insertion and duplicate hash queries. Delegates all storage operations to
+ DRYCache SQLite layer. Separates storage concerns from linting logic to maintain SRP compliance.
+
+ Dependencies: DRYCache, CodeBlock, Path
+
+ Exports: DuplicateStorage class
+
+ Interfaces: DuplicateStorage.add_blocks(file_path, blocks), get_duplicate_hashes(),
+ get_blocks_for_hash(hash_value)
+
+ Implementation: Delegates to SQLite cache for all storage operations
+ """
+
+ from pathlib import Path
+
+ from .cache import CodeBlock, DRYCache
+
+
+ class DuplicateStorage:
+     """Manages storage of code blocks in SQLite."""
+
+     def __init__(self, cache: DRYCache) -> None:
+         """Initialize storage with SQLite cache.
+
+         Args:
+             cache: SQLite cache instance (in-memory or tempfile mode)
+         """
+         self._cache = cache
+
+     def add_blocks(self, file_path: Path, blocks: list[CodeBlock]) -> None:
+         """Add code blocks to SQLite storage.
+
+         Args:
+             file_path: Path to source file
+             blocks: List of code blocks to store
+         """
+         if blocks:
+             self._cache.add_blocks(file_path, blocks)
+
+     def get_duplicate_hashes(self) -> list[int]:
+         """Get all hash values with 2+ occurrences from SQLite.
+
+         Returns:
+             List of hash values that appear in multiple blocks
+         """
+         return self._cache.get_duplicate_hashes()
+
+     def get_blocks_for_hash(self, hash_value: int) -> list[CodeBlock]:
+         """Get all blocks with given hash value from SQLite.
+
+         Args:
+             hash_value: Hash to search for
+
+         Returns:
+             List of code blocks with this hash
+         """
+         return self._cache.find_duplicates_by_hash(hash_value)
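An end-to-end sketch of the storage flow through this facade, assuming the src.* layout; paths and hashes are invented:

```python
# Hypothetical DuplicateStorage flow over an in-memory DRYCache.
from pathlib import Path

from src.linters.dry.cache import CodeBlock, DRYCache
from src.linters.dry.duplicate_storage import DuplicateStorage

cache = DRYCache(storage_mode="memory")
storage = DuplicateStorage(cache)

storage.add_blocks(Path("a.py"), [CodeBlock(Path("a.py"), 1, 4, "x + y", 99)])
storage.add_blocks(Path("b.py"), [CodeBlock(Path("b.py"), 8, 11, "x + y", 99)])

for h in storage.get_duplicate_hashes():     # -> [99]
    blocks = storage.get_blocks_for_hash(h)  # both CodeBlock instances
    print(h, [str(b.file_path) for b in blocks])

cache.close()  # the facade exposes no close(); the underlying cache owns the connection
```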