thailint 0.1.5-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- src/__init__.py +7 -2
- src/analyzers/__init__.py +23 -0
- src/analyzers/typescript_base.py +148 -0
- src/api.py +1 -1
- src/cli.py +1111 -144
- src/config.py +12 -33
- src/core/base.py +102 -5
- src/core/cli_utils.py +206 -0
- src/core/config_parser.py +126 -0
- src/core/linter_utils.py +168 -0
- src/core/registry.py +17 -92
- src/core/rule_discovery.py +132 -0
- src/core/violation_builder.py +122 -0
- src/linter_config/ignore.py +112 -40
- src/linter_config/loader.py +3 -13
- src/linters/dry/__init__.py +23 -0
- src/linters/dry/base_token_analyzer.py +76 -0
- src/linters/dry/block_filter.py +265 -0
- src/linters/dry/block_grouper.py +59 -0
- src/linters/dry/cache.py +172 -0
- src/linters/dry/cache_query.py +61 -0
- src/linters/dry/config.py +134 -0
- src/linters/dry/config_loader.py +44 -0
- src/linters/dry/deduplicator.py +120 -0
- src/linters/dry/duplicate_storage.py +63 -0
- src/linters/dry/file_analyzer.py +90 -0
- src/linters/dry/inline_ignore.py +140 -0
- src/linters/dry/linter.py +163 -0
- src/linters/dry/python_analyzer.py +668 -0
- src/linters/dry/storage_initializer.py +42 -0
- src/linters/dry/token_hasher.py +169 -0
- src/linters/dry/typescript_analyzer.py +592 -0
- src/linters/dry/violation_builder.py +74 -0
- src/linters/dry/violation_filter.py +94 -0
- src/linters/dry/violation_generator.py +174 -0
- src/linters/file_header/__init__.py +24 -0
- src/linters/file_header/atemporal_detector.py +87 -0
- src/linters/file_header/config.py +66 -0
- src/linters/file_header/field_validator.py +69 -0
- src/linters/file_header/linter.py +313 -0
- src/linters/file_header/python_parser.py +86 -0
- src/linters/file_header/violation_builder.py +78 -0
- src/linters/file_placement/config_loader.py +86 -0
- src/linters/file_placement/directory_matcher.py +80 -0
- src/linters/file_placement/linter.py +262 -471
- src/linters/file_placement/path_resolver.py +61 -0
- src/linters/file_placement/pattern_matcher.py +55 -0
- src/linters/file_placement/pattern_validator.py +106 -0
- src/linters/file_placement/rule_checker.py +229 -0
- src/linters/file_placement/violation_factory.py +177 -0
- src/linters/magic_numbers/__init__.py +48 -0
- src/linters/magic_numbers/config.py +82 -0
- src/linters/magic_numbers/context_analyzer.py +247 -0
- src/linters/magic_numbers/linter.py +516 -0
- src/linters/magic_numbers/python_analyzer.py +76 -0
- src/linters/magic_numbers/typescript_analyzer.py +218 -0
- src/linters/magic_numbers/violation_builder.py +98 -0
- src/linters/nesting/__init__.py +6 -2
- src/linters/nesting/config.py +17 -4
- src/linters/nesting/linter.py +81 -168
- src/linters/nesting/typescript_analyzer.py +39 -102
- src/linters/nesting/typescript_function_extractor.py +130 -0
- src/linters/nesting/violation_builder.py +139 -0
- src/linters/print_statements/__init__.py +53 -0
- src/linters/print_statements/config.py +83 -0
- src/linters/print_statements/linter.py +430 -0
- src/linters/print_statements/python_analyzer.py +155 -0
- src/linters/print_statements/typescript_analyzer.py +135 -0
- src/linters/print_statements/violation_builder.py +98 -0
- src/linters/srp/__init__.py +99 -0
- src/linters/srp/class_analyzer.py +113 -0
- src/linters/srp/config.py +82 -0
- src/linters/srp/heuristics.py +89 -0
- src/linters/srp/linter.py +234 -0
- src/linters/srp/metrics_evaluator.py +47 -0
- src/linters/srp/python_analyzer.py +72 -0
- src/linters/srp/typescript_analyzer.py +75 -0
- src/linters/srp/typescript_metrics_calculator.py +90 -0
- src/linters/srp/violation_builder.py +117 -0
- src/orchestrator/core.py +54 -9
- src/templates/thailint_config_template.yaml +158 -0
- src/utils/__init__.py +4 -0
- src/utils/project_root.py +203 -0
- thailint-0.5.0.dist-info/METADATA +1286 -0
- thailint-0.5.0.dist-info/RECORD +96 -0
- {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info}/WHEEL +1 -1
- src/.ai/layout.yaml +0 -48
- thailint-0.1.5.dist-info/METADATA +0 -629
- thailint-0.1.5.dist-info/RECORD +0 -28
- {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info}/entry_points.txt +0 -0
- {thailint-0.1.5.dist-info → thailint-0.5.0.dist-info/licenses}/LICENSE +0 -0
src/linters/dry/block_grouper.py
ADDED
@@ -0,0 +1,59 @@
+"""
+Purpose: Block grouping utilities for duplicate detection
+
+Scope: Groups code blocks by file path
+
+Overview: Provides grouping utilities for organizing code blocks by file. Used by ViolationDeduplicator
+    to process blocks on a per-file basis for overlap detection. Separates grouping logic to maintain
+    SRP compliance.
+
+Dependencies: CodeBlock, Violation
+
+Exports: BlockGrouper class
+
+Interfaces: BlockGrouper.group_blocks_by_file(blocks), group_violations_by_file(violations)
+
+Implementation: Simple dictionary-based grouping by file path
+"""
+
+from pathlib import Path
+
+from src.core.types import Violation
+
+from .cache import CodeBlock
+
+
+class BlockGrouper:
+    """Groups blocks and violations by file path."""
+
+    def group_blocks_by_file(self, blocks: list[CodeBlock]) -> dict[Path, list[CodeBlock]]:
+        """Group blocks by file path.
+
+        Args:
+            blocks: List of code blocks
+
+        Returns:
+            Dictionary mapping file paths to lists of blocks
+        """
+        grouped: dict[Path, list[CodeBlock]] = {}
+        for block in blocks:
+            if block.file_path not in grouped:
+                grouped[block.file_path] = []
+            grouped[block.file_path].append(block)
+        return grouped
+
+    def group_violations_by_file(self, violations: list[Violation]) -> dict[str, list[Violation]]:
+        """Group violations by file path.
+
+        Args:
+            violations: List of violations
+
+        Returns:
+            Dictionary mapping file paths to lists of violations
+        """
+        grouped: dict[str, list[Violation]] = {}
+        for violation in violations:
+            if violation.file_path not in grouped:
+                grouped[violation.file_path] = []
+            grouped[violation.file_path].append(violation)
+        return grouped
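
A minimal usage sketch of the BlockGrouper API above (the paths, line numbers, and hash values are invented for illustration):

    from pathlib import Path
    from src.linters.dry.block_grouper import BlockGrouper
    from src.linters.dry.cache import CodeBlock

    blocks = [
        CodeBlock(Path("a.py"), start_line=1, end_line=5, snippet="...", hash_value=42),
        CodeBlock(Path("b.py"), start_line=3, end_line=7, snippet="...", hash_value=42),
        CodeBlock(Path("a.py"), start_line=10, end_line=14, snippet="...", hash_value=99),
    ]
    grouped = BlockGrouper().group_blocks_by_file(blocks)
    # grouped == {Path("a.py"): [blocks[0], blocks[2]], Path("b.py"): [blocks[1]]}
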
src/linters/dry/cache.py
ADDED
@@ -0,0 +1,172 @@
+"""
+Purpose: SQLite storage manager for DRY linter duplicate detection
+
+Scope: Code block storage and duplicate detection queries
+
+Overview: Implements in-memory or temporary-file SQLite storage for duplicate code detection.
+    Stores code blocks with hash values, file locations, and metadata during a single linter run.
+    Supports both :memory: mode (fast, RAM-only) and tempfile mode (disk-backed for large projects).
+    No persistence between runs - storage is cleared when linter completes. Includes indexes for
+    fast hash lookups enabling cross-file duplicate detection with minimal overhead.
+
+Dependencies: Python sqlite3 module (stdlib), tempfile module (stdlib), pathlib.Path, dataclasses
+
+Exports: CodeBlock dataclass, DRYCache class
+
+Interfaces: DRYCache.__init__(storage_mode), add_blocks(file_path, blocks),
+    find_duplicates_by_hash(hash_value), get_duplicate_hashes(), close()
+
+Implementation: SQLite with two tables (files, code_blocks), indexed on hash_value for performance,
+    storage_mode determines :memory: vs tempfile location, ACID transactions for reliability
+"""
+
+import sqlite3
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+from .cache_query import CacheQueryService
+
+
+@dataclass
+class CodeBlock:
+    """Represents a code block location with hash."""
+
+    file_path: Path
+    start_line: int
+    end_line: int
+    snippet: str
+    hash_value: int
+
+
+class DRYCache:
+    """SQLite-backed storage for duplicate detection."""
+
+    SCHEMA_VERSION = 1
+
+    def __init__(self, storage_mode: str = "memory") -> None:
+        """Initialize storage with SQLite database.
+
+        Args:
+            storage_mode: Storage mode - "memory" (default) or "tempfile"
+        """
+        self._storage_mode = storage_mode
+        self._tempfile = None
+
+        # Create SQLite connection based on storage mode
+        if storage_mode == "memory":
+            self.db = sqlite3.connect(":memory:")
+        elif storage_mode == "tempfile":
+            # Create temporary file that auto-deletes on close
+            # pylint: disable=consider-using-with
+            # Justification: tempfile must remain open for SQLite connection lifetime.
+            # It is explicitly closed in close() method when cache is finalized.
+            self._tempfile = tempfile.NamedTemporaryFile(suffix=".db", delete=True)
+            self.db = sqlite3.connect(self._tempfile.name)
+        else:
+            raise ValueError(f"Invalid storage_mode: {storage_mode}")
+
+        self._query_service = CacheQueryService()
+
+        # Create schema
+        self.db.execute(
+            """CREATE TABLE IF NOT EXISTS files (
+                file_path TEXT PRIMARY KEY,
+                mtime REAL NOT NULL,
+                hash_count INTEGER,
+                last_scanned TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )"""
+        )
+
+        self.db.execute(
+            """CREATE TABLE IF NOT EXISTS code_blocks (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                file_path TEXT NOT NULL,
+                hash_value INTEGER NOT NULL,
+                start_line INTEGER NOT NULL,
+                end_line INTEGER NOT NULL,
+                snippet TEXT NOT NULL,
+                FOREIGN KEY (file_path) REFERENCES files(file_path) ON DELETE CASCADE
+            )"""
+        )
+
+        self.db.execute("CREATE INDEX IF NOT EXISTS idx_hash_value ON code_blocks(hash_value)")
+        self.db.execute("CREATE INDEX IF NOT EXISTS idx_file_path ON code_blocks(file_path)")
+
+        self.db.commit()
+
+    def add_blocks(self, file_path: Path, blocks: list[CodeBlock]) -> None:
+        """Add code blocks to storage.
+
+        Args:
+            file_path: Path to source file
+            blocks: List of CodeBlock instances to store
+        """
+        if not blocks:
+            return
+
+        # Insert file metadata
+        try:
+            mtime = file_path.stat().st_mtime
+        except OSError:
+            mtime = 0.0  # File doesn't exist, use placeholder
+
+        self.db.execute(
+            "INSERT OR REPLACE INTO files (file_path, mtime, hash_count) VALUES (?, ?, ?)",
+            (str(file_path), mtime, len(blocks)),
+        )
+
+        # Insert code blocks
+        for block in blocks:
+            self.db.execute(
+                """INSERT INTO code_blocks
+                (file_path, hash_value, start_line, end_line, snippet)
+                VALUES (?, ?, ?, ?, ?)""",
+                (
+                    str(file_path),
+                    block.hash_value,
+                    block.start_line,
+                    block.end_line,
+                    block.snippet,
+                ),
+            )
+
+        self.db.commit()
+
+    def find_duplicates_by_hash(self, hash_value: int) -> list[CodeBlock]:
+        """Find all code blocks with the given hash value.
+
+        Args:
+            hash_value: Hash value to search for
+
+        Returns:
+            List of ALL CodeBlock instances with this hash (from all files)
+        """
+        rows = self._query_service.find_blocks_by_hash(self.db, hash_value)
+
+        blocks = []
+        for file_path_str, start, end, snippet, hash_val in rows:
+            block = CodeBlock(
+                file_path=Path(file_path_str),
+                start_line=start,
+                end_line=end,
+                snippet=snippet,
+                hash_value=hash_val,
+            )
+            blocks.append(block)
+
+        return blocks
+
+    def get_duplicate_hashes(self) -> list[int]:
+        """Get all hash values that appear 2+ times.
+
+        Returns:
+            List of hash values with 2 or more occurrences
+        """
+        return self._query_service.get_duplicate_hashes(self.db)
+
+    def close(self) -> None:
+        """Close database connection and cleanup tempfile if used."""
+        self.db.close()
+        if self._tempfile:
+            self._tempfile.close()
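
To orient readers to the DRYCache lifecycle above, a hedged sketch of a single linter run (the paths, snippets, and hash values are invented):

    from pathlib import Path
    from src.linters.dry.cache import CodeBlock, DRYCache

    cache = DRYCache(storage_mode="memory")  # "tempfile" spills to disk for large projects
    cache.add_blocks(Path("a.py"), [CodeBlock(Path("a.py"), 1, 4, "x = 1", hash_value=7)])
    cache.add_blocks(Path("b.py"), [CodeBlock(Path("b.py"), 9, 12, "x = 1", hash_value=7)])
    for h in cache.get_duplicate_hashes():        # [7] - same hash seen in two files
        dupes = cache.find_duplicates_by_hash(h)  # CodeBlock instances from a.py and b.py
    cache.close()  # drops the in-memory DB (or closes and deletes the tempfile)

Note the design choice the docstring calls out: nothing persists between runs, so "cache" here is scratch storage for one scan, not an incremental cache.
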
src/linters/dry/cache_query.py
ADDED
@@ -0,0 +1,61 @@
+"""
+Purpose: Query service for DRY cache database
+
+Scope: Handles SQL queries for duplicate hash detection
+
+Overview: Provides query methods for finding duplicate code blocks in the SQLite cache. Extracts
+    query logic from DRYCache to maintain SRP compliance. Handles queries for duplicate hashes
+    and blocks by hash value.
+
+Dependencies: sqlite3.Connection
+
+Exports: CacheQueryService class
+
+Interfaces: CacheQueryService.get_duplicate_hashes(db), find_blocks_by_hash(db, hash_value)
+
+Implementation: SQL queries for duplicate detection, returns hash values and block data
+"""
+
+import sqlite3
+
+
+class CacheQueryService:
+    """Handles cache database queries."""
+
+    def get_duplicate_hashes(self, db: sqlite3.Connection) -> list[int]:
+        """Get all hash values that appear 2+ times.
+
+        Args:
+            db: Database connection
+
+        Returns:
+            List of hash values with 2 or more occurrences
+        """
+        cursor = db.execute(
+            """SELECT hash_value
+            FROM code_blocks
+            GROUP BY hash_value
+            HAVING COUNT(*) >= 2"""
+        )
+
+        return [row[0] for row in cursor]
+
+    def find_blocks_by_hash(self, db: sqlite3.Connection, hash_value: int) -> list[tuple]:
+        """Find all blocks with given hash value.
+
+        Args:
+            db: Database connection
+            hash_value: Hash to search for
+
+        Returns:
+            List of tuples (file_path, start_line, end_line, snippet, hash_value)
+        """
+        cursor = db.execute(
+            """SELECT file_path, start_line, end_line, snippet, hash_value
+            FROM code_blocks
+            WHERE hash_value = ?
+            ORDER BY file_path, start_line""",
+            (hash_value,),
+        )
+
+        return cursor.fetchall()
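
The GROUP BY/HAVING query is what drives duplicate detection: a hash appearing in two or more code_blocks rows is, by construction, a duplicate candidate. A standalone sketch against a throwaway database (the table rows are invented):

    import sqlite3
    from src.linters.dry.cache_query import CacheQueryService

    db = sqlite3.connect(":memory:")
    db.execute(
        "CREATE TABLE code_blocks "
        "(file_path TEXT, hash_value INTEGER, start_line INTEGER, end_line INTEGER, snippet TEXT)"
    )
    db.executemany(
        "INSERT INTO code_blocks VALUES (?, ?, ?, ?, ?)",
        [("a.py", 7, 1, 3, "x"), ("b.py", 7, 9, 11, "x"), ("a.py", 8, 20, 22, "y")],
    )
    service = CacheQueryService()
    service.get_duplicate_hashes(db)    # [7] - hash 8 appears only once
    service.find_blocks_by_hash(db, 7)  # rows from a.py and b.py, ordered by file then line
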
src/linters/dry/config.py
ADDED
@@ -0,0 +1,134 @@
+"""
+Purpose: Configuration schema for DRY linter with caching support
+
+Scope: DRYConfig dataclass with validation, defaults, and loading from dictionary
+
+Overview: Defines configuration structure for the DRY linter including duplicate detection thresholds,
+    caching settings, and ignore patterns. Provides validation of configuration values to ensure
+    sensible defaults and prevent misconfiguration. Supports loading from YAML configuration files
+    through from_dict classmethod. Cache enabled by default for performance on large codebases.
+
+Dependencies: Python dataclasses module
+
+Exports: DRYConfig dataclass
+
+Interfaces: DRYConfig.__init__, DRYConfig.from_dict(config: dict) -> DRYConfig
+
+Implementation: Dataclass with field defaults, __post_init__ validation, and dict-based construction
+"""
+
+from dataclasses import dataclass, field
+from typing import Any
+
+# Default configuration constants
+DEFAULT_MIN_DUPLICATE_LINES = 3
+DEFAULT_MIN_DUPLICATE_TOKENS = 30
+
+
+@dataclass
+class DRYConfig:  # pylint: disable=too-many-instance-attributes
+    """Configuration for DRY linter.
+
+    Note: Pylint too-many-instance-attributes disabled. This is a configuration
+    dataclass serving as a data container for related DRY linter settings.
+    All attributes are cohesively related (detection thresholds, language
+    overrides, storage mode, filtering). Splitting would reduce cohesion and make
+    configuration loading more complex without meaningful benefit.
+    """
+
+    enabled: bool = False  # Must be explicitly enabled
+    min_duplicate_lines: int = DEFAULT_MIN_DUPLICATE_LINES
+    min_duplicate_tokens: int = DEFAULT_MIN_DUPLICATE_TOKENS
+    min_occurrences: int = 2  # Minimum occurrences to report (default: 2)
+
+    # Language-specific overrides
+    python_min_occurrences: int | None = None
+    typescript_min_occurrences: int | None = None
+    javascript_min_occurrences: int | None = None
+
+    # Storage settings
+    storage_mode: str = "memory"  # Options: "memory" (default) or "tempfile"
+
+    # Ignore patterns
+    ignore_patterns: list[str] = field(default_factory=lambda: ["tests/", "__init__.py"])
+
+    # Block filters (extensible false positive filtering)
+    filters: dict[str, bool] = field(
+        default_factory=lambda: {
+            "keyword_argument_filter": True,  # Filter keyword argument blocks
+            "import_group_filter": True,  # Filter import statement groups
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate configuration values."""
+        if self.min_duplicate_lines <= 0:
+            raise ValueError(
+                f"min_duplicate_lines must be positive, got {self.min_duplicate_lines}"
+            )
+        if self.min_duplicate_tokens <= 0:
+            raise ValueError(
+                f"min_duplicate_tokens must be positive, got {self.min_duplicate_tokens}"
+            )
+        if self.min_occurrences <= 0:
+            raise ValueError(f"min_occurrences must be positive, got {self.min_occurrences}")
+        if self.storage_mode not in ("memory", "tempfile"):
+            raise ValueError(
+                f"storage_mode must be 'memory' or 'tempfile', got '{self.storage_mode}'"
+            )
+
+    def get_min_occurrences_for_language(self, language: str) -> int:
+        """Get minimum occurrences threshold for a specific language.
+
+        Args:
+            language: Language identifier (e.g., "python", "typescript", "javascript")
+
+        Returns:
+            Minimum occurrences threshold for the language, or global default
+        """
+        language_lower = language.lower()
+
+        language_overrides = {
+            "python": self.python_min_occurrences,
+            "typescript": self.typescript_min_occurrences,
+            "javascript": self.javascript_min_occurrences,
+        }
+
+        override = language_overrides.get(language_lower)
+        return override if override is not None else self.min_occurrences
+
+    @classmethod
+    def from_dict(cls, config: dict[str, Any]) -> "DRYConfig":
+        """Load configuration from dictionary.
+
+        Args:
+            config: Dictionary containing configuration values
+
+        Returns:
+            DRYConfig instance with values from dictionary
+        """
+        # Extract language-specific min_occurrences
+        python_config = config.get("python", {})
+        typescript_config = config.get("typescript", {})
+        javascript_config = config.get("javascript", {})
+
+        # Load filter configuration (merge with defaults)
+        default_filters = {
+            "keyword_argument_filter": True,
+            "import_group_filter": True,
+        }
+        custom_filters = config.get("filters", {})
+        filters = {**default_filters, **custom_filters}
+
+        return cls(
+            enabled=config.get("enabled", False),
+            min_duplicate_lines=config.get("min_duplicate_lines", DEFAULT_MIN_DUPLICATE_LINES),
+            min_duplicate_tokens=config.get("min_duplicate_tokens", DEFAULT_MIN_DUPLICATE_TOKENS),
+            min_occurrences=config.get("min_occurrences", 2),
+            python_min_occurrences=python_config.get("min_occurrences"),
+            typescript_min_occurrences=typescript_config.get("min_occurrences"),
+            javascript_min_occurrences=javascript_config.get("min_occurrences"),
+            storage_mode=config.get("storage_mode", "memory"),
+            ignore_patterns=config.get("ignore", []),
+            filters=filters,
+        )
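
A sketch of how from_dict and the per-language override interact; the dictionary mirrors what a decoded "dry" section of a YAML config might look like, with invented values:

    from src.linters.dry.config import DRYConfig

    cfg = DRYConfig.from_dict({
        "enabled": True,
        "min_occurrences": 2,
        "python": {"min_occurrences": 3},
    })
    cfg.get_min_occurrences_for_language("python")      # 3 (language override wins)
    cfg.get_min_occurrences_for_language("typescript")  # 2 (falls back to global)
    DRYConfig(min_duplicate_lines=0)                    # raises ValueError via __post_init__
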
src/linters/dry/config_loader.py
ADDED
@@ -0,0 +1,44 @@
+"""
+Purpose: Configuration loading from lint context metadata
+
+Scope: Extracts and validates DRY configuration from context
+
+Overview: Handles extraction of DRY configuration from BaseLintContext metadata dictionary.
+    Validates configuration structure and converts to DRYConfig instance. Separates config
+    loading logic from main linter rule to maintain SRP compliance.
+
+Dependencies: BaseLintContext, DRYConfig
+
+Exports: ConfigLoader class
+
+Interfaces: ConfigLoader.load_config(context) -> DRYConfig
+
+Implementation: Extracts from context metadata, validates dict structure, uses DRYConfig.from_dict()
+"""
+
+from src.core.base import BaseLintContext
+
+from .config import DRYConfig
+
+
+class ConfigLoader:
+    """Loads DRY configuration from lint context."""
+
+    def load_config(self, context: BaseLintContext) -> DRYConfig:
+        """Load configuration from context metadata.
+
+        Args:
+            context: Lint context containing metadata
+
+        Returns:
+            DRYConfig instance
+        """
+        metadata = getattr(context, "metadata", None)
+        if not isinstance(metadata, dict):
+            return DRYConfig()
+
+        config_dict = metadata.get("dry", {})
+        if not isinstance(config_dict, dict):
+            return DRYConfig()
+
+        return DRYConfig.from_dict(config_dict)
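
ConfigLoader falls back to defaults whenever the metadata is missing or malformed. A sketch using a stand-in context object, since constructing a real BaseLintContext is out of scope here (the bare namespace is an assumption for illustration; load_config only reads the metadata attribute):

    from types import SimpleNamespace
    from src.linters.dry.config_loader import ConfigLoader

    loader = ConfigLoader()
    ctx = SimpleNamespace(metadata={"dry": {"enabled": True, "min_occurrences": 3}})
    loader.load_config(ctx).min_occurrences                     # 3
    loader.load_config(SimpleNamespace(metadata=None)).enabled  # False - default DRYConfig
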
src/linters/dry/deduplicator.py
ADDED
@@ -0,0 +1,120 @@
+"""
+Purpose: Deduplication utility for overlapping code block violations
+
+Scope: Handles filtering of overlapping duplicate code violations
+
+Overview: Provides utilities to remove overlapping violations from duplicate code detection results.
+    Delegates grouping to BlockGrouper and filtering to ViolationFilter. Handles both block-level
+    deduplication (keeping non-overlapping blocks per file) and violation-level deduplication
+    (removing overlaps).
+
+Dependencies: CodeBlock, Violation, BlockGrouper, ViolationFilter
+
+Exports: ViolationDeduplicator class
+
+Interfaces: ViolationDeduplicator.deduplicate_blocks(blocks), deduplicate_violations(violations)
+
+Implementation: Delegates to BlockGrouper and ViolationFilter for SRP compliance
+"""
+
+from src.core.types import Violation
+
+from .block_grouper import BlockGrouper
+from .cache import CodeBlock
+from .violation_filter import ViolationFilter
+
+
+class ViolationDeduplicator:
+    """Removes overlapping duplicate code violations."""
+
+    def __init__(self) -> None:
+        """Initialize with helper components."""
+        self._grouper = BlockGrouper()
+        self._filter = ViolationFilter()
+
+    def deduplicate_blocks(self, blocks: list[CodeBlock]) -> list[CodeBlock]:
+        """Remove overlapping blocks from same file.
+
+        When rolling hash creates overlapping windows, keep non-overlapping blocks.
+        Blocks are overlapping if they share any line numbers in the same file.
+
+        Args:
+            blocks: List of code blocks (may have overlaps from rolling hash)
+
+        Returns:
+            Deduplicated list of blocks (non-overlapping blocks preserved)
+        """
+        if not blocks:
+            return []
+
+        grouped = self._grouper.group_blocks_by_file(blocks)
+        deduplicated = []
+
+        for file_blocks in grouped.values():
+            kept = self._remove_overlaps_from_file(file_blocks)
+            deduplicated.extend(kept)
+
+        return deduplicated
+
+    def _remove_overlaps_from_file(self, file_blocks: list[CodeBlock]) -> list[CodeBlock]:
+        """Remove overlapping blocks from single file.
+
+        Args:
+            file_blocks: Blocks from same file
+
+        Returns:
+            Non-overlapping blocks
+        """
+        sorted_blocks = sorted(file_blocks, key=lambda b: b.start_line)
+        kept_blocks: list[CodeBlock] = []
+
+        for block in sorted_blocks:
+            if not self._overlaps_any_kept(block, kept_blocks):
+                kept_blocks.append(block)
+
+        return kept_blocks
+
+    def _overlaps_any_kept(self, block: CodeBlock, kept_blocks: list[CodeBlock]) -> bool:
+        """Check if block overlaps with any kept blocks.
+
+        Args:
+            block: Block to check
+            kept_blocks: Previously kept blocks
+
+        Returns:
+            True if block overlaps with any kept block
+        """
+        return any(self._blocks_overlap(block, kept) for kept in kept_blocks)
+
+    def _blocks_overlap(self, block1: CodeBlock, block2: CodeBlock) -> bool:
+        """Check if two blocks overlap (share any lines).
+
+        Args:
+            block1: First code block
+            block2: Second code block
+
+        Returns:
+            True if blocks overlap
+        """
+        return block1.start_line <= block2.end_line and block2.start_line <= block1.end_line
+
+    def deduplicate_violations(self, violations: list[Violation]) -> list[Violation]:
+        """Remove overlapping violations from same file.
+
+        Args:
+            violations: List of violations (may overlap)
+
+        Returns:
+            Deduplicated list of violations
+        """
+        if not violations:
+            return []
+
+        grouped = self._grouper.group_violations_by_file(violations)
+        deduplicated = []
+
+        for file_violations in grouped.values():
+            sorted_violations = sorted(file_violations, key=lambda v: v.line or 0)
+            kept = self._filter.filter_overlapping(sorted_violations)
+            deduplicated.extend(kept)
+
+        return deduplicated
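
The test in _blocks_overlap is the standard closed-interval intersection check: two blocks share a line exactly when each starts at or before the other ends (lines 1-5 and 5-9 overlap on line 5; lines 1-5 and 6-9 do not). Because deduplicate_blocks sorts by start_line and keeps the first block of each overlapping run, the earliest rolling-hash window wins and later shifted copies of the same region are dropped. A small sketch (blocks invented):

    from pathlib import Path
    from src.linters.dry.cache import CodeBlock
    from src.linters.dry.deduplicator import ViolationDeduplicator

    dedup = ViolationDeduplicator()
    a = CodeBlock(Path("a.py"), 1, 5, "...", 1)
    b = CodeBlock(Path("a.py"), 5, 9, "...", 2)  # shares line 5 with a
    c = CodeBlock(Path("a.py"), 6, 9, "...", 3)  # starts after a ends
    dedup.deduplicate_blocks([a, b, c])          # [a, c] - b overlaps a and is dropped
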
src/linters/dry/duplicate_storage.py
ADDED
@@ -0,0 +1,63 @@
+"""
+Purpose: Storage management for duplicate code blocks in SQLite
+
+Scope: Manages storage of code blocks in SQLite for duplicate detection
+
+Overview: Provides storage interface for code blocks using SQLite (in-memory or tempfile mode).
+    Handles block insertion and duplicate hash queries. Delegates all storage operations to
+    DRYCache SQLite layer. Separates storage concerns from linting logic to maintain SRP compliance.
+
+Dependencies: DRYCache, CodeBlock, Path
+
+Exports: DuplicateStorage class
+
+Interfaces: DuplicateStorage.add_blocks(file_path, blocks), get_duplicate_hashes(),
+    get_blocks_for_hash(hash_value)
+
+Implementation: Delegates to SQLite cache for all storage operations
+"""
+
+from pathlib import Path
+
+from .cache import CodeBlock, DRYCache
+
+
+class DuplicateStorage:
+    """Manages storage of code blocks in SQLite."""
+
+    def __init__(self, cache: DRYCache) -> None:
+        """Initialize storage with SQLite cache.
+
+        Args:
+            cache: SQLite cache instance (in-memory or tempfile mode)
+        """
+        self._cache = cache
+
+    def add_blocks(self, file_path: Path, blocks: list[CodeBlock]) -> None:
+        """Add code blocks to SQLite storage.
+
+        Args:
+            file_path: Path to source file
+            blocks: List of code blocks to store
+        """
+        if blocks:
+            self._cache.add_blocks(file_path, blocks)
+
+    def get_duplicate_hashes(self) -> list[int]:
+        """Get all hash values with 2+ occurrences from SQLite.
+
+        Returns:
+            List of hash values that appear in multiple blocks
+        """
+        return self._cache.get_duplicate_hashes()
+
+    def get_blocks_for_hash(self, hash_value: int) -> list[CodeBlock]:
+        """Get all blocks with given hash value from SQLite.
+
+        Args:
+            hash_value: Hash to search for
+
+        Returns:
+            List of code blocks with this hash
+        """
+        return self._cache.find_duplicates_by_hash(hash_value)
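
DuplicateStorage is a thin facade over DRYCache, so wiring it up is plain dependency injection; a closing sketch (blocks invented):

    from pathlib import Path
    from src.linters.dry.cache import CodeBlock, DRYCache
    from src.linters.dry.duplicate_storage import DuplicateStorage

    storage = DuplicateStorage(DRYCache(storage_mode="memory"))
    storage.add_blocks(Path("a.py"), [CodeBlock(Path("a.py"), 1, 3, "x", 5)])
    storage.add_blocks(Path("b.py"), [CodeBlock(Path("b.py"), 4, 6, "x", 5)])
    storage.get_duplicate_hashes()  # [5]
    storage.get_blocks_for_hash(5)  # both blocks, rebuilt as CodeBlock instances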